svm.h

// Copyright (C) 2007  Davis E. King (davisking@users.sourceforge.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_SVm_
#define DLIB_SVm_

#include "svm_abstract.h"
#include <cmath>
#include <limits>
#include <sstream>
#include "../matrix.h"
#include "../algs.h"
#include "../serialize.h"
#include "../rand.h"
#include "../std_allocator.h"
#include "function.h"
#include "kernel.h"
#include "../enable_if.h"

namespace dlib
{

// ----------------------------------------------------------------------------------------

    class invalid_svm_nu_error : public dlib::error 
    { 
    public: 
        invalid_svm_nu_error(const std::string& msg, double nu_) : dlib::error(msg), nu(nu_) {};
        const double nu;
    };

// ----------------------------------------------------------------------------------------

    template <
        typename T
        >
    typename T::type maximum_nu_impl (
        const T& y
    )
    {
        typedef typename T::type scalar_type;
        // make sure requires clause is not broken
        DLIB_ASSERT(y.nr() > 1 && y.nc() == 1,
            "\ttypedef T::type maximum_nu(y)"
            << "\n\ty should be a column vector with more than one entry"
            << "\n\ty.nr(): " << y.nr() 
            << "\n\ty.nc(): " << y.nc() 
            );

        long pos_count = 0;
        long neg_count = 0;
        for (long r = 0; r < y.nr(); ++r)
        {
            if (y(r) == 1.0)
            {
                ++pos_count;
            }
            else if (y(r) == -1.0)
            {
                ++neg_count;
            }
            else
            {
                // make sure requires clause is not broken
                DLIB_ASSERT(y(r) == -1.0 || y(r) == 1.0,
                       "\ttypedef T::type maximum_nu(y)"
                       << "\n\ty should contain only 1 and 0 entries"
                       << "\n\tr:    " << r 
                       << "\n\ty(r): " << y(r) 
                );
            }
        }
        return static_cast<scalar_type>(2.0*(scalar_type)std::min(pos_count,neg_count)/(scalar_type)y.nr());
    }

    template <
        typename T
        >
    typename T::type maximum_nu (
        const T& y
    )
    {
        return maximum_nu_impl(vector_to_matrix(y));
    }

    template <
        typename T
        >
    typename T::value_type maximum_nu (
        const T& y
    )
    {
        return maximum_nu_impl(vector_to_matrix(y));
    }

// ----------------------------------------------------------------------------------------

    template <
        typename T,
        typename U
        >
    bool is_binary_classification_problem_impl (
        const T& x,
        const U& x_labels
    )
    {
        if (x.nc() != 1 || x_labels.nc() != 1) return false; 
        if (x.nr() != x_labels.nr()) return false;
        if (x.nr() <= 1) return false;
        for (long r = 0; r < x_labels.nr(); ++r)
        {
            if (x_labels(r) != -1 && x_labels(r) != 1)
                return false;
        }

        return true;
    }

    template <
        typename T,
        typename U
        >
    bool is_binary_classification_problem (
        const T& x,
        const U& x_labels
    )
    {
        return is_binary_classification_problem_impl(vector_to_matrix(x), vector_to_matrix(x_labels));
    }

// ----------------------------------------------------------------------------------------

    template <
        typename K,
        typename sample_vector_type,
        typename scalar_vector_type
        >
    class kernel_matrix_cache
    {
        typedef typename K::scalar_type scalar_type;
        typedef typename K::sample_type sample_type;
        typedef typename K::mem_manager_type mem_manager_type;

        const sample_vector_type& x;
        const scalar_vector_type& y;
        K kernel_function;

        mutable matrix<scalar_type,0,0,mem_manager_type> cache;
        mutable matrix<scalar_type,0,1,mem_manager_type> diag_cache;
        mutable matrix<long,0,1,mem_manager_type> lookup;
        mutable matrix<long,0,1,mem_manager_type> rlookup;
        mutable long next;

        /*!
        INITIAL VALUE
            - for all valid x:
                - lookup(x) == -1 
                - rlookup(x) == -1 

        CONVENTION
            - if (lookup(c) != -1) then
                - cache(lookup(c),*) == the cached column c of the kernel matrix
                - rlookup(lookup(c)) == c

            - if (rlookup(x) != -1) then
                - lookup(rlookup(x)) == x
                - cache(x,*) == the cached column rlookup(x) of the kernel matrix

            - next == the next row in the cache table to use to cache something 
        !*/

    public:
        kernel_matrix_cache (
            const sample_vector_type& x_,
            const scalar_vector_type& y_,
            K kernel_function_,
            long max_size_megabytes
        ) : x(x_), y(y_), kernel_function(kernel_function_) 
        {
            // figure out how many rows of the kernel matrix we can have
            // with the given amount of memory.
            long max_size = (max_size_megabytes*1024*1024)/(x.nr()*sizeof(scalar_type));
            // don't let it be 0
            if (max_size == 0)
                max_size = 1;
            long size = std::min(max_size,x.nr());

            diag_cache.set_size(x.nr(),1);
            cache.set_size(size,x.nr());
            lookup.set_size(x.nr(),1);
            rlookup.set_size(size,1);
            set_all_elements(lookup,-1);
            set_all_elements(rlookup,-1);
            next = 0;

            for (long i = 0; i < diag_cache.nr(); ++i)
                diag_cache(i) = kernel_function(x(i),x(i));
        }

        inline bool is_cached (
            long r
        ) const
        {
            return (lookup(r) != -1);
        }

        inline scalar_type operator () (
            long r,
            long c
        ) const
        {
            if (lookup(c) != -1)
            {
                return cache(lookup(c),r);
            }
            else if (r == c)
            {
                return diag_cache(r);
            }
            else if (lookup(r) != -1)
            {
                // the kernel is symmetric so this is legit
                return cache(lookup(r),c);
            }
            else
            {
                // if the lookup table is pointing to cache(next,*) then clear lookup(next)
                if (rlookup(next) != -1)
                    lookup(rlookup(next)) = -1;

                // make the lookup table os that it says c is now cached at the spot indicated by next
                lookup(c) = next;
                rlookup(next) = c;

                // compute this column in the kernel matrix and store it in the cache
                for (long i = 0; i < cache.nc(); ++i)
                    cache(next,i) = y(c)*y(i)*kernel_function(x(c),x(i));

                scalar_type val = cache(next,r);
                next = (next + 1)%cache.nr();
                return val;
            }
        }

    };

// ----------------------------------------------------------------------------------------

    template <
        typename dec_funct_type,
        typename in_sample_vector_type,
        typename in_scalar_vector_type
        >
    const matrix<typename dec_funct_type::scalar_type, 1, 2, typename dec_funct_type::mem_manager_type> 
    test_binary_decision_function_impl (
        const dec_funct_type& dec_funct,
        const in_sample_vector_type& x_test,
        const in_scalar_vector_type& y_test
    )
    {
        typedef typename dec_funct_type::scalar_type scalar_type;
        typedef typename dec_funct_type::sample_type sample_type;
        typedef typename dec_funct_type::mem_manager_type mem_manager_type;
        typedef matrix<sample_type,0,1,mem_manager_type> sample_vector_type;
        typedef matrix<scalar_type,0,1,mem_manager_type> scalar_vector_type;

        // make sure requires clause is not broken
        DLIB_ASSERT( is_binary_classification_problem(x_test,y_test) == true,
                    "\tmatrix test_binary_decision_function()"
                    << "\n\t invalid inputs were given to this function"
                    << "\n\t is_binary_classification_problem(x_test,y_test): " 
                    << ((is_binary_classification_problem(x_test,y_test))? "true":"false"));


        // count the number of positive and negative examples
        long num_pos = 0;
        long num_neg = 0;


        long num_pos_correct = 0;
        long num_neg_correct = 0;


        // now test this trained object 
        for (long i = 0; i < x_test.nr(); ++i)
        {
            // if this is a positive example
            if (y_test(i) == +1.0)
            {
                ++num_pos;
                if (dec_funct(x_test(i)) >= 0)
                    ++num_pos_correct;
            }
            else if (y_test(i) == -1.0)
            {
                ++num_neg;
                if (dec_funct(x_test(i)) < 0)
                    ++num_neg_correct;
            }
            else
            {
                throw dlib::error("invalid input labels to the test_binary_decision_function() function");
            }
        }


        matrix<scalar_type, 1, 2, mem_manager_type> res;
        res(0) = (scalar_type)num_pos_correct/(scalar_type)(num_pos); 
        res(1) = (scalar_type)num_neg_correct/(scalar_type)(num_neg); 
        return res;
    }

    template <
        typename dec_funct_type,
        typename in_sample_vector_type,
        typename in_scalar_vector_type
        >
    const matrix<typename dec_funct_type::scalar_type, 1, 2, typename dec_funct_type::mem_manager_type> 
    test_binary_decision_function (
        const dec_funct_type& dec_funct,
        const in_sample_vector_type& x_test,
        const in_scalar_vector_type& y_test
    )
    {
        return test_binary_decision_function_impl(dec_funct,
                                 vector_to_matrix(x_test),
                                 vector_to_matrix(y_test));
    }

// ----------------------------------------------------------------------------------------

    template <
        typename trainer_type,
        typename in_sample_vector_type,
        typename in_scalar_vector_type
        >
    const matrix<typename trainer_type::scalar_type, 1, 2, typename trainer_type::mem_manager_type> 
    cross_validate_trainer_impl (
        const trainer_type& trainer,
        const in_sample_vector_type& x,
        const in_scalar_vector_type& y,
        const long folds
    )
    {
        typedef typename trainer_type::scalar_type scalar_type;
        typedef typename trainer_type::sample_type sample_type;
        typedef typename trainer_type::mem_manager_type mem_manager_type;
        typedef matrix<sample_type,0,1,mem_manager_type> sample_vector_type;
        typedef matrix<scalar_type,0,1,mem_manager_type> scalar_vector_type;

        // make sure requires clause is not broken
        DLIB_ASSERT(is_binary_classification_problem(x,y) == true &&
                    1 < folds && folds <= x.nr(),
            "\tmatrix cross_validate_trainer()"
            << "\n\t invalid inputs were given to this function"
            << "\n\t x.nr(): " << x.nr() 
            << "\n\t folds:  " << folds 
            << "\n\t is_binary_classification_problem(x,y): " << ((is_binary_classification_problem(x,y))? "true":"false")
            );


        // count the number of positive and negative examples
        long num_pos = 0;
        long num_neg = 0;
        for (long r = 0; r < y.nr(); ++r)
        {
            if (y(r) == +1.0)
                ++num_pos;
            else
                ++num_neg;
        }

        // figure out how many positive and negative examples we will have in each fold
        const long num_pos_test_samples = num_pos/folds; 
        const long num_pos_train_samples = num_pos - num_pos_test_samples; 
        const long num_neg_test_samples = num_neg/folds; 
        const long num_neg_train_samples = num_neg - num_neg_test_samples; 


        typename trainer_type::trained_function_type d;
        sample_vector_type x_test, x_train;
        scalar_vector_type y_test, y_train;
        x_test.set_size (num_pos_test_samples  + num_neg_test_samples);
        y_test.set_size (num_pos_test_samples  + num_neg_test_samples);
        x_train.set_size(num_pos_train_samples + num_neg_train_samples);
        y_train.set_size(num_pos_train_samples + num_neg_train_samples);

        long pos_idx = 0;
        long neg_idx = 0;

        matrix<scalar_type, 1, 2, mem_manager_type> res;
        set_all_elements(res,0);

        for (long i = 0; i < folds; ++i)
        {
            long cur = 0;

            // load up our positive test samples
            while (cur < num_pos_test_samples)
            {
                if (y(pos_idx) == +1.0)
                {
                    x_test(cur) = x(pos_idx);
                    y_test(cur) = +1.0;
                    ++cur;
                }
                pos_idx = (pos_idx+1)%x.nr();
            }

            // load up our negative test samples
            while (cur < x_test.nr())
            {
                if (y(neg_idx) == -1.0)
                {
                    x_test(cur) = x(neg_idx);
                    y_test(cur) = -1.0;
                    ++cur;
                }
                neg_idx = (neg_idx+1)%x.nr();
            }

            // load the training data from the data following whatever we loaded
            // as the testing data
            long train_pos_idx = pos_idx;
            long train_neg_idx = neg_idx;
            cur = 0;

            // load up our positive train samples
            while (cur < num_pos_train_samples)
            {
                if (y(train_pos_idx) == +1.0)
                {
                    x_train(cur) = x(train_pos_idx);
                    y_train(cur) = +1.0;
                    ++cur;
                }
                train_pos_idx = (train_pos_idx+1)%x.nr();
            }

            // load up our negative train samples
            while (cur < x_train.nr())
            {
                if (y(train_neg_idx) == -1.0)
                {
                    x_train(cur) = x(train_neg_idx);
                    y_train(cur) = -1.0;
                    ++cur;
                }
                train_neg_idx = (train_neg_idx+1)%x.nr();
            }

            // do the training and testing
            res += test_binary_decision_function(trainer.train(x_train,y_train),x_test,y_test);

        } // for (long i = 0; i < folds; ++i)

        return res/(scalar_type)folds;
    }

    template <
        typename trainer_type,
        typename in_sample_vector_type,
        typename in_scalar_vector_type
        >
    const matrix<typename trainer_type::scalar_type, 1, 2, typename trainer_type::mem_manager_type> 
    cross_validate_trainer (
        const trainer_type& trainer,
        const in_sample_vector_type& x,
        const in_scalar_vector_type& y,
        const long folds
    )
    {
        return cross_validate_trainer_impl(trainer,
                                           vector_to_matrix(x),
                                           vector_to_matrix(y),
                                           folds);
    }

// ----------------------------------------------------------------------------------------

    template <
        typename trainer_type,
        typename in_sample_vector_type,
        typename in_scalar_vector_type
        >
    const probabilistic_decision_function<typename trainer_type::kernel_type> train_probabilistic_decision_function_impl (
        const trainer_type& trainer,
        const in_sample_vector_type& x,
        const in_scalar_vector_type& y,
        const long folds
    )
    {
        typedef typename trainer_type::sample_type sample_type;
        typedef typename trainer_type::scalar_type scalar_type;
        typedef typename trainer_type::mem_manager_type mem_manager_type;
        typedef typename trainer_type::kernel_type K;


        /*
            This function fits a sigmoid function to the output of the 
            svm trained by svm_nu_train().  The technique used is the one
            described in the paper:
                
                Probabilistic Outputs for Support Vector Machines and
                Comparisons to Regularized Likelihood Methods by 
                John C. Platt.  Match 26, 1999
        */

        // make sure requires clause is not broken
        DLIB_ASSERT(is_binary_classification_problem(x,y) == true &&
                    1 < folds && folds <= x.nr(),
            "\tprobabilistic_decision_function train_probabilistic_decision_function()"
            << "\n\t invalid inputs were given to this function"
            << "\n\t x.nr(): " << x.nr() 
            << "\n\t y.nr(): " << y.nr() 
            << "\n\t x.nc(): " << x.nc() 
            << "\n\t y.nc(): " << y.nc() 
            << "\n\t folds:  " << folds 
            << "\n\t is_binary_classification_problem(x,y): " << ((is_binary_classification_problem(x,y))? "true":"false")
            );

        // count the number of positive and negative examples
        long num_pos = 0;
        long num_neg = 0;
        for (long r = 0; r < y.nr(); ++r)
        {
            if (y(r) == +1.0)
                ++num_pos;
            else
                ++num_neg;
        }

        // figure out how many positive and negative examples we will have in each fold
        const long num_pos_test_samples = num_pos/folds; 
        const long num_pos_train_samples = num_pos - num_pos_test_samples; 
        const long num_neg_test_samples = num_neg/folds; 
        const long num_neg_train_samples = num_neg - num_neg_test_samples; 

        decision_function<K> d;
        typename decision_function<K>::sample_vector_type x_test, x_train;
        typename decision_function<K>::scalar_vector_type y_test, y_train;
        x_test.set_size (num_pos_test_samples  + num_neg_test_samples);
        y_test.set_size (num_pos_test_samples  + num_neg_test_samples);
        x_train.set_size(num_pos_train_samples + num_neg_train_samples);
        y_train.set_size(num_pos_train_samples + num_neg_train_samples);

        typedef std_allocator<scalar_type, mem_manager_type> alloc_scalar_type_vector;
        typedef std::vector<scalar_type, alloc_scalar_type_vector > dvector;
        typedef std_allocator<int, mem_manager_type> alloc_int_vector;
        typedef std::vector<int, alloc_int_vector > ivector;

        dvector out;
        ivector target;

        long pos_idx = 0;
        long neg_idx = 0;

        for (long i = 0; i < folds; ++i)
        {
            long cur = 0;

            // load up our positive test samples
            while (cur < num_pos_test_samples)
            {
                if (y(pos_idx) == +1.0)
                {
                    x_test(cur) = x(pos_idx);
                    y_test(cur) = +1.0;
                    ++cur;
                }
                pos_idx = (pos_idx+1)%x.nr();
            }

            // load up our negative test samples
            while (cur < x_test.nr())
            {
                if (y(neg_idx) == -1.0)
                {
                    x_test(cur) = x(neg_idx);
                    y_test(cur) = -1.0;
                    ++cur;
                }
                neg_idx = (neg_idx+1)%x.nr();
            }

            // load the training data from the data following whatever we loaded
            // as the testing data
            long train_pos_idx = pos_idx;
            long train_neg_idx = neg_idx;
            cur = 0;

            // load up our positive train samples
            while (cur < num_pos_train_samples)
            {
                if (y(train_pos_idx) == +1.0)
                {
                    x_train(cur) = x(train_pos_idx);
                    y_train(cur) = +1.0;
                    ++cur;
                }
                train_pos_idx = (train_pos_idx+1)%x.nr();
            }

            // load up our negative train samples
            while (cur < x_train.nr())
            {
                if (y(train_neg_idx) == -1.0)
                {
                    x_train(cur) = x(train_neg_idx);
                    y_train(cur) = -1.0;
                    ++cur;
                }
                train_neg_idx = (train_neg_idx+1)%x.nr();
            }

            // do the training
            d = trainer.train (x_train,y_train);

            // now test this fold 
            for (long i = 0; i < x_test.nr(); ++i)
            {
                out.push_back(d(x_test(i)));
                // if this was a positive example
                if (y_test(i) == +1.0)
                {
                    target.push_back(1);
                }
                else if (y_test(i) == -1.0)
                {
                    target.push_back(0);
                }
                else
                {
                    throw dlib::error("invalid input labels to the train_probabilistic_decision_function() function");
                }
            }

        } // for (long i = 0; i < folds; ++i)

        // Now find the parameters of the sigmoid.  Do so using the method from the
        // above referenced paper.
        scalar_type prior0 = num_pos_test_samples*folds; 
        scalar_type prior1 = num_neg_test_samples*folds; 
        scalar_type A = 0;
        scalar_type B = std::log((prior0+1)/(prior1+1));

        const scalar_type hiTarget = (prior1+1)/(prior1+2);
        const scalar_type loTarget = 1.0/(prior0+2);
        scalar_type lambda = 1e-3;
        scalar_type olderr = std::numeric_limits<scalar_type>::max();;
        dvector pp(out.size(),(prior1+1)/(prior1+prior0+2));
        const scalar_type min_log = -200.0;

        scalar_type t = 0;
        int count = 0;
        for (int it = 0; it < 100; ++it)
        {
            scalar_type a = 0;
            scalar_type b = 0;
            scalar_type c = 0;
            scalar_type d = 0;
            scalar_type e = 0;

            // First, compute Hessian & gradient of error function with 
            // respect to A & B
            for (unsigned long i = 0; i < out.size(); ++i)
            {
                if (target[i])
                    t = hiTarget;
                else
                    t = loTarget;

                const scalar_type d1 = pp[i] - t;
                const scalar_type d2 = pp[i]*(1-pp[i]);
                a += out[i]*out[i]*d2;
                b += d2;
                c += out[i]*d2;
                d += out[i]*d1;
                e += d1;
            }
            
            // If gradient is really tiny, then stop.
            if (std::abs(d) < 1e-9 && std::abs(e) < 1e-9)
                break;

            scalar_type oldA = A;
            scalar_type oldB = B;
            scalar_type err = 0;

            // Loop until goodness of fit increases
            while (true)
            {
                scalar_type det = (a+lambda)*(b+lambda)-c*c;
                // if determinant of Hessian is really close to zero then increase stabilizer.
                if (std::abs(det) <= std::numeric_limits<scalar_type>::epsilon())
                {
                    lambda *= 10;
                    continue;
                }

                A = oldA + ((b+lambda)*d-c*e)/det;
                B = oldB + ((a+lambda)*e-c*d)/det;

                // Now, compute the goodness of fit
                err = 0;
                for (unsigned long i = 0; i < out.size(); ++i)
                {
                    if (target[i])
                        t = hiTarget;
                    else
                        t = loTarget;
                    scalar_type p = 1.0/(1.0+std::exp(out[i]*A+B));
                    pp[i] = p;
                    // At this step, make sure log(0) returns min_log 
                    err -= t*std::max(std::log(p),min_log) + (1-t)*std::max(std::log(1-p),min_log);
                }

                if (err < olderr*(1+1e-7))
                {
                    lambda *= 0.1;
                    break;
                }

                // error did not decrease: increase stabilizer by factor of 10 
                // & try again
                lambda *= 10;
                if (lambda >= 1e6) // something is broken. Give up
                    break;
            }

            scalar_type diff = err-olderr;
            scalar_type scale = 0.5*(err+olderr+1.0);
            if (diff > -1e-3*scale && diff < 1e-7*scale)
                ++count;
            else
                count = 0;

            olderr = err;

            if (count == 3)
                break;
        }

        return probabilistic_decision_function<K>( A, B, trainer.train(x,y) );
    }

    template <
        typename trainer_type,
        typename in_sample_vector_type,
        typename in_scalar_vector_type
        >
    const probabilistic_decision_function<typename trainer_type::kernel_type> train_probabilistic_decision_function (
        const trainer_type& trainer,
        const in_sample_vector_type& x,
        const in_scalar_vector_type& y,
        const long folds
    )
    {
        return train_probabilistic_decision_function_impl(trainer,
                                                          vector_to_matrix(x),
                                                          vector_to_matrix(y),
                                                          folds);
    }

// ----------------------------------------------------------------------------------------

    template <
        typename T,
        typename U
        >
    typename enable_if<is_matrix<T>,void>::type randomize_samples (
        T& t,
        U& u
    )
    {
        rand::kernel_1a r;

        long n = t.nr()-1;
        while (n > 0)
        {
            // put a random integer into idx
            unsigned long idx = r.get_random_32bit_number();

            // make idx be less than n
            idx %= n;

            // swap our randomly selected index into the n position
            exchange(t(idx), t(n));
            exchange(u(idx), u(n));

            --n;
        }
    }

// ----------------------------------------------------------------------------------------

    template <
        typename T,
        typename U
        >
    typename disable_if<is_matrix<T>,void>::type randomize_samples (
        T& t,
        U& u
    )
    {
        rand::kernel_1a r;

        long n = t.size()-1;
        while (n > 0)
        {
            // put a random integer into idx
            unsigned long idx = r.get_random_32bit_number();

            // make idx be less than n
            idx %= n;

            // swap our randomly selected index into the n position
            exchange(t[idx], t[n]);
            exchange(u[idx], u[n]);

            --n;
        }
    }

// ----------------------------------------------------------------------------------------

    template <
        typename T
        >
    typename enable_if<is_matrix<T>,void>::type randomize_samples (
        T& t
    )
    {
        rand::kernel_1a r;

        long n = t.nr()-1;
        while (n > 0)
        {
            // put a random integer into idx
            unsigned long idx = r.get_random_32bit_number();

            // make idx be less than n
            idx %= n;

            // swap our randomly selected index into the n position
            exchange(t(idx), t(n));

            --n;
        }
    }

// ----------------------------------------------------------------------------------------

    template <
        typename T
        >
    typename disable_if<is_matrix<T>,void>::type randomize_samples (
        T& t
    )
    {
        rand::kernel_1a r;

        long n = t.size()-1;
        while (n > 0)
        {
            // put a random integer into idx
            unsigned long idx = r.get_random_32bit_number();

            // make idx be less than n
            idx %= n;

            // swap our randomly selected index into the n position
            exchange(t[idx], t[n]);

            --n;
        }
    }

// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------

    template <
        typename K 
        >
    class svm_nu_trainer
    {
    public:
        typedef K kernel_type;
        typedef typename kernel_type::scalar_type scalar_type;
        typedef typename kernel_type::sample_type sample_type;
        typedef typename kernel_type::mem_manager_type mem_manager_type;
        typedef decision_function<kernel_type> trained_function_type;

        svm_nu_trainer (
        ) :
            nu(0.1),
            cache_size(200),
            eps(0.001)
        {
        }

        svm_nu_trainer (
            const kernel_type& kernel_, 
            const scalar_type& nu_
        ) :
            kernel_function(kernel_),
            nu(nu_),
            cache_size(200),
            eps(0.001)
        {
            // make sure requires clause is not broken
            DLIB_ASSERT(0 < nu && nu <= 1,
                "\tsvm_nu_trainer::svm_nu_trainer(kernel,nu)"
                << "\n\t invalid inputs were given to this function"
                << "\n\t nu: " << nu 
                );
        }

        void set_cache_size (
            long cache_size_
        )
        {
            // make sure requires clause is not broken
            DLIB_ASSERT(cache_size_ > 0,
                "\tvoid svm_nu_trainer::set_cache_size(cache_size_)"
                << "\n\t invalid inputs were given to this function"
                << "\n\t cache_size: " << cache_size_ 
                );
            cache_size = cache_size_;
        }

        long get_cache_size (
        ) const
        {
            return cache_size;
        }

        void set_epsilon (
            scalar_type eps_
        )
        {
            // make sure requires clause is not broken
            DLIB_ASSERT(eps_ > 0,
                "\tvoid svm_nu_trainer::set_epsilon(eps_)"
                << "\n\t invalid inputs were given to this function"
                << "\n\t eps: " << eps_ 
                );
            eps = eps_;
        }

        const scalar_type get_epsilon (
        ) const
        { 
            return eps;
        }

        void set_kernel (
            const kernel_type& k
        )
        {
            kernel_function = k;
        }

        const kernel_type& get_kernel (
        ) const
        {
            return kernel_function;
        }

        void set_nu (
            scalar_type nu_
        )
        {
            // make sure requires clause is not broken
            DLIB_ASSERT(0 < nu_ && nu_ <= 1,
                "\tvoid svm_nu_trainer::set_nu(nu_)"
                << "\n\t invalid inputs were given to this function"
                << "\n\t nu: " << nu_ 
                );
            nu = nu_;
        }

        const scalar_type get_nu (
        ) const
        {
            return nu;
        }

        template <
            typename in_sample_vector_type,
            typename in_scalar_vector_type
            >
        const decision_function<kernel_type> train (
            const in_sample_vector_type& x,
            const in_scalar_vector_type& y
        ) const
        {
            return do_train(vector_to_matrix(x), vector_to_matrix(y));
        }

        void swap (
            svm_nu_trainer& item
        )
        {
            exchange(kernel_function, item.kernel_function);
            exchange(nu,              item.nu);
            exchange(cache_size,      item.cache_size);
            exchange(eps,             item.eps);
        }

    private:

    // ------------------------------------------------------------------------------------

        template <
            typename in_sample_vector_type,
            typename in_scalar_vector_type
            >
        const decision_function<kernel_type> do_train (
            const in_sample_vector_type& x,
            const in_scalar_vector_type& y
        ) const
        {
            typedef typename K::scalar_type scalar_type;
            typedef typename decision_function<K>::sample_vector_type sample_vector_type;
            typedef typename decision_function<K>::scalar_vector_type scalar_vector_type;

            // make sure requires clause is not broken
            DLIB_ASSERT(is_binary_classification_problem(x,y) == true,
                "\tdecision_function svm_nu_trainer::train(x,y)"
                << "\n\t invalid inputs were given to this function"
                << "\n\t x.nr(): " << x.nr() 
                << "\n\t y.nr(): " << y.nr() 
                << "\n\t x.nc(): " << x.nc() 
                << "\n\t y.nc(): " << y.nc() 
                << "\n\t is_binary_classification_problem(x,y): " << ((is_binary_classification_problem(x,y))? "true":"false")
                );


            const scalar_type tau = 1e-12;
            scalar_vector_type df; // delta f(alpha)
            scalar_vector_type alpha;

            kernel_matrix_cache<K, in_sample_vector_type, in_scalar_vector_type> Q(x,y,kernel_function,cache_size);

            alpha.set_size(x.nr());
            df.set_size(x.nr());

            // now initialize alpha
            set_initial_alpha(y, nu, alpha);


            // initialize df.  Compute df = Q*alpha
            for (long r = 0; r < df.nr(); ++r)
            {
                df(r) = 0;
                for (long c = 0; c < alpha.nr(); ++c)
                {
                    if (alpha(c) != 0)
                        df(r) += Q(c,r)*alpha(c);
                }
            }

            // now perform the actual optimization of alpha
            long i, j;
            while (find_working_group(y,alpha,Q,df,tau,eps,i,j))
            {
                const scalar_type old_alpha_i = alpha(i);
                const scalar_type old_alpha_j = alpha(j);

                optimize_working_pair(y,alpha,Q,df,tau,i,j);

                // update the df vector now that we have modified alpha(i) and alpha(j)
                scalar_type delta_alpha_i = alpha(i) - old_alpha_i;
                scalar_type delta_alpha_j = alpha(j) - old_alpha_j;
                for(long k = 0; k < df.nr(); ++k)
                    df(k) += Q(k,i)*delta_alpha_i + Q(k,j)*delta_alpha_j;

            }

            scalar_type rho, b;
            calculate_rho_and_b(y,alpha,df,rho,b);
            alpha = pointwise_multiply(alpha,y)/rho;

            // count the number of support vectors
            long sv_count = 0;
            for (long i = 0; i < alpha.nr(); ++i)
            {
                if (alpha(i) != 0)
                    ++sv_count;
            }

            scalar_vector_type sv_alpha;
            sample_vector_type support_vectors;

            // size these column vectors so that they have an entry for each support vector
            sv_alpha.set_size(sv_count);
            support_vectors.set_size(sv_count);

            // load the support vectors and their alpha values into these new column matrices
            long idx = 0;
            for (long i = 0; i < alpha.nr(); ++i)
            {
                if (alpha(i) != 0)
                {
                    sv_alpha(idx) = alpha(i);
                    support_vectors(idx) = x(i);
                    ++idx;
                }
            }

            // now return the decision function
            return decision_function<K> (sv_alpha, b, kernel_function, support_vectors);
        }

    // ------------------------------------------------------------------------------------

        template <
            typename scalar_type,
            typename scalar_vector_type,
            typename scalar_vector_type2
            >
        inline void set_initial_alpha (
            const scalar_vector_type& y,
            const scalar_type nu,
            scalar_vector_type2& alpha
        ) const
        {
            set_all_elements(alpha,0);
            const scalar_type l = y.nr();
            scalar_type temp = nu*l/2;
            long num = (long)std::floor(temp);
            long num_total = (long)std::ceil(temp);

            int count = 0;
            for (int i = 0; i < alpha.nr(); ++i)
            {
                if (y(i) == 1)
                {
                    if (count < num)
                    {
                        ++count;
                        alpha(i) = 1;
                    }
                    else 
                    {
                        if (temp > num)
                        {
                            ++count;
                            alpha(i) = temp - std::floor(temp);
                        }
                        break;
                    }
                }
            }

            if (count != num_total)
            {
                std::ostringstream sout;
                sout << "invalid nu of " << nu << ".  Must be between 0 and " << (scalar_type)count/y.nr();
                throw invalid_svm_nu_error(sout.str(),nu);
            }

            count = 0;
            for (int i = 0; i < alpha.nr(); ++i)
            {
                if (y(i) == -1)
                {
                    if (count < num)
                    {
                        ++count;
                        alpha(i) = 1;
                    }
                    else 
                    {
                        if (temp > num)
                        {
                            ++count;
                            alpha(i) = temp - std::floor(temp);
                        }
                        break;
                    }
                }
            }

            if (count != num_total)
            {
                std::ostringstream sout;
                sout << "invalid nu of " << nu << ".  Must be between 0 and " << (scalar_type)count/y.nr();
                throw invalid_svm_nu_error(sout.str(),nu);
            }
        }

    // ------------------------------------------------------------------------------------

        template <
            typename sample_vector_type,
            typename scalar_vector_type,
            typename scalar_vector_type2,
            typename scalar_type
            >
        inline bool find_working_group (
            const scalar_vector_type2& y,
            const scalar_vector_type& alpha,
            const kernel_matrix_cache<K,sample_vector_type, scalar_vector_type2>& Q,
            const scalar_vector_type& df,
            const scalar_type tau,
            const scalar_type eps,
            long& i_out,
            long& j_out
        ) const
        {
            using namespace std;
            long ip = -1;
            long jp = -1;
            long in = -1;
            long jn = -1;

            scalar_type ip_val = -numeric_limits<scalar_type>::infinity();
            scalar_type jp_val = numeric_limits<scalar_type>::infinity();
            scalar_type in_val = -numeric_limits<scalar_type>::infinity();
            scalar_type jn_val = numeric_limits<scalar_type>::infinity();

            // loop over the alphas and find the maximum ip and in indices.
            for (long i = 0; i < alpha.nr(); ++i)
            {
                if (y(i) == 1)
                {
                    if (alpha(i) < 1.0)
                    {
                        if (-df(i) > ip_val)
                        {
                            ip_val = -df(i);
                            ip = i;
                        }
                    }
                }
                else
                {
                    if (alpha(i) > 0.0)
                    {
                        if (df(i) > in_val)
                        {
                            in_val = df(i);
                            in = i;
                        }
                    }
                }
            }

            scalar_type Mp = numeric_limits<scalar_type>::infinity();
            scalar_type Mn = numeric_limits<scalar_type>::infinity();
            scalar_type bp = -numeric_limits<scalar_type>::infinity();
            scalar_type bn = -numeric_limits<scalar_type>::infinity();

            // now we need to find the minimum jp and jn indices
            for (long j = 0; j < alpha.nr(); ++j)
            {
                if (y(j) == 1)
                {
                    if (alpha(j) > 0.0)
                    {
                        scalar_type b = ip_val + df(j);
                        if (-df(j) < Mp)
                            Mp = -df(j);

                        if (b > 0 && (Q.is_cached(j) || b > bp || jp == -1 ))
                        {
                            bp = b;
                            scalar_type a = Q(ip,ip) + Q(j,j) - 2*Q(j,ip); 
                            if (a <= 0)
                                a = tau;
                            scalar_type temp = -b*b/a;
                            if (temp < jp_val)
                            {
                                jp_val = temp;
                                jp = j;
                            }
                        }
                    }
                }
                else
                {
                    if (alpha(j) < 1.0)
                    {
                        scalar_type b = in_val - df(j);
                        if (df(j) < Mn)
                            Mn = df(j);

                        if (b > 0 && (Q.is_cached(j) || b > bn || jn == -1 ))
                        {
                            bn = b;
                            scalar_type a = Q(in,in) + Q(j,j) - 2*Q(j,in); 
                            if (a <= 0)
                                a = tau;
                            scalar_type temp = -b*b/a;
                            if (temp < jn_val)
                            {
                                jn_val = temp;
                                jn = j;
                            }
                        }
                    }
                }
            }

            // if we are at the optimal point then return false so the caller knows
            // to stop optimizing
            if (std::max(ip_val - Mp, in_val - Mn) < eps)
                return false;

            if (jp_val < jn_val)
            {
                i_out = ip;
                j_out = jp;
            }
            else
            {
                i_out = in;
                j_out = jn;
            }

            if (j_out >= 0 && i_out >= 0)
                return true;
            else
                return false;
        }

    // ------------------------------------------------------------------------------------

        template <
            typename scalar_vector_type,
            typename scalar_vector_type2,
            typename scalar_type
            >
        void calculate_rho_and_b(
            const scalar_vector_type2& y,
            const scalar_vector_type& alpha,
            const scalar_vector_type& df,
            scalar_type& rho, 
            scalar_type& b
        ) const
        {
            using namespace std;
            long num_p_free = 0;
            long num_n_free = 0;
            scalar_type sum_p_free = 0;
            scalar_type sum_n_free = 0;

            scalar_type upper_bound_p = -numeric_limits<scalar_type>::infinity();
            scalar_type upper_bound_n = -numeric_limits<scalar_type>::infinity();
            scalar_type lower_bound_p = numeric_limits<scalar_type>::infinity();
            scalar_type lower_bound_n = numeric_limits<scalar_type>::infinity();

            for(long i = 0; i < alpha.nr(); ++i)
            {
                if(y(i) == 1)
                {
                    if(alpha(i) == 1)
                    {
                        if (df(i) > upper_bound_p)
                            upper_bound_p = df(i);
                    }
                    else if(alpha(i) == 0)
                    {
                        if (df(i) < lower_bound_p)
                            lower_bound_p = df(i);
                    }
                    else
                    {
                        ++num_p_free;
                        sum_p_free += df(i);
                    }
                }
                else
                {
                    if(alpha(i) == 1)
                    {
                        if (df(i) > upper_bound_n)
                            upper_bound_n = df(i);
                    }
                    else if(alpha(i) == 0)
                    {
                        if (df(i) < lower_bound_n)
                            lower_bound_n = df(i);
                    }
                    else
                    {
                        ++num_n_free;
                        sum_n_free += df(i);
                    }
                }
            }

            scalar_type r1,r2;
            if(num_p_free > 0)
                r1 = sum_p_free/num_p_free;
            else
                r1 = (upper_bound_p+lower_bound_p)/2;

            if(num_n_free > 0)
                r2 = sum_n_free/num_n_free;
            else
                r2 = (upper_bound_n+lower_bound_n)/2;

            rho = (r1+r2)/2;
            b = (r1-r2)/2/rho;
        }

    // ------------------------------------------------------------------------------------

        template <
            typename sample_vector_type,
            typename scalar_vector_type,
            typename scalar_vector_type2,
            typename scalar_type
            >
        inline void optimize_working_pair (
            const scalar_vector_type2& y,
            scalar_vector_type& alpha,
            const kernel_matrix_cache<K, sample_vector_type, scalar_vector_type2>& Q,
            const scalar_vector_type& df,
            const scalar_type tau,
            const long i,
            const long j
        ) const
        {
            scalar_type quad_coef = Q(i,i)+Q(j,j)-2*Q(j,i);
            if (quad_coef <= 0)
                quad_coef = tau;
            scalar_type delta = (df(i)-df(j))/quad_coef;
            scalar_type sum = alpha(i) + alpha(j);
            alpha(i) -= delta;
            alpha(j) += delta;

            if(sum > 1)
            {
                if(alpha(i) > 1)
                {
                    alpha(i) = 1;
                    alpha(j) = sum - 1;
                }
                else if(alpha(j) > 1)
                {
                    alpha(j) = 1;
                    alpha(i) = sum - 1;
                }
            }
            else
            {
                if(alpha(j) < 0)
                {
                    alpha(j) = 0;
                    alpha(i) = sum;
                }
                else if(alpha(i) < 0)
                {
                    alpha(i) = 0;
                    alpha(j) = sum;
                }
            }
        }

    // ------------------------------------------------------------------------------------

        kernel_type kernel_function;
        scalar_type nu;
        long cache_size;
        scalar_type eps;
    }; // end of class svm_nu_trainer

// ----------------------------------------------------------------------------------------

    template <typename K>
    void swap (
        svm_nu_trainer<K>& a,
        svm_nu_trainer<K>& b
    ) { a.swap(b); }

// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
}

#endif // DLIB_SVm_