Merge branch 'master' of https://github.com/davisking/dlib into dnn_group_layer

93e786db · Fm · 59892409 · 91163863 · 93e786db · 93e786db
Commit 93e786db authored May 26, 2016 by Fm
20 changed files
--- a/dlib/algs.h
+++ b/dlib/algs.h
@@ -488,6 +488,13 @@ namespace dlib
 // ----------------------------------------------------------------------------------------
+    struct general_ {};  // Fallback dispatch tag: an overload taking general_ accepts either tag type.
+    struct special_ : general_ {};  // Preferred dispatch tag: an overload taking special_ is an exact match for special_() and so beats one taking general_.
+    template<typename> struct int_ { typedef int type; };  // Maps any type to int, so int_<decltype(expr)>::type is valid only when expr is (SFINAE helper).
+// ----------------------------------------------------------------------------------------
    /*!A is_same_object 
        This is a templated function which checks if both of its arguments are actually

--- a/dlib/dnn/core.h
+++ b/dlib/dnn/core.h
@@ -24,6 +24,38 @@
 namespace dlib
 {
+// ----------------------------------------------------------------------------------------
+    namespace impl
+    {
+        template <typename T, typename int_<decltype(&T::get_learning_rate_multiplier)>::type = 0>  // overload is viable only when &T::get_learning_rate_multiplier is well-formed
+        double get_learning_rate_multiplier (
+            const T& obj,
+            special_
+        ) { return obj.get_learning_rate_multiplier(); }  // forwards to the member function of obj
+        template <typename T>
+        double get_learning_rate_multiplier ( const T& obj, general_) { return 1; }  // fallback used when the overload above is removed by SFINAE: obj is ignored and 1 is returned
+    }
+    template <typename T>
+    double get_learning_rate_multiplier(const T& obj) { return impl::get_learning_rate_multiplier(obj, special_()); }  // public entry point; passing special_() selects the member-forwarding overload whenever it is viable
+// ----------------------------------------------------------------------------------------
+    namespace impl
+    {
+        template <typename T, typename int_<decltype(&T::get_weight_decay_multiplier)>::type = 0>  // overload is viable only when &T::get_weight_decay_multiplier is well-formed
+        double get_weight_decay_multiplier (
+            const T& obj,
+            special_
+        ) { return obj.get_weight_decay_multiplier(); }  // forwards to the member function of obj
+        template <typename T>
+        double get_weight_decay_multiplier ( const T& obj, general_) { return 1; }  // fallback used when the overload above is removed by SFINAE: obj is ignored and 1 is returned
+    }
+    template <typename T>
+    double get_weight_decay_multiplier(const T& obj) { return impl::get_weight_decay_multiplier(obj, special_()); }  // public entry point; passing special_() selects the member-forwarding overload whenever it is viable
 // ----------------------------------------------------------------------------------------
    namespace impl
@@ -458,7 +490,7 @@ namespace dlib
        sstack pop(size_t num=1) 
        { 
-            DLIB_CASSERT(num < size(), "You can't pop more things from the stack than it has in it.");
+            DLIB_CASSERT(num <= size(), "You can't pop more things from the stack than it has in it.");  // <= rather than <: popping all size() elements is permitted
            return sstack(data+num, mysize-num);
        }
@@ -849,8 +881,9 @@ namespace dlib
        void update_parameters(sstack<solver_type> solvers, double learning_rate)
        {
            DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
-            // Don't try to adjust the parameters if this layer doesn't have any.
+            // Don't try to adjust the parameters if this layer doesn't have any or the
-            if (params_grad.size() != 0)
+            // learning rate is disabled for this layer.
+            if (params_grad.size() != 0 && get_learning_rate_multiplier(details) != 0)
            {
                const tensor& step = solvers.top()(learning_rate, details, static_cast<const tensor&>(params_grad));
                tt::add(details.get_layer_params(), details.get_layer_params(), step);
@@ -1200,8 +1233,9 @@ namespace dlib
        void update_parameters(sstack<solver_type> solvers, double learning_rate)
        {
            DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
-            // Don't try to adjust the parameters if this layer doesn't have any.
+            // Don't try to adjust the parameters if this layer doesn't have any or the
-            if (params_grad.size() != 0) 
+            // learning rate is disabled for this layer.
+            if (params_grad.size() != 0 && get_learning_rate_multiplier(details) != 0) 
            {
                const tensor& step = solvers.top()(learning_rate, details, static_cast<const tensor&>(params_grad));
                tt::add(details.get_layer_params(), details.get_layer_params(), step);
@@ -1817,9 +1851,7 @@ namespace dlib
    public:
        typedef INPUT_LAYER subnet_type;
        typedef typename subnet_type::input_type input_type;
-        // This layer counts as a computational layer because it copies and stores the
+        const static size_t num_computational_layers = 0;
-        // inputs.
-        const static size_t num_computational_layers = 1;
        const static size_t num_layers = 2;
        const static unsigned int sample_expansion_factor = subnet_type::sample_expansion_factor;
        static_assert(sample_expansion_factor >= 1,

--- a/dlib/dnn/core_abstract.h
+++ b/dlib/dnn/core_abstract.h
@@ -67,6 +67,32 @@ namespace dlib
              (except computes it using a numerically accurate method)
    !*/
+// ----------------------------------------------------------------------------------------
+    template <typename T>
+    double get_learning_rate_multiplier(
+        const T& obj
+    ); 
+    /*!
+        ensures
+            - if (obj has a get_learning_rate_multiplier() member function) then
+                - returns obj.get_learning_rate_multiplier()
+            - else
+                - returns 1
+            - Which of the two cases applies is decided at compile time from the type T.
+            - The update_parameters() routines in core.h leave a layer's parameters
+              untouched when this function returns 0 for the layer's details object.
+    !*/
+    template <typename T>
+    double get_weight_decay_multiplier(
+        const T& obj
+    ); 
+    /*!
+        ensures
+            - if (obj has a get_weight_decay_multiplier() member function) then
+                - returns obj.get_weight_decay_multiplier()
+            - else
+                - returns 1
+            - Which of the two cases applies is decided at compile time from the type T.
+    !*/
 // ----------------------------------------------------------------------------------------
    bool dnn_prefer_fastest_algorithms(
@@ -152,7 +178,7 @@ namespace dlib
        ); 
        /*!
            requires
-                - num < size()
+                - num <= size()
            ensures
                - returns a reference to the sub-stack S such that:
                    - S.size() == size()-num.

--- a/dlib/dnn/cpu_dlib.cpp
+++ b/dlib/dnn/cpu_dlib.cpp
@@ -385,6 +385,30 @@ namespace dlib
                d[i] = A*s1[i] + B*s2[i] + C*s3[i] + D;
        }
+        void affine_transform_range(  // sets dest[i] = A*src1[i] + B*src2[i] + C*src3[i] for every i in [begin, end)
+            size_t begin,  // first element index that is written
+            size_t end,  // one past the last element index that is written
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2,
+            const tensor& src3,
+            const float A,
+            const float B,
+            const float C
+        )
+        {
+            DLIB_CASSERT(dest.size()==src1.size(),"");  // all four tensors must hold the same number of elements
+            DLIB_CASSERT(dest.size()==src2.size(),"");
+            DLIB_CASSERT(dest.size()==src3.size(),"");
+            DLIB_CASSERT(begin <= end && end <= dest.size(),"");  // range must lie inside the tensors; an empty range (begin == end) is accepted
+            const auto d = dest.host();  // presumably pointers to the host (CPU) copies of the data -- TODO confirm against tensor::host()
+            const auto s1 = src1.host();
+            const auto s2 = src2.host();
+            const auto s3 = src3.host();
+            for (size_t i = begin; i < end; ++i)  // elements outside [begin, end) are not written by this loop
+                d[i] = A*s1[i] + B*s2[i] + C*s3[i];
+        }
    // -----------------------------------------------------------------------------------
        void affine_transform(
@@ -464,6 +488,8 @@ namespace dlib
    // -----------------------------------------------------------------------------------
        void compute_adam_update (
+            size_t begin,
+            size_t end,
            tensor& s,
            tensor& m,
            tensor& v,
@@ -480,6 +506,7 @@ namespace dlib
                         s.size() == v.size() &&
                         s.size() == params.size() &&
                         s.size() == params_grad.size(),"");
+            DLIB_CASSERT(begin <= end && end <= params.size(),"");
            const float eps = 1e-8;
            const float alpha = learning_rate*std::sqrt(1-std::pow(momentum2,t))/(1-std::pow(momentum1, t));
@@ -492,7 +519,7 @@ namespace dlib
            auto ps = s.host_write_only();
            auto pparams = params.host();
            auto ppgrad = params_grad.host();
-            for (size_t i = 0; i < params.size(); ++i)
+            for (size_t i = begin; i < end; ++i)
            {
                float g = weight_decay*pparams[i] + ppgrad[i];
                pm[i] = momentum1*pm[i] + (1-momentum1)*g;
@@ -504,6 +531,7 @@ namespace dlib
    // -----------------------------------------------------------------------------------
        void batch_normalize_inference (
+            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma, 
@@ -519,7 +547,8 @@ namespace dlib
                gamma.k()  == src.k() &&
                have_same_dimensions(gamma, beta) &&
                have_same_dimensions(gamma, running_means) &&
-                have_same_dimensions(gamma, running_variances), 
+                have_same_dimensions(gamma, running_variances) && 
+                eps > 0, 
                "\ngamma.num_samples(): " << gamma.num_samples() << 
                "\ngamma.k():  " << gamma.k() << 
                "\ngamma.nr(): " << gamma.nr() << 
@@ -538,7 +567,8 @@ namespace dlib
                "\nrunning_variances.nc():  " << running_variances.nc() << 
                "\nsrc.k():   " << src.k() << 
                "\nsrc.nr():  " << src.nr() << 
-                "\nsrc.nc():  " << src.nc() 
+                "\nsrc.nc():  " << src.nc() <<
+                "\neps:  " << eps 
            );
            dest.copy_size(src);
@@ -554,7 +584,7 @@ namespace dlib
            {
                for (long k = 0; k < num; ++k)
                {
-                    *d = g[k]*(*s - m[k])/std::sqrt(v[k]+dlib::tt::BATCH_NORM_EPS) + b[k];
+                    *d = g[k]*(*s - m[k])/std::sqrt(v[k]+eps) + b[k];
                    ++d;
                    ++s;
                }
@@ -562,6 +592,7 @@ namespace dlib
        }
        void batch_normalize (
+            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
@@ -582,7 +613,8 @@ namespace dlib
                beta.num_samples() == 1 && 
                gamma.nr() == beta.nr() && beta.nr() == src.nr() &&
                gamma.nc() == beta.nc() && beta.nc() == src.nc() &&
-                gamma.k()  == beta.k()  && beta.k() == src.k(), 
+                gamma.k()  == beta.k()  && beta.k() == src.k() &&
+                eps > 0, 
                "\ngamma.num_samples(): " << gamma.num_samples() << 
                "\ngamma.k():  " << gamma.k() << 
                "\ngamma.nr(): " << gamma.nr() << 
@@ -593,7 +625,8 @@ namespace dlib
                "\nbeta.nc():  " << beta.nc() << 
                "\nsrc.k():   " << src.k() << 
                "\nsrc.nr():  " << src.nr() << 
-                "\nsrc.nc():  " << src.nc() 
+                "\nsrc.nc():  " << src.nc() <<
+                "\neps:  " << eps 
            );
            dest.copy_size(src);
@@ -635,7 +668,7 @@ namespace dlib
                else
                    rvar[i] = (1-averaging_factor)*rvar[i] + scale*averaging_factor*actual_var;
-                p_invstds[i] = 1.0f/std::sqrt(actual_var + dlib::tt::BATCH_NORM_EPS);
+                p_invstds[i] = 1.0f/std::sqrt(actual_var + eps);
            }
            p_src = src.host();
@@ -662,6 +695,7 @@ namespace dlib
        }
        void batch_normalize_gradient (
+            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
@@ -682,6 +716,7 @@ namespace dlib
            DLIB_CASSERT(num == beta_grad.size(),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
+            DLIB_CASSERT(eps > 0,"");
            beta_grad = 0;
            gamma_grad = 0;
@@ -757,6 +792,7 @@ namespace dlib
    // ----------------------------------------------------------------------------------------
        void batch_normalize_conv_inference (
+            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma, 
@@ -772,7 +808,8 @@ namespace dlib
                gamma.k()  == src.k() &&
                have_same_dimensions(gamma, beta) &&
                have_same_dimensions(gamma, running_means) &&
-                have_same_dimensions(gamma, running_variances), 
+                have_same_dimensions(gamma, running_variances) &&
+                eps > 0, 
                "\ngamma.num_samples(): " << gamma.num_samples() << 
                "\ngamma.k():  " << gamma.k() << 
                "\ngamma.nr(): " << gamma.nr() << 
@@ -791,7 +828,8 @@ namespace dlib
                "\nrunning_variances.nc():  " << running_variances.nc() << 
                "\nsrc.k():   " << src.k() << 
                "\nsrc.nr():  " << src.nr() << 
-                "\nsrc.nc():  " << src.nc() 
+                "\nsrc.nc():  " << src.nc() <<
+                "\neps:  " << eps 
            );
            dest.copy_size(src);
@@ -807,7 +845,7 @@ namespace dlib
            {
                for (long k = 0; k < src.k(); ++k)
                {
-                    const float invstd = 1.0f/std::sqrt(v[k] + dlib::tt::BATCH_NORM_EPS);
+                    const float invstd = 1.0f/std::sqrt(v[k] + eps);
                    for (long j = 0; j < num; ++j)
                    {
                        *d = g[k]*(*s - m[k])*invstd + b[k];
@@ -819,6 +857,7 @@ namespace dlib
        }
        void batch_normalize_conv (
+            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
@@ -841,7 +880,8 @@ namespace dlib
                beta.nr() == 1 && 
                gamma.nc() == 1 && 
                beta.nc() == 1 && 
-                gamma.k()  == beta.k()  && beta.k() == src.k(), 
+                gamma.k()  == beta.k()  && beta.k() == src.k() &&
+                eps > 0, 
                "\ngamma.num_samples(): " << gamma.num_samples() << 
                "\ngamma.k():  " << gamma.k() << 
                "\ngamma.nr(): " << gamma.nr() << 
@@ -852,7 +892,8 @@ namespace dlib
                "\nbeta.nc():  " << beta.nc() << 
                "\nsrc.k():   " << src.k() << 
                "\nsrc.nr():  " << src.nr() << 
-                "\nsrc.nc():  " << src.nc() 
+                "\nsrc.nc():  " << src.nc()  <<
+                "\neps:  " << eps 
            );
            dest.copy_size(src);
@@ -900,7 +941,7 @@ namespace dlib
                else
                    rvar[k] = (1-averaging_factor)*rvar[k] + scale*averaging_factor*actual_var;
-                p_invstds[k] = 1.0f/std::sqrt(actual_var + dlib::tt::BATCH_NORM_EPS);
+                p_invstds[k] = 1.0f/std::sqrt(actual_var + eps);
            }
            p_src = src.host();
@@ -928,6 +969,7 @@ namespace dlib
        }
        void batch_normalize_conv_gradient(
+            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
@@ -948,6 +990,7 @@ namespace dlib
            DLIB_CASSERT(src.k() == beta_grad.size(),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
+            DLIB_CASSERT(eps > 0,"");
            beta_grad = 0;
            gamma_grad = 0;

--- a/dlib/dnn/cpu_dlib.h
+++ b/dlib/dnn/cpu_dlib.h
@@ -81,6 +81,18 @@ namespace dlib
            const float D
        );
+        void affine_transform_range(  // declaration; the cpu_dlib.cpp definition sets dest[i] = A*src1[i] + B*src2[i] + C*src3[i] for i in [begin, end)
+            size_t begin,  // first element index that is written
+            size_t end,  // one past the last element index; the definition asserts begin <= end <= dest.size()
+            tensor& dest,  // the definition asserts dest, src1, src2 and src3 all have the same size()
+            const tensor& src1,
+            const tensor& src2,
+            const tensor& src3,
+            const float A,
+            const float B,
+            const float C
+        );
    // -----------------------------------------------------------------------------------
        void affine_transform(
@@ -102,6 +114,8 @@ namespace dlib
    // -----------------------------------------------------------------------------------
        void compute_adam_update (
+            size_t begin,
+            size_t end,
            tensor& s,
            tensor& m,
            tensor& v,
@@ -117,6 +131,7 @@ namespace dlib
    // -----------------------------------------------------------------------------------
        void batch_normalize_inference (
+            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma, 
@@ -126,6 +141,7 @@ namespace dlib
        );
        void batch_normalize (
+            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
@@ -138,6 +154,7 @@ namespace dlib
        );
        void batch_normalize_gradient (
+            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
@@ -149,6 +166,7 @@ namespace dlib
        );
        void batch_normalize_conv_inference (
+            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma, 
@@ -158,6 +176,7 @@ namespace dlib
        );
        void batch_normalize_conv (
+            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
@@ -170,6 +189,7 @@ namespace dlib
        );
        void batch_normalize_conv_gradient (
+            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,

--- a/dlib/dnn/cuda_dlib.cu
+++ b/dlib/dnn/cuda_dlib.cu
@@ -504,6 +504,40 @@ namespace dlib
                src2.device(), src3.device(), dest.size(), A, B, C, D);
        }
+    // ----------------------------------------------------------------------------------------
+        __global__ void _cuda_affine_transform_range(  // CUDA kernel: d[i] = A*s1[i] + B*s2[i] + C*s3[i] over the index range [begin, end)
+            float* d, const float* s1, const float* s2, const float* s3, size_t begin, size_t end, float A, float B, float C
+        )
+        {
+            for (auto i : grid_stride_range(begin, end))  // visits the indices produced by grid_stride_range(begin, end); NOTE(review): how these are split across threads is defined by that helper, not shown here
+            {
+                d[i] = A*s1[i] + B*s2[i] + C*s3[i];
+            }
+        }
+        void affine_transform_range(  // host-side wrapper: validates the arguments, then launches _cuda_affine_transform_range over [begin, end)
+            size_t begin,  // first element index handed to the kernel
+            size_t end,  // one past the last element index handed to the kernel
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2,
+            const tensor& src3,
+            const float A,
+            const float B,
+            const float C
+        )
+        {
+            DLIB_CASSERT(dest.size()==src1.size(),"");  // all four tensors must hold the same number of elements
+            DLIB_CASSERT(dest.size()==src2.size(),"");
+            DLIB_CASSERT(dest.size()==src3.size(),"");
+            DLIB_CASSERT(begin <= end && end <= dest.size(),"");  // range must lie inside the tensors; begin == end passes this check
+            launch_kernel(_cuda_affine_transform_range,max_jobs(end-begin),  // job count is the length of the range, not dest.size()
+                dest.device(), src1.device(),  // presumably device (GPU) data pointers -- TODO confirm against tensor::device()
+                src2.device(), src3.device(), begin, end, A, B, C);
+        }
    // -----------------------------------------------------------------------------------
        __global__ void _cuda_affine_transform2(float* d, const float* s, size_t n, const float* A, const float* B)
@@ -549,7 +583,8 @@ namespace dlib
    // ----------------------------------------------------------------------------------------
        __global__ void _cuda_compute_adam_update(
-            size_t n,
+            size_t begin,
+            size_t end,
            float* s,
            float* m,
            float* v,
@@ -566,7 +601,7 @@ namespace dlib
            //   m = momentum1*m + (1-momentum1)    *   (weight_decay*params + params_grad);
            //   v = momentum2*v + (1-momentum2)*squared(weight_decay*params + params_grad);
            //   s = -alpha*m/(sqrt(v) + eps);
-            for (auto i : grid_stride_range(0, n))
+            for (auto i : grid_stride_range(begin, end))
            {
                float g = (weight_decay*params[i] + params_grad[i]);
                m[i] = momentum1*m[i] + (1-momentum1)*g;
@@ -576,6 +611,8 @@ namespace dlib
        }
        void compute_adam_update (
+            size_t begin,
+            size_t end,
            tensor& s,
            tensor& m,
            tensor& v,
@@ -592,10 +629,11 @@ namespace dlib
                         s.size() == v.size() &&
                         s.size() == params.size() &&
                         s.size() == params_grad.size(),"");
+            DLIB_CASSERT(begin <= end && end <= params.size(),"");
            const float alpha = learning_rate*std::sqrt(1-std::pow(momentum2,t))/(1-std::pow(momentum1, t));
-            launch_kernel(_cuda_compute_adam_update,max_jobs(s.size()),
+            launch_kernel(_cuda_compute_adam_update,max_jobs(end-begin),
-                    s.size(), s.device(), m.device(), v.device(), alpha, weight_decay,
+                    begin, end, s.device(), m.device(), v.device(), alpha, weight_decay,
                    momentum1, momentum2, params.device(), params_grad.device());
        }

--- a/dlib/dnn/cuda_dlib.h
+++ b/dlib/dnn/cuda_dlib.h
@@ -164,6 +164,18 @@ namespace dlib
            const float D
        );
+        void affine_transform_range(  // declaration; the cuda_dlib.cu definition launches a kernel computing dest[i] = A*src1[i] + B*src2[i] + C*src3[i] over [begin, end)
+            size_t begin,  // first element index of the range
+            size_t end,  // one past the last element index; the definition asserts begin <= end <= dest.size()
+            tensor& dest,  // the definition asserts dest, src1, src2 and src3 all have the same size()
+            const tensor& src1,
+            const tensor& src2,
+            const tensor& src3,
+            const float A,
+            const float B,
+            const float C
+        );
        // Note that this function isn't in the tt:: namespace because add_scaled() is
        // called by cuda::add() so we don't need a tt:: version of add_scaled().  
        void add_scaled(
@@ -193,6 +205,8 @@ namespace dlib
    // ----------------------------------------------------------------------------------------
        void compute_adam_update (
+            size_t begin,
+            size_t end,
            tensor& s,
            tensor& m,
            tensor& v,

--- a/dlib/dnn/cudnn_dlibapi.cpp
+++ b/dlib/dnn/cudnn_dlibapi.cpp
@@ -338,6 +338,7 @@ namespace dlib
    // ------------------------------------------------------------------------------------
        void batch_normalize_inference (
+            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma, 
@@ -353,7 +354,8 @@ namespace dlib
                gamma.k()  == src.k() &&
                have_same_dimensions(gamma, beta) &&
                have_same_dimensions(gamma, running_means) &&
-                have_same_dimensions(gamma, running_variances), 
+                have_same_dimensions(gamma, running_variances) && 
+                eps > 0, 
                "\ngamma.num_samples(): " << gamma.num_samples() << 
                "\ngamma.k():  " << gamma.k() << 
                "\ngamma.nr(): " << gamma.nr() << 
@@ -372,7 +374,8 @@ namespace dlib
                "\nrunning_variances.nc():  " << running_variances.nc() << 
                "\nsrc.k():   " << src.k() << 
                "\nsrc.nr():  " << src.nr() << 
-                "\nsrc.nc():  " << src.nc() 
+                "\nsrc.nc():  " << src.nc() <<
+                "\neps:  " << eps 
            );
            const float in_scale = 1;
            const float out_scale = 0;
@@ -393,10 +396,11 @@ namespace dlib
                                beta.device(),
                                running_means.device(),
                                running_variances.device(),
-                                dlib::tt::BATCH_NORM_EPS));
+                                eps));
        }
        void batch_normalize (
+            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
@@ -417,7 +421,8 @@ namespace dlib
                beta.num_samples() == 1 && 
                gamma.nr() == beta.nr() && beta.nr() == src.nr() &&
                gamma.nc() == beta.nc() && beta.nc() == src.nc() &&
-                gamma.k()  == beta.k()  && beta.k() == src.k(), 
+                gamma.k()  == beta.k()  && beta.k() == src.k() &&
+                eps > 0, 
                "\ngamma.num_samples(): " << gamma.num_samples() << 
                "\ngamma.k():  " << gamma.k() << 
                "\ngamma.nr(): " << gamma.nr() << 
@@ -428,7 +433,8 @@ namespace dlib
                "\nbeta.nc():  " << beta.nc() << 
                "\nsrc.k():   " << src.k() << 
                "\nsrc.nr():  " << src.nr() << 
-                "\nsrc.nc():  " << src.nc() 
+                "\nsrc.nc():  " << src.nc() <<
+                "\neps:  " << eps 
            );
            const float in_scale = 1;
@@ -455,12 +461,13 @@ namespace dlib
                                averaging_factor,
                                running_means.device(),
                                running_variances.device(),
-                                dlib::tt::BATCH_NORM_EPS,
+                                eps,
                                means.device(),
                                invstds.device()));
        }
        void batch_normalize_gradient(
+            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
@@ -480,6 +487,7 @@ namespace dlib
            DLIB_CASSERT(num == beta_grad.size(),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
+            DLIB_CASSERT(eps > 0,"");
            const float in_scale = 1;
            const float out_scale = 1;
@@ -503,7 +511,7 @@ namespace dlib
                                gamma.device(),
                                gamma_grad.device(),
                                beta_grad.device(),
-                                dlib::tt::BATCH_NORM_EPS,
+                                eps,
                                means.device(),
                                invstds.device()));
        }
@@ -511,6 +519,7 @@ namespace dlib
    // ------------------------------------------------------------------------------------
        void batch_normalize_conv_inference (
+            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma, 
@@ -526,7 +535,8 @@ namespace dlib
                gamma.k()  == src.k() &&
                have_same_dimensions(gamma, beta) &&
                have_same_dimensions(gamma, running_means) &&
-                have_same_dimensions(gamma, running_variances), 
+                have_same_dimensions(gamma, running_variances) &&
+                eps > 0, 
                "\ngamma.num_samples(): " << gamma.num_samples() << 
                "\ngamma.k():  " << gamma.k() << 
                "\ngamma.nr(): " << gamma.nr() << 
@@ -545,7 +555,8 @@ namespace dlib
                "\nrunning_variances.nc():  " << running_variances.nc() << 
                "\nsrc.k():   " << src.k() << 
                "\nsrc.nr():  " << src.nr() << 
-                "\nsrc.nc():  " << src.nc() 
+                "\nsrc.nc():  " << src.nc() <<
+                "\neps:  " << eps 
            );
            const float in_scale = 1;
            const float out_scale = 0;
@@ -566,10 +577,11 @@ namespace dlib
                                beta.device(),
                                running_means.device(),
                                running_variances.device(),
-                                dlib::tt::BATCH_NORM_EPS));
+                                eps));
        }
        void batch_normalize_conv (
+            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
@@ -592,7 +604,8 @@ namespace dlib
                beta.nr() == 1 && 
                gamma.nc() == 1 && 
                beta.nc() == 1 && 
-                gamma.k()  == beta.k()  && beta.k() == src.k(), 
+                gamma.k()  == beta.k()  && beta.k() == src.k() &&
+                eps > 0, 
                "\ngamma.num_samples(): " << gamma.num_samples() << 
                "\ngamma.k():  " << gamma.k() << 
                "\ngamma.nr(): " << gamma.nr() << 
@@ -603,7 +616,8 @@ namespace dlib
                "\nbeta.nc():  " << beta.nc() << 
                "\nsrc.k():   " << src.k() << 
                "\nsrc.nr():  " << src.nr() << 
-                "\nsrc.nc():  " << src.nc() 
+                "\nsrc.nc():  " << src.nc() <<
+                "\neps:  " << eps 
            );
            const float in_scale = 1;
            const float out_scale = 0;
@@ -629,12 +643,13 @@ namespace dlib
                                averaging_factor,
                                running_means.device(),
                                running_variances.device(),
-                                dlib::tt::BATCH_NORM_EPS,
+                                eps,
                                means.device(),
                                invstds.device()));
        }
        void batch_normalize_conv_gradient(
+            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
@@ -653,6 +668,7 @@ namespace dlib
            DLIB_CASSERT(src.k() == beta_grad.size(),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
+            DLIB_CASSERT(eps > 0,"");
            const float in_scale = 1;
            const float out_scale = 1;
@@ -676,7 +692,7 @@ namespace dlib
                                gamma.device(),
                                gamma_grad.device(),
                                beta_grad.device(),
-                                dlib::tt::BATCH_NORM_EPS,
+                                eps,
                                means.device(),
                                invstds.device()));
        }

--- a/dlib/dnn/cudnn_dlibapi.h
+++ b/dlib/dnn/cudnn_dlibapi.h
@@ -135,6 +135,7 @@ namespace dlib
    // ------------------------------------------------------------------------------------
        void batch_normalize_inference (
+            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma, 
@@ -144,6 +145,7 @@ namespace dlib
        );
        void batch_normalize (
+            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
@@ -156,6 +158,7 @@ namespace dlib
        );
        void batch_normalize_gradient(
+            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
@@ -169,6 +172,7 @@ namespace dlib
    // ------------------------------------------------------------------------------------
        void batch_normalize_conv_inference (
+            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma, 
@@ -178,6 +182,7 @@ namespace dlib
        );
        void batch_normalize_conv (
+            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
@@ -190,6 +195,7 @@ namespace dlib
        );
        void batch_normalize_conv_gradient(
+            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,

--- a/dlib/dnn/layers.h
+++ b/dlib/dnn/layers.h
@@ -42,6 +42,10 @@ namespace dlib
        con_(
        ) : 
+            learning_rate_multiplier(1),  // default-constructed layers start with a learning rate multiplier of 1
+            weight_decay_multiplier(1),  // ... and a weight decay multiplier of 1
+            bias_learning_rate_multiplier(1),  // ... and a bias learning rate multiplier of 1
+            bias_weight_decay_multiplier(0),  // the only multiplier that defaults to 0 rather than 1
            padding_y_(_padding_y),
            padding_x_(_padding_x)
        {}
@@ -54,12 +58,27 @@ namespace dlib
        long padding_y() const { return padding_y_; }
        long padding_x() const { return padding_x_; }
+        double get_learning_rate_multiplier () const  { return learning_rate_multiplier; }
+        double get_weight_decay_multiplier () const   { return weight_decay_multiplier; }
+        void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; }
+        void set_weight_decay_multiplier(double val)  { weight_decay_multiplier  = val; }
+        double get_bias_learning_rate_multiplier () const  { return bias_learning_rate_multiplier; }
+        double get_bias_weight_decay_multiplier () const   { return bias_weight_decay_multiplier; }
+        void set_bias_learning_rate_multiplier(double val) { bias_learning_rate_multiplier = val; }
+        void set_bias_weight_decay_multiplier(double val)  { bias_weight_decay_multiplier  = val; }
        con_ (
            const con_& item
        ) : 
            params(item.params),
            filters(item.filters),
            biases(item.biases),
+            learning_rate_multiplier(item.learning_rate_multiplier),
+            weight_decay_multiplier(item.weight_decay_multiplier),
+            bias_learning_rate_multiplier(item.bias_learning_rate_multiplier),
+            bias_weight_decay_multiplier(item.bias_weight_decay_multiplier),
            padding_y_(item.padding_y_),
            padding_x_(item.padding_x_)
        {
@@ -81,6 +100,10 @@ namespace dlib
            biases = item.biases;
            padding_y_ = item.padding_y_;
            padding_x_ = item.padding_x_;
+            learning_rate_multiplier = item.learning_rate_multiplier;
+            weight_decay_multiplier = item.weight_decay_multiplier;
+            bias_learning_rate_multiplier = item.bias_learning_rate_multiplier;
+            bias_weight_decay_multiplier = item.bias_weight_decay_multiplier;
            return *this;
        }
@@ -121,18 +144,22 @@ namespace dlib
        void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
        {
            conv.get_gradient_for_data (gradient_input, filters(params,0), sub.get_gradient_input());
+            // no point computing the parameter gradients if they won't be used.
+            if (learning_rate_multiplier != 0)
+            {
                auto filt = filters(params_grad,0);
                conv.get_gradient_for_filters (gradient_input, sub.get_output(), filt);
                auto b = biases(params_grad, filters.size());
                tt::assign_conv_bias_gradient(b, gradient_input);
            }
+        }
        const tensor& get_layer_params() const { return params; }
        tensor& get_layer_params() { return params; }
        friend void serialize(const con_& item, std::ostream& out)
        {
-            serialize("con_2", out);
+            serialize("con_3", out);
            serialize(item.params, out);
            serialize(_num_filters, out);
            serialize(_nr, out);
@@ -143,6 +170,10 @@ namespace dlib
            serialize(item.padding_x_, out);
            serialize(item.filters, out);
            serialize(item.biases, out);
+            serialize(item.learning_rate_multiplier, out);
+            serialize(item.weight_decay_multiplier, out);
+            serialize(item.bias_learning_rate_multiplier, out);
+            serialize(item.bias_weight_decay_multiplier, out);
        }
        friend void deserialize(con_& item, std::istream& in)
@@ -167,7 +198,7 @@ namespace dlib
                item.padding_y_ = nr/2;
                item.padding_x_ = nc/2;
            }
-            else if (version == "con_2")
+            else if (version == "con_2" || version == "con_3")
            {
                deserialize(item.params, in);
                deserialize(num_filters, in);
@@ -180,6 +211,23 @@ namespace dlib
                deserialize(item.filters, in);
                deserialize(item.biases, in);
+                if (version == "con_3")
+                {
+                    deserialize(item.learning_rate_multiplier, in);
+                    deserialize(item.weight_decay_multiplier, in);
+                    deserialize(item.bias_learning_rate_multiplier, in);
+                    deserialize(item.bias_weight_decay_multiplier, in);
+                }
+                else
+                {
+                    // Previous versions didn't have these parameters, so they were
+                    // implicitly 1.
+                    item.learning_rate_multiplier = 1;
+                    item.weight_decay_multiplier = 1;
+                    item.bias_learning_rate_multiplier = 1;
+                    item.bias_weight_decay_multiplier = 1;
+                }
                if (item.padding_y_ != _padding_y) throw serialization_error("Wrong padding_y found while deserializing dlib::con_");
                if (item.padding_x_ != _padding_x) throw serialization_error("Wrong padding_x found while deserializing dlib::con_");
            }
@@ -207,6 +255,10 @@ namespace dlib
                << ", padding_y="<<item.padding_y_
                << ", padding_x="<<item.padding_x_
                << ")";
+            out << " learning_rate_mult="<<item.learning_rate_multiplier;
+            out << " weight_decay_mult="<<item.weight_decay_multiplier;
+            out << " bias_learning_rate_mult="<<item.bias_learning_rate_multiplier;
+            out << " bias_weight_decay_mult="<<item.bias_weight_decay_multiplier;
            return out;
        }
@@ -217,6 +269,10 @@ namespace dlib
        alias_tensor filters, biases;
        tt::tensor_conv conv;
+        double learning_rate_multiplier;
+        double weight_decay_multiplier;
+        double bias_learning_rate_multiplier;
+        double bias_weight_decay_multiplier;
        // These are here only because older versions of con (which you might encounter
        // serialized to disk) used different padding settings.
@@ -594,20 +650,43 @@ namespace dlib
        FC_MODE = 1
    };
+    const double DEFAULT_BATCH_NORM_EPS = 0.00001;
    template <
        layer_mode mode
        >
    class bn_
    {
    public:
-        bn_() : num_updates(0), running_stats_window_size(1000)
+        explicit bn_(
+            unsigned long window_size,
+            double eps_ = DEFAULT_BATCH_NORM_EPS
+        ) : 
+            num_updates(0), 
+            running_stats_window_size(window_size),
+            learning_rate_multiplier(1),
+            weight_decay_multiplier(0),
+            bias_learning_rate_multiplier(1),
+            bias_weight_decay_multiplier(1),
+            eps(eps_)
        {}
-        explicit bn_(unsigned long window_size) : num_updates(0), running_stats_window_size(window_size)
+        bn_() : bn_(1000) {}
-        {}
        layer_mode get_mode() const { return mode; }
        unsigned long get_running_stats_window_size () const { return running_stats_window_size; }
+        double get_eps() const { return eps; }
+        double get_learning_rate_multiplier () const  { return learning_rate_multiplier; }
+        double get_weight_decay_multiplier () const   { return weight_decay_multiplier; }
+        void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; }
+        void set_weight_decay_multiplier(double val)  { weight_decay_multiplier  = val; }
+        double get_bias_learning_rate_multiplier () const  { return bias_learning_rate_multiplier; }
+        double get_bias_weight_decay_multiplier () const   { return bias_weight_decay_multiplier; }
+        void set_bias_learning_rate_multiplier(double val) { bias_learning_rate_multiplier = val; }
+        void set_bias_weight_decay_multiplier(double val)  { bias_weight_decay_multiplier  = val; }
        template <typename SUBNET>
        void setup (const SUBNET& sub)
@@ -648,16 +727,16 @@ namespace dlib
                if (num_updates <running_stats_window_size)
                    ++num_updates;
                if (mode == FC_MODE)
-                    tt::batch_normalize(output, means, invstds, decay, running_means, running_variances, sub.get_output(), g, b);
+                    tt::batch_normalize(eps, output, means, invstds, decay, running_means, running_variances, sub.get_output(), g, b);
                else 
-                    tt::batch_normalize_conv(output, means, invstds, decay, running_means, running_variances, sub.get_output(), g, b);
+                    tt::batch_normalize_conv(eps, output, means, invstds, decay, running_means, running_variances, sub.get_output(), g, b);
            }
            else // we are running in testing mode so we just linearly scale the input tensor.
            {
                if (mode == FC_MODE)
-                    tt::batch_normalize_inference(output, sub.get_output(), g, b, running_means, running_variances);
+                    tt::batch_normalize_inference(eps, output, sub.get_output(), g, b, running_means, running_variances);
                else
-                    tt::batch_normalize_conv_inference(output, sub.get_output(), g, b, running_means, running_variances);
+                    tt::batch_normalize_conv_inference(eps, output, sub.get_output(), g, b, running_means, running_variances);
            }
        } 
@@ -668,9 +747,9 @@ namespace dlib
            auto g_grad = gamma(params_grad, 0);
            auto b_grad = beta(params_grad, gamma.size());
            if (mode == FC_MODE)
-                tt::batch_normalize_gradient(gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad );
+                tt::batch_normalize_gradient(eps, gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad );
            else
-                tt::batch_normalize_conv_gradient(gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad );
+                tt::batch_normalize_conv_gradient(eps, gradient_input, means, invstds, sub.get_output(), g, sub.get_gradient_input(), g_grad, b_grad );
        }
        const tensor& get_layer_params() const { return params; }
@@ -679,9 +758,9 @@ namespace dlib
        friend void serialize(const bn_& item, std::ostream& out)
        {
            if (mode == CONV_MODE)
-                serialize("bn_con", out);
+                serialize("bn_con2", out);
            else // if FC_MODE
-                serialize("bn_fc", out);
+                serialize("bn_fc2", out);
            serialize(item.params, out);
            serialize(item.gamma, out);
            serialize(item.beta, out);
@@ -691,6 +770,11 @@ namespace dlib
            serialize(item.running_variances, out);
            serialize(item.num_updates, out);
            serialize(item.running_stats_window_size, out);
+            serialize(item.learning_rate_multiplier, out);
+            serialize(item.weight_decay_multiplier, out);
+            serialize(item.bias_learning_rate_multiplier, out);
+            serialize(item.bias_weight_decay_multiplier, out);
+            serialize(item.eps, out);
        }
        friend void deserialize(bn_& item, std::istream& in)
@@ -701,12 +785,12 @@ namespace dlib
            {
                if (mode == CONV_MODE) 
                {
-                    if (version != "bn_con")
+                    if (version != "bn_con" && version != "bn_con2")
                        throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::bn_.");
                }
                else // must be in FC_MODE
                {
-                    if (version != "bn_fc")
+                    if (version != "bn_fc" && version != "bn_fc2")
                        throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::bn_.");
                }
            }
@@ -731,16 +815,38 @@ namespace dlib
                // We also need to flip the running_variances around since the previous
                // format saved the inverse standard deviations instead of variances.
-                item.running_variances = 1.0f/squared(mat(item.running_variances)) - tt::BATCH_NORM_EPS;
+                item.running_variances = 1.0f/squared(mat(item.running_variances)) - DEFAULT_BATCH_NORM_EPS;
+            }
+            else if (version == "bn_con2" || version == "bn_fc2")
+            {
+                deserialize(item.learning_rate_multiplier, in);
+                deserialize(item.weight_decay_multiplier, in);
+                deserialize(item.bias_learning_rate_multiplier, in);
+                deserialize(item.bias_weight_decay_multiplier, in);
+                deserialize(item.eps, in);
+            }
+            else
+            {
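+                // Previous versions didn't store these values: the multipliers were implicitly 1 and eps was a fixed constant.
+                // NOTE(review): the bias_* multipliers are not reset in this branch (unlike con_/fc_) -- confirm intended.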
+                item.learning_rate_multiplier = 1;
+                item.weight_decay_multiplier = 1;
+                item.eps = DEFAULT_BATCH_NORM_EPS;
            }
        }
        friend std::ostream& operator<<(std::ostream& out, const bn_& item)
        {
            if (mode == CONV_MODE)
-                out << "bn_con";
+                out << "bn_con  ";
            else
-                out << "bn_fc";
+                out << "bn_fc   ";
+            out << " eps="<<item.eps;
+            out << " learning_rate_mult="<<item.learning_rate_multiplier;
+            out << " weight_decay_mult="<<item.weight_decay_multiplier;
+            out << " bias_learning_rate_mult="<<item.bias_learning_rate_multiplier;
+            out << " bias_weight_decay_mult="<<item.bias_weight_decay_multiplier;
            return out;
        }
@@ -754,6 +860,11 @@ namespace dlib
        resizable_tensor invstds, running_variances;
        unsigned long num_updates;
        unsigned long running_stats_window_size;
+        double learning_rate_multiplier;
+        double weight_decay_multiplier;
+        double bias_learning_rate_multiplier;
+        double bias_weight_decay_multiplier;
+        double eps;
    };
    template <typename SUBNET>
@@ -784,11 +895,24 @@ namespace dlib
        static_assert(num_outputs_ > 0, "The number of outputs from a fc_ layer must be > 0");
    public:
-        fc_() : num_outputs(num_outputs_), num_inputs(0)
+        fc_(num_fc_outputs o) : num_outputs(o.num_outputs), num_inputs(0),
-        {
+            learning_rate_multiplier(1),
-        }
+            weight_decay_multiplier(1),
+            bias_learning_rate_multiplier(1),
+            bias_weight_decay_multiplier(0)
+        {}
+        fc_() : fc_(num_fc_outputs(num_outputs_)) {}
-        fc_(num_fc_outputs o) : num_outputs(o.num_outputs), num_inputs(0) {}
+        double get_learning_rate_multiplier () const  { return learning_rate_multiplier; }
+        double get_weight_decay_multiplier () const   { return weight_decay_multiplier; }
+        void set_learning_rate_multiplier(double val) { learning_rate_multiplier = val; }
+        void set_weight_decay_multiplier(double val)  { weight_decay_multiplier  = val; }
+        double get_bias_learning_rate_multiplier () const  { return bias_learning_rate_multiplier; }
+        double get_bias_weight_decay_multiplier () const   { return bias_weight_decay_multiplier; }
+        void set_bias_learning_rate_multiplier(double val) { bias_learning_rate_multiplier = val; }
+        void set_bias_weight_decay_multiplier(double val)  { bias_weight_decay_multiplier  = val; }
        unsigned long get_num_outputs (
        ) const { return num_outputs; }
@@ -834,6 +958,9 @@ namespace dlib
        template <typename SUBNET>
        void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
+        {
+            // no point computing the parameter gradients if they won't be used.
+            if (learning_rate_multiplier != 0)
            {
                // compute the gradient of the weight parameters.  
                auto pw = weights(params_grad, 0);
@@ -845,6 +972,7 @@ namespace dlib
                    auto pb = biases(params_grad, weights.size());
                    tt::assign_bias_gradient(pb, gradient_input);
                }
+            }
            // compute the gradient for the data
            auto w = weights(params, 0);
@@ -856,20 +984,24 @@ namespace dlib
        friend void serialize(const fc_& item, std::ostream& out)
        {
-            serialize("fc_", out);
+            serialize("fc_2", out);
            serialize(item.num_outputs, out);
            serialize(item.num_inputs, out);
            serialize(item.params, out);
            serialize(item.weights, out);
            serialize(item.biases, out);
            serialize((int)bias_mode, out);
+            serialize(item.learning_rate_multiplier, out);
+            serialize(item.weight_decay_multiplier, out);
+            serialize(item.bias_learning_rate_multiplier, out);
+            serialize(item.bias_weight_decay_multiplier, out);
        }
        friend void deserialize(fc_& item, std::istream& in)
        {
            std::string version;
            deserialize(version, in);
-            if (version != "fc_")
+            if (version != "fc_" && version != "fc_2")
                throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::fc_.");
            deserialize(item.num_outputs, in);
@@ -880,6 +1012,22 @@ namespace dlib
            int bmode = 0;
            deserialize(bmode, in);
            if (bias_mode != (fc_bias_mode)bmode) throw serialization_error("Wrong fc_bias_mode found while deserializing dlib::fc_");
+            if (version == "fc_2")
+            {
+                deserialize(item.learning_rate_multiplier, in);
+                deserialize(item.weight_decay_multiplier, in);
+                deserialize(item.bias_learning_rate_multiplier, in);
+                deserialize(item.bias_weight_decay_multiplier, in);
+            }
+            else
+            {
+                // Previous versions didn't have these parameters, so they were
+                // implicitly 1.
+                item.learning_rate_multiplier = 1;
+                item.weight_decay_multiplier = 1;
+                item.bias_learning_rate_multiplier = 1;
+                item.bias_weight_decay_multiplier = 1;
+            }
        }
        friend std::ostream& operator<<(std::ostream& out, const fc_& item)
@@ -889,12 +1037,18 @@ namespace dlib
                out << "fc\t ("
                    << "num_outputs="<<item.num_outputs
                    << ")";
+                out << " learning_rate_mult="<<item.learning_rate_multiplier;
+                out << " weight_decay_mult="<<item.weight_decay_multiplier;
+                out << " bias_learning_rate_mult="<<item.bias_learning_rate_multiplier;
+                out << " bias_weight_decay_mult="<<item.bias_weight_decay_multiplier;
            }
            else
            {
                out << "fc_no_bias ("
                    << "num_outputs="<<item.num_outputs
                    << ")";
+                out << " learning_rate_mult="<<item.learning_rate_multiplier;
+                out << " weight_decay_mult="<<item.weight_decay_multiplier;
            }
            return out;
        }
@@ -905,6 +1059,10 @@ namespace dlib
        unsigned long num_inputs;
        resizable_tensor params;
        alias_tensor weights, biases;
+        double learning_rate_multiplier;
+        double weight_decay_multiplier;
+        double bias_learning_rate_multiplier;
+        double bias_weight_decay_multiplier;
    };
    template <
@@ -1143,7 +1301,7 @@ namespace dlib
            auto sg = gamma(temp,0);
            auto sb = beta(temp,gamma.size());
-            g = pointwise_multiply(mat(sg), 1.0f/sqrt(mat(item.running_variances)+tt::BATCH_NORM_EPS));
+            g = pointwise_multiply(mat(sg), 1.0f/sqrt(mat(item.running_variances)+item.get_eps()));
            b = mat(sb) - pointwise_multiply(mat(g), mat(item.running_means));
        }
@@ -1223,7 +1381,7 @@ namespace dlib
        {
            std::string version;
            deserialize(version, in);
-            if (version == "bn_con")
+            if (version == "bn_con" || version == "bn_con2")
            {
                // Since we can build an affine_ from a bn_ we check if that's what is in
                // the stream and if so then just convert it right here.
@@ -1233,7 +1391,7 @@ namespace dlib
                item = temp;
                return;
            }
-            else if (version == "bn_fc")
+            else if (version == "bn_fc" || version == "bn_fc2")
            {
                // Since we can build an affine_ from a bn_ we check if that's what is in
                // the stream and if so then just convert it right here.
@@ -1289,8 +1447,13 @@ namespace dlib
        template <typename SUBNET>
        void forward(const SUBNET& sub, resizable_tensor& output)
        {
-            output.copy_size(sub.get_output());
+            auto&& t1 = sub.get_output();
-            tt::add(output, sub.get_output(), layer<tag>(sub).get_output());
+            auto&& t2 = layer<tag>(sub).get_output();
+            output.set_size(std::max(t1.num_samples(),t2.num_samples()),
+                            std::max(t1.k(),t2.k()),
+                            std::max(t1.nr(),t2.nr()),
+                            std::max(t1.nc(),t2.nc()));
+            tt::add(output, t1, t2);
        }
        template <typename SUBNET>

--- a/dlib/dnn/layers_abstract.h
+++ b/dlib/dnn/layers_abstract.h
@@ -123,6 +123,16 @@ namespace dlib
                      allow dlib to make some layers execute in-place and therefore run a
                      little faster and use less memory.  Do not implement forward() and
                      backward().
+                It should also be noted that layers may define additional layer-specific
+                fields and the solvers can use these fields as they see fit.  For example,
+                some layers define get_learning_rate_multiplier() and
+                get_weight_decay_multiplier() methods.  The solvers that come with dlib
+                look at these methods, if they exist, and adjust the learning rate or
+                weight decay for that layer according to the multiplier.  Therefore, you
+                can add these methods to your layer types if you want, or even define new
+                fields and new solvers that use those fields in some way.  
        !*/
    public:
@@ -367,6 +377,10 @@ namespace dlib
            ensures
                - #get_num_outputs() == num_outputs
                - #get_bias_mode() == bias_mode 
+                - #get_learning_rate_multiplier()      == 1
+                - #get_weight_decay_multiplier()       == 1
+                - #get_bias_learning_rate_multiplier() == 1
+                - #get_bias_weight_decay_multiplier()  == 0
        !*/
        unsigned long get_num_outputs (
@@ -389,6 +403,82 @@ namespace dlib
                  is added to each of the outputs of this layer. 
        !*/
+        double get_learning_rate_multiplier(
+        ) const;  
+        /*!
+            ensures
+                - returns a multiplier number.  The interpretation is that this object is
+                  requesting that the learning rate used to optimize its parameters be
+                  multiplied by get_learning_rate_multiplier().
+        !*/
+        double get_weight_decay_multiplier(
+        ) const; 
+        /*!
+            ensures
+                - returns a multiplier number.  The interpretation is that this object is
+                  requesting that the weight decay used to optimize its parameters be
+                  multiplied by get_weight_decay_multiplier().
+        !*/
+        void set_learning_rate_multiplier(
+            double val
+        );
+        /*!
+            requires
+                - val >= 0
+            ensures
+                - #get_learning_rate_multiplier() == val
+        !*/
+        void set_weight_decay_multiplier(
+            double val
+        ); 
+        /*!
+            requires
+                - val >= 0
+            ensures
+                - #get_weight_decay_multiplier() == val
+        !*/
+        double get_bias_learning_rate_multiplier(
+        ) const; 
+        /*!
+            ensures
+                - returns a multiplier number.  The interpretation is that this object is
+                  requesting that the learning rate used to optimize its bias parameters be
+                  multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier().
+        !*/
+        double get_bias_weight_decay_multiplier(
+        ) const; 
+        /*!
+            ensures
+                - returns a multiplier number.  The interpretation is that this object is
+                  requesting that the weight decay used to optimize its bias parameters be
+                  multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier().
+        !*/
+        void set_bias_learning_rate_multiplier(
+            double val
+        ); 
+        /*!
+            requires
+                - val >= 0
+            ensures
+                - #get_bias_learning_rate_multiplier() == val
+        !*/
+        void set_bias_weight_decay_multiplier(
+            double val
+        ); 
+        /*!
+            requires
+                - val >= 0
+            ensures
+                - #get_bias_weight_decay_multiplier() == val
+        !*/
        template <typename SUBNET> void setup (const SUBNET& sub);
        template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
        template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
@@ -458,6 +548,10 @@ namespace dlib
                - #stride_x() == _stride_x
                - #padding_y() == _padding_y
                - #padding_x() == _padding_x
+                - #get_learning_rate_multiplier()      == 1
+                - #get_weight_decay_multiplier()       == 1
+                - #get_bias_learning_rate_multiplier() == 1
+                - #get_bias_weight_decay_multiplier()  == 0
        !*/
        long num_filters(
@@ -517,6 +611,82 @@ namespace dlib
                  sides of the image.
        !*/
+        double get_learning_rate_multiplier(
+        ) const;  
+        /*!
+            ensures
+                - returns a multiplier number.  The interpretation is that this object is
+                  requesting that the learning rate used to optimize its parameters be
+                  multiplied by get_learning_rate_multiplier().
+        !*/
+        double get_weight_decay_multiplier(
+        ) const; 
+        /*!
+            ensures
+                - returns a multiplier number.  The interpretation is that this object is
+                  requesting that the weight decay used to optimize its parameters be
+                  multiplied by get_weight_decay_multiplier().
+        !*/
+        void set_learning_rate_multiplier(
+            double val
+        );
+        /*!
+            requires
+                - val >= 0
+            ensures
+                - #get_learning_rate_multiplier() == val
+        !*/
+        void set_weight_decay_multiplier(
+            double val
+        ); 
+        /*!
+            requires
+                - val >= 0
+            ensures
+                - #get_weight_decay_multiplier() == val
+        !*/
+        double get_bias_learning_rate_multiplier(
+        ) const; 
+        /*!
+            ensures
+                - returns a multiplier number.  The interpretation is that this object is
+                  requesting that the learning rate used to optimize its bias parameters be
+                  multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier().
+        !*/
+        double get_bias_weight_decay_multiplier(
+        ) const; 
+        /*!
+            ensures
+                - returns a multiplier number.  The interpretation is that this object is
+                  requesting that the weight decay used to optimize its bias parameters be
+                  multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier().
+        !*/
+        void set_bias_learning_rate_multiplier(
+            double val
+        ); 
+        /*!
+            requires
+                - val >= 0
+            ensures
+                - #get_bias_learning_rate_multiplier() == val
+        !*/
+        void set_bias_weight_decay_multiplier(
+            double val
+        ); 
+        /*!
+            requires
+                - val >= 0
+            ensures
+                - #get_bias_weight_decay_multiplier() == val
+        !*/
        template <typename SUBNET> void setup (const SUBNET& sub);
        template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
        template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
@@ -648,6 +818,8 @@ namespace dlib
        FC_MODE = 1    // fully connected mode
    };
+    const double DEFAULT_BATCH_NORM_EPS = 0.00001;
    template <
        layer_mode mode
        >
@@ -684,16 +856,29 @@ namespace dlib
        /*!
            ensures
                - #get_mode() == mode
-                - get_running_stats_window_size() == 1000
+                - #get_running_stats_window_size()      == 1000
+                - #get_learning_rate_multiplier()       == 1
+                - #get_weight_decay_multiplier()        == 0
+                - #get_bias_learning_rate_multiplier()  == 1
+                - #get_bias_weight_decay_multiplier()   == 1
+                - #get_eps() == DEFAULT_BATCH_NORM_EPS
        !*/
        explicit bn_(
-            unsigned long window_size
+            unsigned long window_size,
+            double eps = DEFAULT_BATCH_NORM_EPS
        );
        /*!
+            requires
+                - eps > 0
            ensures
                - #get_mode() == mode 
-                - get_running_stats_window_size() == window_size
+                - #get_running_stats_window_size()     == window_size
+                - #get_learning_rate_multiplier()      == 1
+                - #get_weight_decay_multiplier()       == 0
+                - #get_bias_learning_rate_multiplier() == 1
+                - #get_bias_weight_decay_multiplier()  == 1
+                - #get_eps() == eps
        !*/
        layer_mode get_mode(
@@ -712,6 +897,15 @@ namespace dlib
                  normalization after a convolutional layer you should use CONV_MODE.
        !*/
+        double get_eps(
+        ) const; 
+        /*!
+            ensures
+                - When doing batch normalization, we are dividing by the standard
+                  deviation.  The epsilon value returned by this function is added to the
+                  variance to prevent that division from dividing by zero.
+        !*/
        unsigned long get_running_stats_window_size (
        ) const; 
        /*!
@@ -725,6 +919,82 @@ namespace dlib
                  the running average.
        !*/
+        double get_learning_rate_multiplier(
+        ) const;  
+        /*!
+            ensures
+                - returns a multiplier number.  The interpretation is that this object is
+                  requesting that the learning rate used to optimize its parameters be
+                  multiplied by get_learning_rate_multiplier().
+        !*/
+        double get_weight_decay_multiplier(
+        ) const; 
+        /*!
+            ensures
+                - returns a multiplier number.  The interpretation is that this object is
+                  requesting that the weight decay used to optimize its parameters be
+                  multiplied by get_weight_decay_multiplier().
+        !*/
+        void set_learning_rate_multiplier(
+            double val
+        );
+        /*!
+            requires
+                - val >= 0
+            ensures
+                - #get_learning_rate_multiplier() == val
+        !*/
+        void set_weight_decay_multiplier(
+            double val
+        ); 
+        /*!
+            requires
+                - val >= 0
+            ensures
+                - #get_weight_decay_multiplier() == val
+        !*/
+        double get_bias_learning_rate_multiplier(
+        ) const; 
+        /*!
+            ensures
+                - returns a multiplier number.  The interpretation is that this object is
+                  requesting that the learning rate used to optimize its bias parameters be
+                  multiplied by get_learning_rate_multiplier()*get_bias_learning_rate_multiplier().
+        !*/
+        double get_bias_weight_decay_multiplier(
+        ) const; 
+        /*!
+            ensures
+                - returns a multiplier number.  The interpretation is that this object is
+                  requesting that the weight decay used to optimize its bias parameters be
+                  multiplied by get_weight_decay_multiplier()*get_bias_weight_decay_multiplier().
+        !*/
+        void set_bias_learning_rate_multiplier(
+            double val
+        ); 
+        /*!
+            requires
+                - val >= 0
+            ensures
+                - #get_bias_learning_rate_multiplier() == val
+        !*/
+        void set_bias_weight_decay_multiplier(
+            double val
+        ); 
+        /*!
+            requires
+                - val >= 0
+            ensures
+                - #get_bias_weight_decay_multiplier() == val
+        !*/
        template <typename SUBNET> void setup (const SUBNET& sub);
        template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
        template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
@@ -1330,7 +1600,13 @@ namespace dlib
                what layer to add to the output of the previous layer.  The result of this
                addition is output by add_prev_.  Finally, the addition happens pointwise
                according to 4D tensor arithmetic.  If the dimensions don't match then
-                missing elements are presumed to be equal to 0.
+                missing elements are presumed to be equal to 0.  Moreover, each dimension
+                of the output tensor is equal to the maximum dimension of either of the
+                inputs.  That is, if the tensors A and B are being added to produce C then:
+                    - C.num_samples() == max(A.num_samples(), B.num_samples())
+                    - C.k()  == max(A.k(), B.k())
+                    - C.nr() == max(A.nr(), B.nr())
+                    - C.nc() == max(A.nc(), B.nc())
        !*/
    public:

--- a/dlib/dnn/solvers.h
+++ b/dlib/dnn/solvers.h
@@ -6,6 +6,7 @@
 #include "solvers_abstract.h"
 #include "tensor.h"
 #include <iostream>
+#include "layers.h"
 namespace dlib
 {
@@ -49,10 +50,53 @@ namespace dlib
                v = 0;
            }
-            //perform: v = momentum*mat(v) - weight_decay*learning_rate*mat(params) - learning_rate*mat(params_grad);
+            const double lr = learning_rate*get_learning_rate_multiplier(l);
-            tt::affine_transform(v, v, params, params_grad, 
+            const double wd = weight_decay*get_weight_decay_multiplier(l);
-                               momentum, -weight_decay*learning_rate, -learning_rate, 0);
+            //perform: v = momentum*mat(v) - wd*lr*mat(params) - lr*mat(params_grad);
+            tt::affine_transform(v, v, params, params_grad, momentum, -wd*lr, -lr);
+            return v;
+        }
+        template <unsigned long N>
+        const tensor& operator() (
+            const float learning_rate,
+            const fc_<N,FC_HAS_BIAS>& l,
+            const tensor& params_grad
+        )
+        {
+            update_considering_bias(learning_rate, l, params_grad, l.get_num_outputs());
+            return v;
+        }
+        template <
+            long _num_filters,
+            long _nr,
+            long _nc,
+            int _stride_y,
+            int _stride_x,
+            int _padding_y,
+            int _padding_x
+            >
+        const tensor& operator() (
+            const float learning_rate,
+            const con_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l,
+            const tensor& params_grad
+        )
+        {
+            update_considering_bias(learning_rate, l, params_grad, l.num_filters());
+            return v;
+        }
+        template < layer_mode mode >
+        const tensor& operator() (
+            const float learning_rate,
+            const bn_<mode>& l,
+            const tensor& params_grad
+        )
+        {
+            update_considering_bias(learning_rate, l, params_grad, params_grad.size()/2);
            return v;
        }
@@ -76,9 +120,49 @@ namespace dlib
        }
    private:
+        template <typename layer_type> 
+        void update_considering_bias(
+            const float learning_rate,
+            const layer_type& l,
+            const tensor& params_grad,
+            unsigned long bias_offset
+        )
+        {
+            const tensor& params = l.get_layer_params();
+            DLIB_CASSERT(params.size() != 0,"");
+            if (v.size() == 0)
+            {
+                v.copy_size(params_grad);
+                v = 0;
+            }
+            double lr = learning_rate*get_learning_rate_multiplier(l);
+            double wd = weight_decay*get_weight_decay_multiplier(l);
+            //perform: v = momentum*mat(v) - wd*lr*mat(params) - lr*mat(params_grad);
+            if (l.get_bias_learning_rate_multiplier() == 1 && l.get_bias_weight_decay_multiplier() == 1)
+            {
+                tt::affine_transform(v, v, params, params_grad, momentum, -wd*lr, -lr);
+            }
+            else
+            {
+                tt::affine_transform_range(0, bias_offset, v, v, params, params_grad, momentum, -wd*lr, -lr);
+                // now update the biases but apply their multipliers
+                lr *= l.get_bias_learning_rate_multiplier();
+                wd *= l.get_bias_weight_decay_multiplier();
+                tt::affine_transform_range(bias_offset, v.size(), v, v, params, params_grad, momentum, -wd*lr, -lr);
+            }
+        }
        resizable_tensor v;
        float weight_decay;
        float momentum;
    };
 // ----------------------------------------------------------------------------------------
@@ -132,11 +216,57 @@ namespace dlib
            ++t;
-            tt::compute_adam_update(s, m, v, t, learning_rate, weight_decay, momentum1, momentum2, params, params_grad);
+            tt::compute_adam_update(0, params.size(), s, m, v, t,
+                learning_rate*get_learning_rate_multiplier(l),
+                weight_decay*get_weight_decay_multiplier(l), 
+                momentum1, momentum2, params, params_grad);
            return s;
        }
+        template <unsigned long N>
+        const tensor& operator() (
+            const float learning_rate,
+            const fc_<N,FC_HAS_BIAS>& l,
+            const tensor& params_grad
+        )
+        {
+            update_considering_bias(learning_rate, l, params_grad, l.get_num_outputs());
+            return s;
+        }
+        template <
+            long _num_filters,
+            long _nr,
+            long _nc,
+            int _stride_y,
+            int _stride_x,
+            int _padding_y,
+            int _padding_x
+            >
+        const tensor& operator() (
+            const float learning_rate,
+            const con_<_num_filters,_nr,_nc,_stride_y,_stride_x,_padding_y,_padding_x>& l,
+            const tensor& params_grad
+        )
+        {
+            update_considering_bias(learning_rate, l, params_grad, l.num_filters());
+            return s;
+        }
+        template < layer_mode mode >
+        const tensor& operator() (
+            const float learning_rate,
+            const bn_<mode>& l,
+            const tensor& params_grad
+        )
+        {
+            update_considering_bias(learning_rate, l, params_grad, params_grad.size()/2);
+            return s;
+        }
        friend void serialize(const adam& item, std::ostream& out)
        {
            serialize("adam2", out);
@@ -165,6 +295,49 @@ namespace dlib
        }
    private:
+        template <typename layer_type> 
+        void update_considering_bias(
+            const float learning_rate,
+            const layer_type& l,
+            const tensor& params_grad,
+            unsigned long bias_offset
+        )
+        {
+            const tensor& params = l.get_layer_params();
+            DLIB_CASSERT(params.size() != 0,"");
+            if (v.size() == 0)
+            {
+                m.copy_size(params_grad);
+                m = 0;
+                v.copy_size(params_grad);
+                v = 0;
+                s.copy_size(params_grad);
+            }
+            ++t;
+            if (l.get_bias_learning_rate_multiplier() == 1 && l.get_bias_weight_decay_multiplier() == 1)
+            {
+                tt::compute_adam_update(0, params.size(), s, m, v, t,
+                    learning_rate*get_learning_rate_multiplier(l),
+                    weight_decay*get_weight_decay_multiplier(l), 
+                    momentum1, momentum2, params, params_grad);
+            }
+            else
+            {
+                tt::compute_adam_update(0, bias_offset, s, m, v, t,
+                    learning_rate*get_learning_rate_multiplier(l),
+                    weight_decay*get_weight_decay_multiplier(l), 
+                    momentum1, momentum2, params, params_grad);
+                tt::compute_adam_update(bias_offset, params.size(), s, m, v, t,
+                    learning_rate*get_learning_rate_multiplier(l)*l.get_bias_learning_rate_multiplier(),
+                    weight_decay*get_weight_decay_multiplier(l)*l.get_bias_weight_decay_multiplier(), 
+                    momentum1, momentum2, params, params_grad);
+            }
+        }
        resizable_tensor m;
        resizable_tensor v;
        resizable_tensor s;

--- a/dlib/dnn/solvers_abstract.h
+++ b/dlib/dnn/solvers_abstract.h
@@ -78,6 +78,15 @@ namespace dlib
                    V = momentum*V - weight_decay*learning_rate*l.get_layer_params() - learning_rate*params_grad;
                Here V is a momentum term that is remembered by the solver from one
                invocation of operator() to the next.  
+                Note that the actual learning rate and weight decay used by the solver are
+                multiplied by the per layer multipliers.  That is, the solver will call
+                get_learning_rate_multiplier(l) and get_weight_decay_multiplier(l) and
+                multiply these values with the nominal learning rate and weight decay,
+                respectively, to determine the values it will use during each step.  It is
+                also overloaded to allow additional learning rate and weight decay
+                multipliers to be applied to fc_, con_, and bn_ bias parameters.
        !*/
    public:
@@ -123,6 +132,15 @@ namespace dlib
                paper:
                    Kingma, Diederik P., and Jimmy Ba. "Adam: A method for stochastic
                    optimization." International Conference on Learning Representations. 2015.
+                Note that the actual learning rate and weight decay used by the solver are
+                multiplied by the per layer multipliers.  That is, the solver will call
+                get_learning_rate_multiplier(l) and get_weight_decay_multiplier(l) and
+                multiply these values with the nominal learning rate and weight decay,
+                respectively, to determine the values it will use during each step.  It is
+                also overloaded to allow additional learning rate and weight decay
+                multipliers to be applied to fc_, con_, and bn_ bias parameters.
        !*/
    public:

--- a/dlib/dnn/tensor_tools.cpp
+++ b/dlib/dnn/tensor_tools.cpp
@@ -240,6 +240,42 @@ namespace dlib { namespace tt
 #endif
    }
+    void affine_transform_range(
+        size_t begin,
+        size_t end,
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2,
+        const tensor& src3,
+        const float A,
+        const float B,
+        const float C
+    )
+    {
+#ifdef DLIB_USE_CUDA
+        cuda::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C);
+#else
+        cpu::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C);
+#endif
+    }
+    void affine_transform(
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2,
+        const tensor& src3,
+        const float A,
+        const float B,
+        const float C
+    )
+    {
+#ifdef DLIB_USE_CUDA
+        cuda::affine_transform_range(0,dest.size(),dest,src1,src2,src3,A,B,C);
+#else
+        cpu::affine_transform_range(0,dest.size(),dest,src1,src2,src3,A,B,C);
+#endif
+    }
 // ----------------------------------------------------------------------------------------
    void affine_transform(
@@ -275,6 +311,8 @@ namespace dlib { namespace tt
 // ----------------------------------------------------------------------------------------
    void compute_adam_update (
+        size_t begin,
+        size_t end,
        tensor& s,
        tensor& m,
        tensor& v,
@@ -288,10 +326,10 @@ namespace dlib { namespace tt
    )
    {
 #ifdef DLIB_USE_CUDA
-        cuda::compute_adam_update(s, m, v, t, learning_rate, weight_decay, momentum1,
+        cuda::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1,
            momentum2, params, params_grad);
 #else
-        cpu::compute_adam_update(s, m, v, t, learning_rate, weight_decay, momentum1,
+        cpu::compute_adam_update(begin, end, s, m, v, t, learning_rate, weight_decay, momentum1,
            momentum2, params, params_grad);
 #endif
    }
@@ -299,6 +337,7 @@ namespace dlib { namespace tt
 // ----------------------------------------------------------------------------------------
    void batch_normalize_inference (
+        const double eps,
        resizable_tensor& dest,
        const tensor& src,
        const tensor& gamma, 
@@ -308,13 +347,14 @@ namespace dlib { namespace tt
    )
    {
 #ifdef DLIB_USE_CUDA
-        cuda::batch_normalize_inference(dest,src,gamma,beta,running_means,running_variances);
+        cuda::batch_normalize_inference(eps,dest,src,gamma,beta,running_means,running_variances);
 #else
-        cpu::batch_normalize_inference(dest,src,gamma,beta,running_means,running_variances);
+        cpu::batch_normalize_inference(eps,dest,src,gamma,beta,running_means,running_variances);
 #endif
    }
    void batch_normalize (
+        const double eps,
        resizable_tensor& dest,
        resizable_tensor& means,
        resizable_tensor& vars,
@@ -327,13 +367,14 @@ namespace dlib { namespace tt
    )
    {
 #ifdef DLIB_USE_CUDA
-        cuda::batch_normalize(dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
+        cuda::batch_normalize(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
 #else
-        cpu::batch_normalize(dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
+        cpu::batch_normalize(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
 #endif
    }
    void batch_normalize_gradient (
+        const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
@@ -346,15 +387,16 @@ namespace dlib { namespace tt
    {
 #ifdef DLIB_USE_CUDA
-        cuda::batch_normalize_gradient(gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
+        cuda::batch_normalize_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
 #else
-        cpu::batch_normalize_gradient(gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
+        cpu::batch_normalize_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
 #endif
    }
 // ----------------------------------------------------------------------------------------
    void batch_normalize_conv_inference (
+        const double eps,
        resizable_tensor& dest,
        const tensor& src,
        const tensor& gamma, 
@@ -364,13 +406,14 @@ namespace dlib { namespace tt
    )
    {
 #ifdef DLIB_USE_CUDA
-        cuda::batch_normalize_conv_inference(dest,src,gamma,beta,running_means,running_variances);
+        cuda::batch_normalize_conv_inference(eps,dest,src,gamma,beta,running_means,running_variances);
 #else
-        cpu::batch_normalize_conv_inference(dest,src,gamma,beta,running_means,running_variances);
+        cpu::batch_normalize_conv_inference(eps,dest,src,gamma,beta,running_means,running_variances);
 #endif
    }
    void batch_normalize_conv (
+        const double eps,
        resizable_tensor& dest,
        resizable_tensor& means,
        resizable_tensor& vars,
@@ -383,13 +426,14 @@ namespace dlib { namespace tt
    )
    {
 #ifdef DLIB_USE_CUDA
-        cuda::batch_normalize_conv(dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
+        cuda::batch_normalize_conv(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
 #else
-        cpu::batch_normalize_conv(dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
+        cpu::batch_normalize_conv(eps,dest,means,vars,averaging_factor,running_means,running_variances,src,gamma,beta);
 #endif
    }
    void batch_normalize_conv_gradient (
+        const double eps,
        const tensor& gradient_input,
        const tensor& means,
        const tensor& invstds,
@@ -402,9 +446,9 @@ namespace dlib { namespace tt
    {
 #ifdef DLIB_USE_CUDA
-        cuda::batch_normalize_conv_gradient(gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
+        cuda::batch_normalize_conv_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
 #else
-        cpu::batch_normalize_conv_gradient(gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
+        cpu::batch_normalize_conv_gradient(eps,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
 #endif
    }

--- a/dlib/dnn/tensor_tools.h
+++ b/dlib/dnn/tensor_tools.h
@@ -229,13 +229,58 @@ namespace dlib { namespace tt
        const float D
    );
    /*!
-        requires - dest.size()==src1.size()
+        requires 
+            - dest.size()==src1.size()
            - dest.size()==src2.size()
            - dest.size()==src3.size()
        ensures
            - #dest == A*src1 + B*src2 + C*src3 + D
    !*/
+    void affine_transform(
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2,
+        const tensor& src3,
+        const float A,
+        const float B,
+        const float C
+    );
+    /*!
+        requires 
+            - dest.size()==src1.size()
+            - dest.size()==src2.size()
+            - dest.size()==src3.size()
+        ensures
+            - #dest == A*src1 + B*src2 + C*src3
+    !*/
+    void affine_transform_range(
+        size_t begin,
+        size_t end,
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2,
+        const tensor& src3,
+        const float A,
+        const float B,
+        const float C
+    );
+    /*!
+        requires 
+            - dest.size()==src1.size()
+            - dest.size()==src2.size()
+            - dest.size()==src3.size()
+            - begin <= end <= dest.size()
+        ensures
+            - This function operates much like
+              affine_transform(dest,src1,src2,src3,A,B,C,0), except that it runs over only
+              the half open range [begin,end) rather than processing the entire tensor.
+              Specifically, it does this:
+                - for i in the range [begin, end):
+                    - #dest.host()[i] == A*src1.host()[i] + B*src2.host()[i] + C*src3.host()[i]
+    !*/
 // ----------------------------------------------------------------------------------------
    void affine_transform(
@@ -290,6 +335,8 @@ namespace dlib { namespace tt
 // ----------------------------------------------------------------------------------------
    void compute_adam_update (
+        size_t begin,
+        size_t end,
        tensor& s,
        tensor& m,
        tensor& v,
@@ -309,19 +356,22 @@ namespace dlib { namespace tt
            - weight_decay >= 0
            - 0 <= momentum1 < 1
            - 0 <= momentum2 < 1
+            - begin <= end <= params.size()
        ensures
            - This function implements the ADAM parameter update method described in the paper:
                Kingma, Diederik P., and Jimmy Ba. "Adam: A method for stochastic
                optimization." International Conference on Learning Representations. 2015.
              Specifically, it implements the method shown as Algorithm 1.
            - #s is the update vector that should be added to the parameters.
+            - The function only operates in the half open range [begin,end) of the memory
+              blocks of each tensor.  E.g. to make this function run on the entire tensor
+              set begin to 0 and end to params.size().
    !*/
 // ----------------------------------------------------------------------------------------
-    const double BATCH_NORM_EPS = 0.00001;
    void batch_normalize_inference (
+        const double eps,
        resizable_tensor& dest,
        const tensor& src,
        const tensor& gamma, 
@@ -331,6 +381,7 @@ namespace dlib { namespace tt
    );
    /*!
        requires
+            - eps > 0
            - gamma.num_samples() == 1 
            - gamma.nr() == src.nr() 
            - gamma.nc() == src.nc() 
@@ -342,11 +393,12 @@ namespace dlib { namespace tt
            - Linearly transforms src as a call to batch_normalize() would if src had means
              and variances as given by running_means and running_variances.  That is, this
              function performs: 
-                dest = gamma*(src-running_means)/sqrt(running_variances+BATCH_NORM_EPS) + beta
+                dest = gamma*(src-running_means)/sqrt(running_variances+eps) + beta
              Note that it does it in a pointwise fashion over the samples in src.
    !*/
    void batch_normalize (
+        const double eps,
        resizable_tensor& dest,
        resizable_tensor& means,
        resizable_tensor& invstds,
@@ -359,6 +411,7 @@ namespace dlib { namespace tt
    );
    /*!
        requires
+            - eps > 0
            - src.num_samples() > 1
            - gamma.num_samples() == 1
            - beta.num_samples() == 1
@@ -384,6 +437,7 @@ namespace dlib { namespace tt
    !*/
    void batch_normalize_gradient (
+        const double eps,
        const tensor& gradient_input,
        const tensor& means,
        const tensor& invstds,
@@ -395,8 +449,9 @@ namespace dlib { namespace tt
    );
    /*!
        requires
+            - eps > 0
            - invstds and means should be the output of a call to
-              batch_normalize(dest,means,invstds,src,gamma,beta)
+              batch_normalize(eps,dest,means,invstds,src,gamma,beta)
            - have_same_dimensions(gradient_input, src) == true
            - have_same_dimensions(src, src_grad) == true
            - src.num_samples() > 1
@@ -410,7 +465,7 @@ namespace dlib { namespace tt
            - have_same_dimensions(invstds, gamma) == true
        ensures
            - Let f(src,gamma,beta) == dot(gradient_input, dest output of
-              batch_normalize(dest,means,invstds,src,gamma,beta))
+              batch_normalize(eps,dest,means,invstds,src,gamma,beta))
            - Adds the gradient of f() with respect to src to #src_grad.
            - Assigns the gradient of f() with respect to gamma to #gamma_grad.
            - Assigns the gradient of f() with respect to beta to #beta_grad.
@@ -419,6 +474,7 @@ namespace dlib { namespace tt
 // ----------------------------------------------------------------------------------------
    void batch_normalize_conv_inference (
+        const double eps,
        resizable_tensor& dest,
        const tensor& src,
        const tensor& gamma, 
@@ -428,6 +484,7 @@ namespace dlib { namespace tt
    );
    /*!
        requires
+            - eps > 0
            - gamma.num_samples() == 1 
            - gamma.nr() == 1 
            - gamma.nc() == 1 
@@ -439,12 +496,13 @@ namespace dlib { namespace tt
            - Linearly transforms src as a call to batch_normalize_conv() would if src had
              means and variances as given by running_means and running_variances.  That
              is, this function performs: 
-                dest = gamma*(src-running_means)/sqrt(running_variances+BATCH_NORM_EPS) + beta
+                dest = gamma*(src-running_means)/sqrt(running_variances+eps) + beta
              Note that it does this in a pointwise fashion over the samples, rows, and
              columns in src.
    !*/
    void batch_normalize_conv (
+        const double eps,
        resizable_tensor& dest,
        resizable_tensor& means,
        resizable_tensor& invstds,
@@ -457,6 +515,7 @@ namespace dlib { namespace tt
    );
    /*!
        requires
+            - eps > 0
            - src.num_samples() > 1
            - gamma.num_samples()==gamma.nr()==gamma.nc() == 1
            - beta.num_samples() ==beta.nr() ==beta.nc() == 1
@@ -478,6 +537,7 @@ namespace dlib { namespace tt
    !*/
    void batch_normalize_conv_gradient (
+        const double eps,
        const tensor& gradient_input,
        const tensor& means,
        const tensor& invstds,
@@ -489,8 +549,9 @@ namespace dlib { namespace tt
    );
    /*!
        requires
+            - eps > 0
            - invstds and means should be the output of a call to
-              batch_normalize_conv(dest,means,invstds,src,gamma,beta)
+              batch_normalize_conv(eps,dest,means,invstds,src,gamma,beta)
            - have_same_dimensions(gradient_input, src) == true
            - have_same_dimensions(src, src_grad) == true
            - src.num_samples() > 1
@@ -502,7 +563,7 @@ namespace dlib { namespace tt
            - have_same_dimensions(invstds, gamma) == true
        ensures
            - Let f(src,gamma,beta) == dot(gradient_input, dest output of
-              batch_normalize_conv(dest,means,invstds,src,gamma,beta))
+              batch_normalize_conv(eps,dest,means,invstds,src,gamma,beta))
            - Adds the gradient of f() with respect to src to #src_grad.
            - Assigns the gradient of f() with respect to gamma to #gamma_grad.
            - Assigns the gradient of f() with respect to beta to #beta_grad.

--- a/dlib/dnn/trainer.h
+++ b/dlib/dnn/trainer.h
@@ -526,8 +526,7 @@ namespace dlib
            label_type pick_which_run_update;
            job_t next_job;
-            std::vector<std::future<double>> losses(devices.size());
+            std::vector<dlib::future<double>> losses(devices.size());
-            std::vector<std::future<void>> update_futs(devices.size());
            std::vector<tt::multi_device_tensor_averager> averagers;
            // An array of all the parameter tensors in the first network.  We will
@@ -536,6 +535,16 @@ namespace dlib
            std::vector<tensor*> reference_params;
            visit_layer_parameters(devices[0]->net, [&](size_t, tensor& t) { reference_params.push_back(&t); });
+            // We make separate thread pools with just one thread in them because we want
+            // to make sure each device is always executed on the same thread.  We care
+            // about this because there are thread_local context variables for some cuda
+            // components and they get regenerated when the current cuda device changes.
+            // Recreating them over and over is somewhat expensive so we want to avoid
+            // that.
+            std::vector<std::shared_ptr<thread_pool>> tp;
+            for (size_t i = 0; i < devices.size(); ++i)
+                tp.push_back(std::make_shared<thread_pool>(1));
            size_t iteration = 0;
            while(job_pipe.dequeue(next_job))
@@ -545,7 +554,7 @@ namespace dlib
                // right version for unsupervised or supervised training based on the type
                // of label_type.
                for (size_t i = 0; i < devices.size(); ++i)
-                    losses[i] = std::async(std::launch::async,[&,i](){ return compute_parameter_gradients(i, next_job, pick_which_run_update); });
+                    tp[i]->add_task_by_value([&,i](double& loss){ loss = compute_parameter_gradients(i, next_job, pick_which_run_update); }, losses[i]);
                // aggregate loss values from all the network computations.
                double theloss = 0;
                for (auto&& loss : losses)
@@ -596,10 +605,10 @@ namespace dlib
                // Now apply all the updates to each device.
                for (size_t i = 0; i < devices.size(); ++i)
-                    update_futs[i] = std::async(std::launch::async, [&,i](){ if (next_job.have_data[i]) update_parameters(i); });
+                    tp[i]->add_task_by_value([&,i](){ if (next_job.have_data[i]) update_parameters(i); });
                // and wait for the updates to all happen.
-                for (auto&& f : update_futs)
+                for (size_t i = 0; i < devices.size(); ++i)
-                    f.wait();
+                    tp[i]->wait_for_all_tasks();
                // Every now and then force all the parameters to be the same just to make

--- a/dlib/optimization/optimization.h
+++ b/dlib/optimization/optimization.h
@@ -482,7 +482,7 @@ namespace dlib
            << "\n\t x_upper.size():         " << x_upper.size()
        );
        DLIB_ASSERT (
-            min(x_upper-x_lower) > 0,
+            min(x_upper-x_lower) >= 0,
            "\tdouble find_min_box_constrained()"
            << "\n\t You have to supply proper box constraints to this function."
            << "\n\r min(x_upper-x_lower): " << min(x_upper-x_lower)
@@ -610,7 +610,7 @@ namespace dlib
            << "\n\t x_upper.size():         " << x_upper.size()
        );
        DLIB_ASSERT (
-            min(x_upper-x_lower) > 0,
+            min(x_upper-x_lower) >= 0,
            "\tdouble find_max_box_constrained()"
            << "\n\t You have to supply proper box constraints to this function."
            << "\n\r min(x_upper-x_lower): " << min(x_upper-x_lower)

--- a/dlib/optimization/optimization_abstract.h
+++ b/dlib/optimization/optimization_abstract.h
@@ -297,7 +297,7 @@ namespace dlib
            - is_col_vector(x_upper) == true
            - x.size() == x_lower.size() == x_upper.size()
              (i.e. x, x_lower, and x_upper need to all be column vectors of the same dimensionality)
-            - min(x_upper-x_lower) > 0
+            - min(x_upper-x_lower) >= 0
              (i.e. x_upper must contain upper bounds relative to x_lower)
        ensures
            - Performs a box constrained minimization of the function f() using the given
@@ -391,7 +391,7 @@ namespace dlib
            - is_col_vector(x_upper) == true
            - x.size() == x_lower.size() == x_upper.size()
              (i.e. x, x_lower, and x_upper need to all be column vectors of the same dimensionality)
-            - min(x_upper-x_lower) > 0
+            - min(x_upper-x_lower) >= 0
              (i.e. x_upper must contain upper bounds relative to x_lower)
        ensures
            - Performs a box constrained maximization of the function f() using the given

--- a/dlib/test/dnn.cpp
+++ b/dlib/test/dnn.cpp
@@ -165,13 +165,13 @@ namespace
        resizable_tensor running_means;
        resizable_tensor running_variances;
-        batch_normalize(dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
+        batch_normalize(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
        const double scale = (src.num_samples())/(src.num_samples()-1.0);
        // Turn back into biased variance estimate because that's how batch_normalize() works, so if we want to match it this is necessary.
        running_variances = mat(running_variances)/scale; 
-        batch_normalize_inference(dest2, src, gamma, beta, running_means, running_variances);
+        batch_normalize_inference(DEFAULT_BATCH_NORM_EPS,dest2, src, gamma, beta, running_means, running_variances);
        DLIB_TEST_MSG(max(abs(mat(dest2)-mat(dest))) < 1e-5, max(abs(mat(dest2)-mat(dest))));
-        cpu::batch_normalize_inference(dest3, src, gamma, beta, running_means, running_variances);
+        cpu::batch_normalize_inference(DEFAULT_BATCH_NORM_EPS,dest3, src, gamma, beta, running_means, running_variances);
        DLIB_TEST_MSG(max(abs(mat(dest3)-mat(dest))) < 1e-5, max(abs(mat(dest3)-mat(dest))));
@@ -179,7 +179,7 @@ namespace
            auto f = [&](float eps) {
                const float old = src.host()[idx];
                src.host()[idx] += eps;
-                batch_normalize(dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
+                batch_normalize(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
                float result = dot(gradient_input, dest);
                src.host()[idx] = old;
                return result;
@@ -191,7 +191,7 @@ namespace
            auto f = [&](float eps) {
                const float old = gamma.host()[idx];
                gamma.host()[idx] += eps;
-                batch_normalize(dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
+                batch_normalize(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
                float result = dot(gradient_input, dest);
                gamma.host()[idx] = old;
                return result;
@@ -203,7 +203,7 @@ namespace
            auto f = [&](float eps) {
                const float old = beta.host()[idx];
                beta.host()[idx] += eps;
-                batch_normalize(dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
+                batch_normalize(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
                float result = dot(gradient_input, dest);
                beta.host()[idx] = old;
                return result;
@@ -220,7 +220,7 @@ namespace
        gamma_grad = 8;
        beta_grad = 8;
-        batch_normalize_gradient(gradient_input, means, vars, src, gamma, src_grad, gamma_grad, beta_grad);
+        batch_normalize_gradient(DEFAULT_BATCH_NORM_EPS,gradient_input, means, vars, src, gamma, src_grad, gamma_grad, beta_grad);
        auto grad_error = compare_gradients(src_grad, grad_src);
        dlog << LINFO << "src error: " << grad_error;
@@ -250,14 +250,14 @@ namespace
        resizable_tensor running_means;
        resizable_tensor running_variances;
-        batch_normalize_conv(dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
+        batch_normalize_conv(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
        const double scale = (src.num_samples()*src.nr()*src.nc())/(src.num_samples()*src.nr()*src.nc()-1.0);
        // Turn back into biased variance estimate because that's how
        // batch_normalize_conv() works, so if we want to match it this is necessary.
        running_variances = mat(running_variances)/scale; 
-        batch_normalize_conv_inference(dest2, src, gamma, beta, running_means, running_variances);
+        batch_normalize_conv_inference(DEFAULT_BATCH_NORM_EPS,dest2, src, gamma, beta, running_means, running_variances);
        DLIB_TEST(max(abs(mat(dest2)-mat(dest))) < 1e-5);
-        cpu::batch_normalize_conv_inference(dest3, src, gamma, beta, running_means, running_variances);
+        cpu::batch_normalize_conv_inference(DEFAULT_BATCH_NORM_EPS,dest3, src, gamma, beta, running_means, running_variances);
        DLIB_TEST(max(abs(mat(dest3)-mat(dest))) < 1e-5);
@@ -265,7 +265,7 @@ namespace
            auto f = [&](float eps) {
                const float old = src.host()[idx];
                src.host()[idx] += eps;
-                batch_normalize_conv(dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
+                batch_normalize_conv(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
                float result = dot(gradient_input, dest);
                src.host()[idx] = old;
                return result;
@@ -277,7 +277,7 @@ namespace
            auto f = [&](float eps) {
                const float old = gamma.host()[idx];
                gamma.host()[idx] += eps;
-                batch_normalize_conv(dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
+                batch_normalize_conv(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
                float result = dot(gradient_input, dest);
                gamma.host()[idx] = old;
                return result;
@@ -289,7 +289,7 @@ namespace
            auto f = [&](float eps) {
                const float old = beta.host()[idx];
                beta.host()[idx] += eps;
-                batch_normalize_conv(dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
+                batch_normalize_conv(DEFAULT_BATCH_NORM_EPS,dest, means, vars, 1, running_means, running_variances, src, gamma, beta);
                float result = dot(gradient_input, dest);
                beta.host()[idx] = old;
                return result;
@@ -307,7 +307,7 @@ namespace
        gamma_grad = 9;
        beta_grad = 9;
-        batch_normalize_conv_gradient(gradient_input, means, vars, src, gamma, src_grad, gamma_grad, beta_grad);
+        batch_normalize_conv_gradient(DEFAULT_BATCH_NORM_EPS,gradient_input, means, vars, src, gamma, src_grad, gamma_grad, beta_grad);
        auto grad_error = compare_gradients(src_grad, grad_src);
@@ -662,11 +662,11 @@ namespace
        rnd.fill_uniform(params_grad);
        resizable_tensor mm(m), vv(v);
-        cpu::compute_adam_update(s, mm, vv, t, 0.01, 0.001, 0.9, 0.99, params, params_grad);
+        cpu::compute_adam_update(0,params.size(),s, mm, vv, t, 0.01, 0.001, 0.9, 0.99, params, params_grad);
        matrix<float> s1 = mat(s);
        rnd.fill_uniform(s);
-        cuda::compute_adam_update(s, m, v, t, 0.01, 0.001, 0.9, 0.99, params, params_grad);
+        cuda::compute_adam_update(0,params.size(),s, m, v, t, 0.01, 0.001, 0.9, 0.99, params, params_grad);
        matrix<float> s2 = mat(s);
        DLIB_TEST_MSG(max(abs(s1-s2)) < 1e-6, max(abs(s1-s2)));
@@ -775,6 +775,27 @@ namespace
        cpu::affine_transform(dest2, src2, srcb2, srcc2, 2, 3, 4, 5);
        DLIB_TEST(equal(mat(dest),mat(dest2)));
+        cuda::affine_transform(dest, src, srcb, srcc, 2, 3, 4, 0);
+        cpu::affine_transform(dest2, src2, srcb2, srcc2, 2, 3, 4, 0);
+        DLIB_TEST(equal(mat(dest),mat(dest2)));
+        cuda::affine_transform_range(0, dest.size(), dest, src, srcb, srcc, 2, 3, 4);
+        cpu::affine_transform_range(0, dest2.size(), dest2, src2, srcb2, srcc2, 2, 3, 4);
+        DLIB_TEST(equal(mat(dest),mat(dest2)));
+        if (3 < dest.size())
+        {
+            dest = 999;
+            dest2 = 999;
+            cuda::affine_transform_range(3, dest.size()-1, dest, src, srcb, srcc, 2, 3, 4);
+            cpu::affine_transform_range(3, dest2.size()-1, dest2, src2, srcb2, srcc2, 2, 3, 4);
+            DLIB_TEST(equal(mat(dest),mat(dest2)));
+            cuda::affine_transform_range(dest.size(), dest.size(), dest, src, srcb, srcc, 2, 3, 4);
+            cpu::affine_transform_range(dest2.size(), dest2.size(), dest2, src2, srcb2, srcc2, 2, 3, 4);
+            DLIB_TEST(equal(mat(dest),mat(dest2)));
+        }
        rnd.fill_uniform(dest);
        rnd.fill_uniform(src);
@@ -863,8 +884,8 @@ namespace
        rnd.fill_uniform(src);
-        cpu::batch_normalize(dest, means, invstds, 1, running_means, running_variances, src, gamma, beta);
+        cpu::batch_normalize(DEFAULT_BATCH_NORM_EPS,dest, means, invstds, 1, running_means, running_variances, src, gamma, beta);
-        cuda::batch_normalize(dest2,means2,invstds2, 1, running_means2, running_variances2, src, gamma, beta);
+        cuda::batch_normalize(DEFAULT_BATCH_NORM_EPS,dest2,means2,invstds2, 1, running_means2, running_variances2, src, gamma, beta);
        dlog << LINFO << "dest error:    "<< max(abs(mat(dest) -mat(dest2)));
        dlog << LINFO << "means error:   "<< max(abs(mat(means) -mat(means2)));
@@ -890,8 +911,8 @@ namespace
        rnd.fill_uniform(gradient_input);
-        cpu::batch_normalize_gradient(gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
+        cpu::batch_normalize_gradient(DEFAULT_BATCH_NORM_EPS,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
-        cuda::batch_normalize_gradient(gradient_input, means, invstds, src, gamma, src_grad2, gamma_grad2, beta_grad2);
+        cuda::batch_normalize_gradient(DEFAULT_BATCH_NORM_EPS,gradient_input, means, invstds, src, gamma, src_grad2, gamma_grad2, beta_grad2);
        dlog << LINFO << "src_grad error:   " << max(abs(mat(src_grad)-mat(src_grad2)));
        dlog << LINFO << "gamma_grad error: " << max(abs(mat(gamma_grad)-mat(gamma_grad2)));
@@ -917,8 +938,8 @@ namespace
        tt::tensor_rand rnd;
        rnd.fill_uniform(src);
-        cpu::batch_normalize_conv(dest,means,invstds,1,running_means,running_variances, src, gamma, beta);
+        cpu::batch_normalize_conv(DEFAULT_BATCH_NORM_EPS,dest,means,invstds,1,running_means,running_variances, src, gamma, beta);
-        cuda::batch_normalize_conv(dest2,means2,invstds2,1,running_means2,running_variances2, src, gamma, beta);
+        cuda::batch_normalize_conv(DEFAULT_BATCH_NORM_EPS,dest2,means2,invstds2,1,running_means2,running_variances2, src, gamma, beta);
        dlog << LINFO << "dest error:    "<< max(abs(mat(dest) -mat(dest2)));
        dlog << LINFO << "means error:   "<< max(abs(mat(means) -mat(means2)));
@@ -942,8 +963,8 @@ namespace
        rnd.fill_uniform(gradient_input);
-        cpu::batch_normalize_conv_gradient(gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
+        cpu::batch_normalize_conv_gradient(DEFAULT_BATCH_NORM_EPS,gradient_input, means, invstds, src, gamma, src_grad, gamma_grad, beta_grad);
-        cuda::batch_normalize_conv_gradient(gradient_input, means, invstds, src, gamma, src_grad2, gamma_grad2, beta_grad2);
+        cuda::batch_normalize_conv_gradient(DEFAULT_BATCH_NORM_EPS,gradient_input, means, invstds, src, gamma, src_grad2, gamma_grad2, beta_grad2);
        dlog << LINFO << "src_grad error:   " << max(abs(mat(src_grad)-mat(src_grad2)));
        dlog << LINFO << "gamma_grad error: " << max(abs(mat(gamma_grad)-mat(gamma_grad2)));
@@ -1318,6 +1339,72 @@ namespace
        DLIB_TEST(net2.subnet().subnet().subnet().layer_details().get_num_outputs() == 4);
    }
+// ----------------------------------------------------------------------------------------
+    template <
+        int N, 
+        template <typename> class BN, 
+        int stride, 
+        typename SUBNET
+        > 
+    using block  = BN<con<N,3,3,1,1,relu<BN<con<N,3,3,stride,stride,SUBNET>>>>>;
+    template <
+        template <int,template<typename>class,int,typename> class block, 
+        int N, 
+        template<typename>class BN, 
+        typename SUBNET
+        >
+    using residual = add_prev1<block<N,BN,1,tag1<SUBNET>>>;
+    template <
+        template <int,template<typename>class,int,typename> class block, 
+        int N, 
+        template<typename>class BN, 
+        typename SUBNET
+        >
+    using residual_down = add_prev2<avg_pool<2,2,2,2,skip1<tag2<block<N,BN,2,tag1<SUBNET>>>>>>;
+    template <typename SUBNET> using res       = relu<residual<block,8,bn_con,SUBNET>>;
+    template <typename SUBNET> using ares      = relu<residual<block,8,affine,SUBNET>>;
+    template <typename SUBNET> using res_down  = relu<residual_down<block,8,bn_con,SUBNET>>;
+    template <typename SUBNET> using ares_down = relu<residual_down<block,8,affine,SUBNET>>;
+    template <typename SUBNET> 
+    using pres  = prelu<add_prev1<bn_con<con<8,3,3,1,1,prelu<bn_con<con<8,3,3,1,1,tag1<SUBNET>>>>>>>>;
+    void test_visit_funcions()
+    {
+        using net_type2 = loss_multiclass_log<fc<10,
+            avg_pool_everything<
+            pres<res<res<res_down< // 2 prelu layers here
+            tag4<repeat<9,pres,    // 9 groups, each containing 2 prelu layers  
+            res_down<
+            res<
+            input<matrix<unsigned char>>
+            >>>>>>>>>>>;
+        net_type2 pnet;
+        DLIB_CASSERT(pnet.num_layers == 131, pnet.num_layers);
+        DLIB_CASSERT(pnet.num_computational_layers == 109, pnet.num_computational_layers);
+        std::vector<bool> hit(pnet.num_computational_layers, false);
+        size_t count = 0;
+        visit_layer_parameter_gradients(pnet, [&](size_t i, tensor& ){hit[i] = true; ++count; });
+        for (auto x : hit)
+            DLIB_TEST(x);
+        DLIB_TEST(count == pnet.num_computational_layers);
+        count = 0;
+        std::vector<bool> hit2(pnet.num_computational_layers, false);
+        visit_layer_parameters(pnet, [&](size_t i, tensor& ){hit2[i] = true; ++count; });
+        for (auto x : hit2)
+            DLIB_TEST(x);
+        DLIB_TEST(count == pnet.num_computational_layers);
+    }
 // ----------------------------------------------------------------------------------------
    class dnn_tester : public tester
@@ -1378,6 +1465,7 @@ namespace
            test_batch_normalize_conv();
            test_basic_tensor_ops();
            test_layers();
+            test_visit_funcions();
        }
    } a;

--- a/examples/dnn_mnist_advanced_ex.cpp
+++ b/examples/dnn_mnist_advanced_ex.cpp
@@ -20,29 +20,76 @@ using namespace dlib;
 // ----------------------------------------------------------------------------------------
-// Let's start by showing how you can conveniently define large networks.  The
+// Let's start by showing how you can conveniently define large and complex
-// most important tool for doing this are C++'s alias templates.  These let us
+// networks.  The most important tools for doing this are C++'s alias templates.
-// define new layer types that are combinations of a bunch of other layers.
+// These let us define new layer types that are combinations of a bunch of other
-// These will form the building blocks for more complex networks.
+// layers.  These will form the building blocks for more complex networks.
 // So let's begin by defining the building block of a residual network (see
 // Figure 2 in Deep Residual Learning for Image Recognition by He, Zhang, Ren,
-// and Sun).  You can see a few things in this statement.  The most obvious is
+// and Sun).  We are going to decompose the residual block into a few alias
-// that we have combined a bunch of layers into the name "base_res".  You can
+// statements.  First, we define the core block.
-// also see the use of the tag1 layer.  This layer doesn't do any computation.
-// It exists solely so other layers can refer to it.  In this case, the
+// Here we have parameterized the "block" layer on a BN layer (nominally some
-// add_prev1 layer looks for the tag1 layer and will take the tag1 output and
+// kind of batch normalization), the number of filter outputs N, and the stride
-// add it to the input of the add_prev1 layer.  This combination allows us to
+// the block operates at.
-// implement skip and residual style networks.  We have also made base_res
+template <
-// parameterized by BN, which will let us insert different batch normalization
+    int N, 
-// layers.
+    template <typename> class BN, 
-template <template <typename> class BN, typename SUBNET> 
+    int stride, 
-using base_res  = relu<add_prev1<BN<con<8,3,3,1,1,relu<BN<con<8,3,3,1,1,tag1<SUBNET>>>>>>>>;
+    typename SUBNET
+    > 
-// We also want a residual block that begins by doing downsampling.  We can
+using block  = BN<con<N,3,3,1,1,relu<BN<con<N,3,3,stride,stride,SUBNET>>>>>;
-// reuse base_res to define it like this:
-template <template <typename> class BN, typename SUBNET> 
+// Next, we need to define the skip layer mechanism used in the residual network
-using base_res_down  = base_res<BN,avg_pool<1,1,2,2,SUBNET>>;
+// paper.  They create their blocks by adding the input tensor to the output of
+// each block.  So we define an alias statement that takes a block and wraps it
+// with this skip/add structure.
+// Note the tag layer.  This layer doesn't do any computation.  It exists solely
+// so other layers can refer to it.  In this case, the add_prev1 layer looks for
+// the tag1 layer and will take the tag1 output and add it to the input of the
+// add_prev1 layer.  This combination allows us to implement skip and residual
+// style networks.  We have also set the block stride to 1 in this statement.
+// The significance of that is explained next.
+template <
+    template <int,template<typename>class,int,typename> class block, 
+    int N, 
+    template<typename>class BN, 
+    typename SUBNET
+    >
+using residual = add_prev1<block<N,BN,1,tag1<SUBNET>>>;
+// Some residual blocks do downsampling.  They do this by using a stride of 2
+// instead of 1.  However, when downsampling we need to also take care to
+// downsample the part of the network that adds the original input to the output
+// or the sizes won't make sense (the network will still run, but the results
+// aren't as good).  So here we define a downsampling version of residual.  In
+// it, we make use of the skip1 layer.  This layer simply outputs whatever is
+// output by the tag1 layer.  Therefore, the skip1 layer (there are also skip2,
+// skip3, etc. in dlib) allows you to create branching network structures.
+// residual_down creates a network structure like this:
+/*
+         input from SUBNET
+             /     \
+            /       \
+         block     downsample(using avg_pool)
+            \       /
+             \     /
+           add tensors (using add_prev2 which adds the output of tag2 with avg_pool's output)
+                |
+              output
+*/
+template <
+    template <int,template<typename>class,int,typename> class block, 
+    int N, 
+    template<typename>class BN, 
+    typename SUBNET
+    >
+using residual_down = add_prev2<avg_pool<2,2,2,2,skip1<tag2<block<N,BN,2,tag1<SUBNET>>>>>>;
 // Now we can define 4 different residual blocks we will use in this example.
 // The first two are non-downsampling residual blocks while the last two
@@ -50,10 +97,10 @@ using base_res_down  = base_res<BN,avg_pool<1,1,2,2,SUBNET>>;
 // ares_down have had the batch normalization replaced with simple affine
 // layers.  We will use the affine version of the layers when testing our
 // networks.
-template <typename SUBNET> using res       = base_res<bn_con,SUBNET>;
+template <typename SUBNET> using res       = relu<residual<block,8,bn_con,SUBNET>>;
-template <typename SUBNET> using ares      = base_res<affine,SUBNET>;
+template <typename SUBNET> using ares      = relu<residual<block,8,affine,SUBNET>>;
-template <typename SUBNET> using res_down  = base_res_down<bn_con,SUBNET>;
+template <typename SUBNET> using res_down  = relu<residual_down<block,8,bn_con,SUBNET>>;
-template <typename SUBNET> using ares_down = base_res_down<affine,SUBNET>;
+template <typename SUBNET> using ares_down = relu<residual_down<block,8,affine,SUBNET>>;
@@ -145,39 +192,41 @@ int main(int argc, char** argv) try
    // These print statements will output this (I've truncated it since it's
    // long, but you get the idea):
    /*
-        The pnet has 127 layers in it.
+        The pnet has 131 layers in it.
        layer<0>    loss_multiclass_log
-        layer<1>    fc       (num_outputs=10)
+        layer<1>    fc       (num_outputs=10) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
        layer<2>    avg_pool (nr=0, nc=0, stride_y=1, stride_x=1, padding_y=0, padding_x=0)
        layer<3>    prelu    (initial_param_value=0.2)
        layer<4>    add_prev
-        layer<5>    bn_con
+        layer<5>    bn_con   eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
-        layer<6>    con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
+        layer<6>    con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
        layer<7>    prelu    (initial_param_value=0.25)
-        layer<8>    bn_con
+        layer<8>    bn_con   eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
-        layer<9>    con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
+        layer<9>    con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
        layer<10>   tag1
        ...
-        layer<33>   con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
+        layer<34>   relu
-        layer<34>   tag1
+        layer<35>   bn_con   eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
-        layer<35>   avg_pool (nr=1, nc=1, stride_y=2, stride_x=2, padding_y=0, padding_x=0)
+        layer<36>   con      (num_filters=8, nr=3, nc=3, stride_y=2, stride_x=2, padding_y=0, padding_x=0) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
-        layer<36>   tag4
+        layer<37>   tag1
-        layer<37>   prelu    (initial_param_value=0.3)
+        layer<38>   tag4
-        layer<38>   add_prev
+        layer<39>   prelu    (initial_param_value=0.3)
-        layer<39>   bn_con
+        layer<40>   add_prev
+        layer<41>   bn_con   eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
        ...
-        layer<115>  con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
-        layer<116>  tag1
-        layer<117>  avg_pool (nr=1, nc=1, stride_y=2, stride_x=2, padding_y=0, padding_x=0)
        layer<118>  relu
-        layer<119>  add_prev
+        layer<119>  bn_con   eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
-        layer<120>  bn_con
+        layer<120>  con      (num_filters=8, nr=3, nc=3, stride_y=2, stride_x=2, padding_y=0, padding_x=0) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
-        layer<121>  con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
+        layer<121>  tag1
        layer<122>  relu
-        layer<123>  bn_con
+        layer<123>  add_prev
-        layer<124>  con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
+        layer<124>  bn_con   eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
-        layer<125>  tag1
+        layer<125>  con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
-        layer<126>  input<matrix>
+        layer<126>  relu
+        layer<127>  bn_con   eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
+        layer<128>  con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
+        layer<129>  tag1
+        layer<130>  input<matrix>
    */
    // Now that we know the index numbers for each layer, we can access them
@@ -195,7 +244,7 @@ int main(int argc, char** argv) try
    // parts of your network and access them by layer<tag>().  You can also
    // index relative to a tag.  So for example, to access the layer immediately
    // after tag4 you can say:
-    layer<tag4,1>(pnet); // Equivalent to layer<36+1>(pnet).
+    layer<tag4,1>(pnet); // Equivalent to layer<38+1>(pnet).
    // Or to access the layer 2 layers after tag4:
    layer<tag4,2>(pnet);