// Copyright (C) 2015  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_DNn_CORE_H_
#define DLIB_DNn_CORE_H_

#include "core_abstract.h"
#include "tensor.h"
#include <iterator>
#include <memory>
#include <sstream>
#include <type_traits>
#include "../statistics.h"
#include "../rand.h"
#include "../algs.h"
#include <utility>
#include <tuple>
#include <cmath>


namespace dlib
{

// ----------------------------------------------------------------------------------------

    inline double log1pexp(double x)
    {
        using std::exp;
        using namespace std; // Do this instead of using std::log1p because some compilers
                             // error out otherwise (E.g. gcc 4.9 in cygwin)
        if (x <= -37)
            return exp(x);
        else if (-37 < x && x <= 18)
            return log1p(exp(x));
        else if (18 < x && x <= 33.3)
            return x + exp(-x);
        else
            return x;
    }
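    // A quick sanity check of the branches above (illustrative only, not part of the
    // interface): log1pexp(0) returns std::log(2) via the log1p(exp(x)) branch, while
    // log1pexp(1000) simply returns 1000 since exp(1000) would overflow a double.  In
    // every branch the value computed is log(1+exp(x)) without overflow or underflow.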
    
// ----------------------------------------------------------------------------------------

    // Tell us if T is one of the special layer types (i.e. add_layer, add_tag_layer, or
    // add_skip_layer).
    template <typename T> struct is_nonloss_layer_type : std::false_type {};
    // Tell us if T is an instance of add_loss_layer.
    template <typename T> struct is_loss_layer_type : std::false_type {};

    namespace impl
    {
        template <size_t... n>
        struct ct_integers_list {
            template <size_t m>
            struct push_back
            {
                typedef ct_integers_list<n..., m> type;
            };
        };

        template <size_t max>
        struct ct_make_integer_range
        {
            // recursively call push_back on ct_integers_list to build a range from 1 to max
            // inclusive.
            typedef typename ct_make_integer_range<max-1>::type::template push_back<max>::type type;
        };

        template <>
        struct ct_make_integer_range<0>
        {
            typedef ct_integers_list<> type;
        };

        template <size_t... indices, typename Tuple>
        auto tuple_subset(
            const Tuple& item, 
            ct_integers_list<indices...>
        ) -> decltype(std::make_tuple(std::get<indices>(item)...))
        {
            return std::make_tuple(std::get<indices>(item)...);
        }

        template <typename T> struct alwaysbool { typedef bool type; };

        resizable_tensor& rt();

        // The significance of a layer's backward method requiring forward's outputs is
        // that such a layer can't have an in-place layer stacked on top of it because
        // in-place layers overwrite the output of the layer they sit on top of.
        template <typename layer_type, typename SUBNET>
        constexpr auto backward_requires_forward_output(
            layer_type& layer,
            SUBNET& sub
        ) -> typename alwaysbool<decltype(layer.backward(rt(),rt(),sub,rt()))>::type
        {
            return true;
        }

        template <typename layer_type, typename SUBNET>
        constexpr auto backward_requires_forward_output(
            layer_type& layer,
            SUBNET& sub
        ) -> typename alwaysbool<decltype(layer.backward(rt(),sub,rt()))>::type
        {
            return false;
        }

        template <typename layer_type, typename SUBNET>
        constexpr auto backward_requires_forward_output(
            layer_type& layer,
            SUBNET& sub
        ) -> typename alwaysbool<decltype(layer.backward_inplace(rt(),rt(),sub.get_gradient_input(),rt()))>::type
        {
            return true;
        }

        template <typename layer_type, typename SUBNET>
        constexpr auto has_inplace_backward(
            layer_type& layer,
            SUBNET& sub
        ) -> typename alwaysbool<decltype(layer.backward(rt(),rt(),sub,rt()))>::type
        {
            return false;
        }

        template <typename layer_type, typename SUBNET>
        constexpr auto has_inplace_backward(
            layer_type& layer,
            SUBNET& sub
        ) -> typename alwaysbool<decltype(layer.backward(rt(),sub,rt()))>::type
        {
            return false;
        }

        template <typename layer_type, typename SUBNET>
        constexpr auto has_inplace_backward(
            layer_type& layer,
            SUBNET& sub
        ) -> typename alwaysbool<decltype(layer.backward_inplace(rt(),rt(),sub.get_gradient_input(),rt()))>::type
        {
            return true;
        }

        template <typename layer_type, typename SUBNET>
        constexpr auto is_inplace_layer(
            layer_type& layer,
            const SUBNET& sub 
        ) -> typename alwaysbool<decltype(layer.forward(sub,rt()))>::type
        {
            return false;
        }

        template <typename layer_type, typename SUBNET>
        constexpr auto is_inplace_layer(
            layer_type& layer,
            const SUBNET& sub
        ) -> typename alwaysbool<decltype(layer.forward_inplace(sub.get_output(),rt()))>::type
        {
            return true;
        }
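        // To make the three detectors above concrete (informal sketch, the layer classes
        // described here are hypothetical): a layer declaring
        //     void backward(const tensor& computed_output, const tensor& gradient_input,
        //                   SUBNET& sub, tensor& params_grad);
        // gives backward_requires_forward_output()==true and has_inplace_backward()==false;
        // a layer declaring
        //     void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
        // gives false for both; and a layer declaring forward_inplace()/backward_inplace()
        // gives is_inplace_layer()==true and has_inplace_backward()==true.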

        template <typename layer_type, typename SUBNET>
        auto call_layer_backward(
            layer_type& layer,
            const tensor& computed_output, 
            const tensor& gradient_input, 
            SUBNET& sub, 
            tensor& params_grad
        ) -> decltype(layer.backward(computed_output,gradient_input,sub,params_grad))
        {
            layer.backward(computed_output,gradient_input,sub,params_grad);
        }

        template <typename layer_type, typename SUBNET>
        auto call_layer_backward(
            layer_type& layer,
            const tensor& , 
            const tensor& gradient_input, 
            SUBNET& sub, 
            tensor& params_grad
        ) -> decltype(layer.backward(gradient_input,sub,params_grad))
        {
            layer.backward(gradient_input,sub,params_grad);
        }

        template <typename layer_type, typename SUBNET>
        auto call_layer_backward(
            layer_type& layer,
            const tensor& computed_output, 
            const tensor& gradient_input, 
            SUBNET& sub, 
            tensor& params_grad
        ) -> decltype(layer.backward_inplace(computed_output,gradient_input,sub.get_gradient_input(),params_grad))
        {
            layer.backward_inplace(computed_output,gradient_input,sub.get_gradient_input(),params_grad);
        }


        template <typename layer_type, typename SUBNET>
        auto call_layer_forward(
            layer_type& layer,
            const SUBNET& sub, 
            tensor& /*data_output*/
        ) -> decltype(layer.forward(sub,rt()))
        {
            // This overload of call_layer_forward() is here because this template
            // naturally gets instantiated but only on code paths that never get executed.
            // So rather than writing a bunch of hard to read template magic around call
            // sites we just have this overload that doesn't do anything (and an assert to
            // make sure that's the case).
            DLIB_CASSERT(false, "This should never happen");
        }

        template <typename layer_type, typename SUBNET>
        auto call_layer_forward(
            layer_type& layer,
            const SUBNET& sub, 
            resizable_tensor& data_output
        ) -> decltype(layer.forward(sub,data_output))
        {
            layer.forward(sub,data_output);
        }

        template <typename layer_type, typename SUBNET>
        auto call_layer_forward(
            layer_type& layer,
            const SUBNET& sub, 
            tensor& data_output
        ) -> decltype(layer.forward_inplace(sub.get_output(),data_output))
        {
            layer.forward_inplace(sub.get_output(),data_output);
        }

        template <typename layer_type, typename SUBNET>
        auto call_layer_forward(
            layer_type& layer,
            const SUBNET& sub, 
            resizable_tensor& data_output
        ) -> decltype(layer.forward_inplace(sub.get_output(),data_output))
        {
            if (!have_same_dimensions(data_output, sub.get_output()))
                data_output.copy_size(sub.get_output());
            layer.forward_inplace(sub.get_output(),data_output);
        }


    } // end namespace impl

    template <typename Head, typename... Tail>
    std::tuple<Tail...> tuple_tail(
        const std::tuple<Head, Tail...>& item
    )
    {
        return impl::tuple_subset(item, typename impl::ct_make_integer_range<sizeof...(Tail)>::type());
    }
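    // For example (illustrative only): tuple_tail(std::make_tuple(1, 2.5, 'c')) returns
    // std::make_tuple(2.5, 'c'), i.e. the original tuple with its first element dropped.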

// ----------------------------------------------------------------------------------------

    inline void randomize_parameters (
        tensor& params,
        unsigned long num_inputs_and_outputs,
        dlib::rand& rnd
    )
    {
        for (auto& val : params)
        {
            // Draw a random number to initialize the layer according to formula (16)
            // from Understanding the difficulty of training deep feedforward neural
            // networks by Xavier Glorot and Yoshua Bengio.
            val = 2*rnd.get_random_float()-1;
            val *= std::sqrt(6.0/(num_inputs_and_outputs));
        }
    }
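    // Said differently, each parameter ends up uniformly distributed in the interval
    // [-sqrt(6/num_inputs_and_outputs), +sqrt(6/num_inputs_and_outputs)], the range
    // suggested by the Glorot and Bengio paper cited above.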

// ----------------------------------------------------------------------------------------

    template <typename T, size_t N>
    class sstack
    {
    public:
        static_assert(N > 0, "You can't create an empty sstack.");
        typedef T value_type;
        const static size_t num_elements = N;

        sstack() {}
        sstack(const T& item_) : item(item_), data(item_) {}

        const T& top() const { return item; }
        T& top() { return item; }

        size_t size() const { return N; }

        const sstack<T,N-1>& pop() const { return data; }
        sstack<T,N-1>& pop() { return data; }

        friend void serialize(const sstack& item, std::ostream& out)
        {
            serialize(item.top(), out);
            serialize(item.pop(), out);
        }

        friend void deserialize(sstack& item, std::istream& in)
        {
            deserialize(item.top(), in);
            deserialize(item.pop(), in);
        }

    private:
        T item;
        sstack<T,N-1> data;
    };

    template <typename T>
    class sstack<T,1> // base case of recursive definition.
    {
    public:
        sstack() {}
        sstack(const T& item_) : item(item_) {}

        const T& top() const { return item; }
        T& top() { return item; }

        size_t size() const { return 1; }

        friend void serialize(const sstack& item, std::ostream& out)
        {
            serialize(item.top(), out);
        }

        friend void deserialize(sstack& item, std::istream& in)
        {
            deserialize(item.top(), in);
        }

    private:
        T item;
    };
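    // Example usage (illustrative sketch): an sstack<int,3> is a stack whose size is
    // fixed at compile time to exactly 3 elements.
    //     sstack<int,3> s(42);            // all three elements initialized to 42
    //     s.top() = 7;                    // modify the first element
    //     sstack<int,2>& rest = s.pop();  // the remaining two elements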

// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------

    namespace dimpl
    {
        template <typename T, bool is_first = true, typename enabled=void>
        class subnet_wrapper
        {
            /*!
                WHAT THIS OBJECT REPRESENTS
                    This is a tool that makes an add_layer or add_loss_layer object
                    expose only the part of its interface defined by the SUBNET
                    type in layers_abstract.h.  This way, when we pass subnetwork
                    objects to the layer callbacks those callbacks won't be able to 
                    interact with the subnetworks in a way other than specified 
                    by the SUBNET interface spec.

                    We also allow the top layer of a subnet_wrapper stack to call the
                    private_get_output() and private_get_gradient_input() functions.  This
                    way, layers that have had their output/gradient overwritten by in-place
                    layers can only be accessed from the in-place layers that sit directly
                    on top of them since those in-place layers are the only layers that
                    know how to interact with them properly.
            !*/

        public:
            subnet_wrapper(const subnet_wrapper&) = delete;
            subnet_wrapper& operator=(const subnet_wrapper&) = delete;

            subnet_wrapper(T& l_) {}
            // Nothing here because in this case T is one of the input layer types 
            // that doesn't have anything in it.
        };

        template <typename T>
        class subnet_wrapper<T,true, typename std::enable_if<is_nonloss_layer_type<T>::value>::type>
        {

        public:
            subnet_wrapper(const subnet_wrapper&) = delete;
            subnet_wrapper& operator=(const subnet_wrapper&) = delete;

            typedef T wrapped_type;
            const static size_t num_layers = T::num_layers;

            subnet_wrapper(T& l_) : l(l_),subnetwork(l.subnet()) {}

            const tensor& get_output() const { return l.private_get_output(); }
            tensor& get_gradient_input() { return l.private_get_gradient_input(); }

            const subnet_wrapper<typename T::subnet_type,false>& subnet() const { return subnetwork; }
            subnet_wrapper<typename T::subnet_type,false>& subnet() { return subnetwork; }

        private:
            T& l;
            subnet_wrapper<typename T::subnet_type,false> subnetwork;
        };

        template <typename T>
        class subnet_wrapper<T,false, typename std::enable_if<is_nonloss_layer_type<T>::value>::type>
        {

        public:
            subnet_wrapper(const subnet_wrapper&) = delete;
            subnet_wrapper& operator=(const subnet_wrapper&) = delete;

            typedef T wrapped_type;
            const static size_t num_layers = T::num_layers;

            subnet_wrapper(T& l_) : l(l_),subnetwork(l.subnet()) {}

            const tensor& get_output() const { return l.get_output(); }
            tensor& get_gradient_input() { return l.get_gradient_input(); }

            const subnet_wrapper<typename T::subnet_type,false>& subnet() const { return subnetwork; }
            subnet_wrapper<typename T::subnet_type,false>& subnet() { return subnetwork; }

        private:
            T& l;
            subnet_wrapper<typename T::subnet_type,false> subnetwork;
        };
    }

// ----------------------------------------------------------------------------------------

    template <typename LAYER_DETAILS, typename SUBNET, typename enabled = void>
    class add_layer;

    template <typename T, typename U>
    struct is_nonloss_layer_type<add_layer<T,U>> : std::true_type {};

    template <typename LAYER_DETAILS, typename SUBNET>
    class add_layer<LAYER_DETAILS,SUBNET, 
            typename std::enable_if<is_nonloss_layer_type<SUBNET>::value>::type>
    {
    public:
        typedef LAYER_DETAILS layer_details_type;
        typedef SUBNET subnet_type;
        typedef typename subnet_type::input_type input_type;
        const static size_t num_layers = subnet_type::num_layers + 1;
        const static unsigned int sample_expansion_factor = subnet_type::sample_expansion_factor;

        add_layer(
        ):
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {
            if (this_layer_operates_inplace())
                subnetwork.disable_output_and_gradient_getters();
        }

        add_layer(const add_layer&) = default;
        add_layer& operator=(const add_layer&) = default;
        add_layer(add_layer&& item) : add_layer() { swap(item); }
        add_layer& operator=(add_layer&& item) { swap(item); return *this; }

        template <typename T, typename U, typename E>
        friend class add_layer;
        template <typename T, bool is_first, typename E>
        friend class dimpl::subnet_wrapper;

        // Allow copying networks from one to another as long as their corresponding 
        // layers can be constructed from each other.
        template <typename T, typename U, typename E>
        add_layer(
            const add_layer<T,U,E>& item
        ) :
            subnetwork(item.subnet()),
            details(item.layer_details()), 
            this_layer_setup_called(item.this_layer_setup_called),
            gradient_input_is_stale(item.gradient_input_is_stale),
            get_output_and_gradient_input_disabled(item.get_output_and_gradient_input_disabled),
            x_grad(item.x_grad),
            cached_output(item.cached_output)
        {
            if (this_layer_operates_inplace())
                subnetwork.disable_output_and_gradient_getters();
        }

        template <typename ...T>
        add_layer(
            const LAYER_DETAILS& layer_det, 
            T&& ...args
        ) : 
            details(layer_det), 
            subnetwork(std::forward<T>(args)...),
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {
            if (this_layer_operates_inplace())
                subnetwork.disable_output_and_gradient_getters();
        }

        template <typename ...T>
        add_layer(
            LAYER_DETAILS&& layer_det, 
            T&& ...args
        ) : 
            details(std::move(layer_det)), 
            subnetwork(std::forward<T>(args)...),
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {
            if (this_layer_operates_inplace())
                subnetwork.disable_output_and_gradient_getters();
        }

        template <typename ...T, typename ...U>
        add_layer(
            const std::tuple<LAYER_DETAILS,U...>& layer_det, 
            T&& ...args
        ) : 
            details(std::get<0>(layer_det)), 
            subnetwork(tuple_tail(layer_det),std::forward<T>(args)...),
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {
            if (this_layer_operates_inplace())
                subnetwork.disable_output_and_gradient_getters();
        }

        template <typename ...T, typename ...U>
        add_layer(
            std::tuple<>,
            const std::tuple<LAYER_DETAILS,U...>& layer_det, 
            T&& ...args
        ) : add_layer(layer_det,args...) { }

        template <typename ...T>
        add_layer(
            std::tuple<>, 
            LAYER_DETAILS&& layer_det, 
            T&& ...args
        ) : add_layer(layer_det, args...) { }

        template <typename input_iterator>
        void to_tensor (
            input_iterator ibegin,
            input_iterator iend,
            resizable_tensor& data
        ) const
        {
            subnetwork.to_tensor(ibegin,iend,data);
        }

        template <typename input_iterator>
        const tensor& operator() (
            input_iterator ibegin,
            input_iterator iend
        )
        {
            to_tensor(ibegin,iend,temp_tensor);
            return forward(temp_tensor);
        }


        const tensor& operator() (const input_type& x)
        {
            return (*this)(&x, &x+1);
        }

        const tensor& forward(const tensor& x)
        {
            subnetwork.forward(x);
            const dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
            if (!this_layer_setup_called)
            {
                details.setup(wsub);
                this_layer_setup_called = true;
            }
            if (this_layer_operates_inplace())
                impl::call_layer_forward(details, wsub, private_get_output());
            else
                impl::call_layer_forward(details, wsub, cached_output);

            gradient_input_is_stale = true;
            return private_get_output();
        }
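        // For example (informal sketch; details1, details2, and my_input are hypothetical
        // layer-details and input-layer types):
        //     add_layer<details2, add_layer<details1, my_input>> net;
        //     const tensor& out = net(some_sample);  // to_tensor() followed by forward()
        // forward() runs the subnetwork first and then this layer, calling setup() on
        // this layer's details the first time it is reached.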

    private:
        tensor& private_get_output() const
        { 
            if (const_cast<add_layer&>(*this).this_layer_operates_inplace())
                return subnetwork.private_get_output();
            else
                return const_cast<resizable_tensor&>(cached_output); 
        }
        tensor& private_get_gradient_input() 
        { 
            if (this_layer_operates_inplace())
            {
                return subnetwork.private_get_gradient_input();
            }
            else
            {
                if (gradient_input_is_stale)
                {
                    gradient_input_is_stale = false;
                    x_grad.copy_size(private_get_output());
                    x_grad = 0;
                }
                return x_grad; 
            }
        }
        void disable_output_and_gradient_getters (
        ) { get_output_and_gradient_input_disabled = true; }
    public:
        const tensor& get_output() const 
        { 
            if (get_output_and_gradient_input_disabled)
                throw dlib::error("Accessing this layer's get_output() is disabled because an in-place layer has been stacked on top of it.");
            return private_get_output(); 
        }
        tensor& get_gradient_input() 
        { 
            if (get_output_and_gradient_input_disabled)
                throw dlib::error("Accessing this layer's get_gradient_input() is disabled because an in-place layer has been stacked on top of it.");
            return private_get_gradient_input();
        }

        template <typename solver_type>
        void update(const tensor& x, sstack<solver_type,num_layers>& solvers)
        {
            dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
            params_grad.copy_size(details.get_layer_params());
            impl::call_layer_backward(details, private_get_output(),
                private_get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
            // Don't try to adjust the parameters if this layer doesn't have any.
            if (params_grad.size() != 0)
                solvers.top()(details, static_cast<const tensor&>(params_grad));
            subnetwork.update(x, solvers.pop());
            gradient_input_is_stale = true;
        }

        const subnet_type& subnet() const { return subnetwork; }
        subnet_type& subnet() { return subnetwork; }

        const layer_details_type& layer_details() const { return details; } 
        layer_details_type& layer_details() { return details; } 

        void clean()
        {
            x_grad.clear();
            cached_output.clear();
            params_grad.clear();
            temp_tensor.clear();
            gradient_input_is_stale = true;
            subnetwork.clean();
        }

        friend void serialize(const add_layer& item, std::ostream& out)
        {
            int version = 1;
            serialize(version, out);
            serialize(item.subnetwork, out);
            serialize(item.details, out);
            serialize(item.this_layer_setup_called, out);
            serialize(item.gradient_input_is_stale, out);
            serialize(item.get_output_and_gradient_input_disabled, out);
            serialize(item.x_grad, out);
            serialize(item.cached_output, out);
        }

        friend void deserialize(add_layer& item, std::istream& in)
        {
            int version = 0;
            deserialize(version, in);
            if (version != 1)
                throw serialization_error("Unexpected version found while deserializing dlib::add_layer.");
            deserialize(item.subnetwork, in);
            deserialize(item.details, in);
            deserialize(item.this_layer_setup_called, in);
            deserialize(item.gradient_input_is_stale, in);
            deserialize(item.get_output_and_gradient_input_disabled, in);
            deserialize(item.x_grad, in);
            deserialize(item.cached_output, in);
        }

    private:

        bool this_layer_operates_inplace(
        ) 
        {
            // This layer can run in-place if it's an in-place capable layer and also if
            // the layer it's on top of doesn't need its own output tensor (since in-place
            // layers overwrite that tensor)
            return impl::is_inplace_layer(details, subnetwork) && !subnetwork.this_layer_requires_forward_output();
        }
        bool this_layer_requires_forward_output(
        ) 
        {
            return impl::backward_requires_forward_output(details, subnetwork);
        }

        void swap(add_layer& item)
        {
            std::swap(subnetwork,item.subnetwork);
            std::swap(details, item.details);
            std::swap(this_layer_setup_called, item.this_layer_setup_called);
            std::swap(gradient_input_is_stale, item.gradient_input_is_stale);
            std::swap(get_output_and_gradient_input_disabled, item.get_output_and_gradient_input_disabled);
            std::swap(x_grad, item.x_grad);
            std::swap(cached_output, item.cached_output);
        }


        subnet_type subnetwork;
        LAYER_DETAILS details;
        bool this_layer_setup_called;
        bool gradient_input_is_stale;
        bool get_output_and_gradient_input_disabled;
        // Note that if this_layer_operates_inplace()==true then x_grad and cached_output
        // are not used at all.  Instead, this layer uses these variables from the lower
        // layer.
        resizable_tensor x_grad;
        resizable_tensor cached_output; 

        // The following 2 objects don't logically contribute to the state of this class.
        // They are only here to prevent them from being reallocated over and over in
        // member functions.
        resizable_tensor params_grad; 
        resizable_tensor temp_tensor;

    };

// ----------------------------------------------------------------------------------------

// This version of add_layer handles the special case where the subnetwork being given is
// just an input layer object.
    template <typename LAYER_DETAILS, typename INPUT_LAYER, typename enabled>
    class add_layer
    {
    public:
        typedef LAYER_DETAILS layer_details_type;
        typedef INPUT_LAYER subnet_type;
        typedef typename INPUT_LAYER::input_type input_type;
        const static unsigned int sample_expansion_factor = INPUT_LAYER::sample_expansion_factor;
        const static size_t num_layers = 1;
        static_assert(sample_expansion_factor >= 1,
            "The input layer can't produce fewer output tensors than there are inputs.");

        add_layer(
        ): 
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {}

        add_layer(const add_layer&) = default;
        add_layer(add_layer&& item) : add_layer() { swap(item); }
        add_layer& operator=(const add_layer&) = default;
        add_layer& operator=(add_layer&& item) { swap(item); return *this; }

        template <typename T, typename U, typename E>
        friend class add_layer;
        template <typename T, bool is_first, typename E>
        friend class dimpl::subnet_wrapper;

        // Allow copying networks from one to another as long as their corresponding 
        // layers can be constructed from each other.
        template <typename T, typename U, typename E>
        add_layer(
            const add_layer<T,U,E>& item
        ):
            input_layer(item.subnet()),
            details(item.layer_details()),
            this_layer_setup_called(item.this_layer_setup_called),
            gradient_input_is_stale(item.gradient_input_is_stale),
            get_output_and_gradient_input_disabled(false),
            x_grad(item.x_grad),
            cached_output(item.cached_output)
        {
        }

        add_layer(
            const LAYER_DETAILS& layer_det
        ) : 
            details(layer_det), 
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {}

        add_layer(
            LAYER_DETAILS&& layer_det
        ) : 
            details(std::move(layer_det)), 
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {}

        add_layer(
            LAYER_DETAILS layer_det, 
            INPUT_LAYER il
        ) : 
            details(std::move(layer_det)),
            input_layer(std::move(il)),
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {}

        add_layer(
            std::tuple<>,
            const LAYER_DETAILS& layer_det
        ) : add_layer(layer_det) {}

        add_layer(
            std::tuple<>,
            LAYER_DETAILS&& layer_det
        ) : add_layer(layer_det) {}

        add_layer(
            std::tuple<>,
            LAYER_DETAILS layer_det, 
            INPUT_LAYER il
        ) : add_layer(layer_det,il) {}

        add_layer(
            const std::tuple<LAYER_DETAILS>& layer_det
        ) : add_layer(std::get<0>(layer_det)) {}

        add_layer(
            const std::tuple<LAYER_DETAILS>& layer_det,
            INPUT_LAYER il
        ) : add_layer(std::get<0>(layer_det),il) {}

        template <typename input_iterator>
        void to_tensor (
            input_iterator ibegin,
            input_iterator iend,
            resizable_tensor& data
        ) const
        {
            input_layer.to_tensor(ibegin, iend, data);
            // make sure the input layer's to_tensor() function is implemented properly.
            DLIB_CASSERT(std::distance(ibegin,iend)*sample_expansion_factor == data.num_samples(),"");
            data.async_copy_to_device();
        }


        template <typename input_iterator>
        const tensor& operator() (
            input_iterator ibegin,
            input_iterator iend
        )
        {
            to_tensor(ibegin,iend,temp_tensor);
            return forward(temp_tensor);
        }


        const tensor& operator() (const input_type& x)
        {
            return (*this)(&x, &x+1);
        }

        const tensor& forward (const tensor& x)
        {
            DLIB_CASSERT(x.num_samples()%sample_expansion_factor == 0,"");
            subnet_wrapper wsub(x, grad_final_ignored);
            if (!this_layer_setup_called)
            {
                details.setup(wsub);
                this_layer_setup_called = true;
            }
            impl::call_layer_forward(details, wsub, cached_output);
            gradient_input_is_stale = true;
            return private_get_output();
        }

    private:
        tensor& private_get_output() const { return const_cast<resizable_tensor&>(cached_output); }
        tensor& private_get_gradient_input() 
        { 
            if (gradient_input_is_stale)
            {
                gradient_input_is_stale = false;
                x_grad.copy_size(private_get_output());
                x_grad = 0;
            }
            return x_grad; 
        }
        void disable_output_and_gradient_getters (
        ) { get_output_and_gradient_input_disabled = true; }
    public:
        const tensor& get_output() const 
        { 
            if (get_output_and_gradient_input_disabled)
                throw dlib::error("Accessing this layer's get_output() is disabled because an in-place layer has been stacked on top of it.");
            return private_get_output(); 
        }
        tensor& get_gradient_input() 
        { 
            if (get_output_and_gradient_input_disabled)
                throw dlib::error("Accessing this layer's get_gradient_input() is disabled because an in-place layer has been stacked on top of it.");
            return private_get_gradient_input();
        }

        template <typename solver_type>
        void update(const tensor& x, sstack<solver_type,num_layers>& solvers)
        {
            subnet_wrapper wsub(x, grad_final_ignored);
            params_grad.copy_size(details.get_layer_params());
            impl::call_layer_backward(details, private_get_output(),
                private_get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
            // Don't try to adjust the parameters if this layer doesn't have any.
            if (params_grad.size() != 0)
                solvers.top()(details, static_cast<const tensor&>(params_grad));
            gradient_input_is_stale = true;
        }

        const subnet_type& subnet() const { return input_layer; } 
        subnet_type& subnet() { return input_layer; } 

        const layer_details_type& layer_details() const { return details; } 
        layer_details_type& layer_details() { return details; } 

        void clean()
        {
            x_grad.clear();
            grad_final_ignored.clear();
            cached_output.clear();
            params_grad.clear();
            temp_tensor.clear();
            gradient_input_is_stale = true;
        }

        friend void serialize(const add_layer& item, std::ostream& out)
        {
            int version = 1;
            serialize(version, out);
            serialize(item.input_layer, out);
            serialize(item.details, out);
            serialize(item.this_layer_setup_called, out);
            serialize(item.gradient_input_is_stale, out);
            serialize(item.get_output_and_gradient_input_disabled, out);
            serialize(item.x_grad, out);
            serialize(item.cached_output, out);
        }

        friend void deserialize(add_layer& item, std::istream& in)
        {
            int version = 0;
            deserialize(version, in);
            if (version != 1)
                throw serialization_error("Unexpected version found while deserializing dlib::add_layer.");
            deserialize(item.input_layer, in);
            deserialize(item.details, in);
            deserialize(item.this_layer_setup_called, in);
            deserialize(item.gradient_input_is_stale, in);
            deserialize(item.get_output_and_gradient_input_disabled, in);
            deserialize(item.x_grad, in);
            deserialize(item.cached_output, in);
        }

    private:

        bool this_layer_requires_forward_output(
        ) 
        {
            subnet_wrapper wsub(grad_final_ignored, grad_final_ignored);
            return impl::backward_requires_forward_output(details, wsub);
        }

        class subnet_wrapper
        {
        public:
            subnet_wrapper(const tensor& x_, resizable_tensor& grad_final_ignored_) :
                x(x_), grad_final_ignored(grad_final_ignored_) {}

            subnet_wrapper(const subnet_wrapper&) = delete;
            subnet_wrapper& operator=(const subnet_wrapper&) = delete;

            const tensor& get_output() const { return x; }
            tensor& get_gradient_input() 
            { 
                // It doesn't matter what values are in this tensor but client code will
                // always assume it's the same dimension as the output so make sure that is
                // the case.  Note that we do set it to a non-crazy value though to avoid
                // it being full of NaN and slowing the processing down.
                if (!have_same_dimensions(x, grad_final_ignored))
                {
                    grad_final_ignored.copy_size(x);
                    grad_final_ignored = 0;  
                }
                return grad_final_ignored; 
            }

        private:
            const tensor& x;
            resizable_tensor& grad_final_ignored;
        };

        void swap(add_layer& item)
        {
            std::swap(input_layer, item.input_layer);
            std::swap(details, item.details);
            std::swap(this_layer_setup_called, item.this_layer_setup_called);
            std::swap(gradient_input_is_stale, item.gradient_input_is_stale);
            std::swap(get_output_and_gradient_input_disabled, item.get_output_and_gradient_input_disabled);
            std::swap(x_grad, item.x_grad); 
            std::swap(cached_output, item.cached_output); 
        }

        subnet_type input_layer;
        LAYER_DETAILS details;
        bool this_layer_setup_called;
        bool gradient_input_is_stale;
        bool get_output_and_gradient_input_disabled;
        resizable_tensor x_grad; 
        resizable_tensor cached_output; 

        // The following 3 objects don't logically contribute to the state of this class.
        // They are only here to prevent them from being reallocated over and over in
        // member functions.
        resizable_tensor params_grad; 
        resizable_tensor temp_tensor; 
        resizable_tensor grad_final_ignored;
    };

// ----------------------------------------------------------------------------------------

    template <unsigned long ID, typename SUBNET, typename enabled=void>
    class add_tag_layer;

    template <unsigned long ID, typename SUBNET>
    class add_tag_layer<ID,SUBNET,
            typename std::enable_if<is_nonloss_layer_type<SUBNET>::value>::type>
    {
    public:
        typedef SUBNET subnet_type;
        typedef typename subnet_type::input_type input_type;
        const static size_t num_layers = subnet_type::num_layers + 1;
        const static unsigned int sample_expansion_factor = subnet_type::sample_expansion_factor;
        static_assert(sample_expansion_factor >= 1,
            "The input layer can't produce fewer output tensors than there are inputs.");

        add_tag_layer() = default;
        add_tag_layer(const add_tag_layer&) = default;
        add_tag_layer(add_tag_layer&&) = default;
        add_tag_layer& operator=(add_tag_layer&&) = default;
        add_tag_layer& operator=(const add_tag_layer&) = default;

        template <typename T>
        add_tag_layer(
            const add_tag_layer<ID,T>& item
        ) : subnetwork(item.subnet())
        {}

        template <typename ...T>
        add_tag_layer(
            T ...args
        ) : 
            subnetwork(std::move(args)...) 
        {
        }

        template <typename input_iterator>
        void to_tensor (
            input_iterator ibegin,
            input_iterator iend,
            resizable_tensor& data
        ) const
        {
            subnetwork.to_tensor(ibegin,iend,data);
        }

        template <typename input_iterator>
        const tensor& operator() (
            input_iterator ibegin,
            input_iterator iend
        )
        {
            return subnetwork(ibegin,iend);
        }

        const tensor& operator() (const input_type& x)
        {
            return subnetwork(x);
        }

        const tensor& forward(const tensor& x)
        {
            return subnetwork.forward(x);
        }

        const tensor& get_output() const { return subnetwork.get_output(); }

        tensor& get_gradient_input() 
        { 
            return subnetwork.get_gradient_input();
        }

        template <typename solver_type>
        void update(const tensor& x, sstack<solver_type,num_layers>& solvers)
        {
            subnetwork.update(x,solvers.pop());
        }

        const subnet_type& subnet() const { return subnetwork; }
        subnet_type& subnet() { return subnetwork; }

        void clean()
        {
            subnetwork.clean();
        }

        friend void serialize(const add_tag_layer& item, std::ostream& out)
        {
            int version = 1;
            serialize(version, out);
            serialize(item.subnetwork, out);
        }

        friend void deserialize(add_tag_layer& item, std::istream& in)
        {
            int version = 0;
            deserialize(version, in);
            if (version != 1)
                throw serialization_error("Unexpected version found while deserializing dlib::add_tag_layer.");
            deserialize(item.subnetwork, in);
        }

    private:

        subnet_type subnetwork;
    };

// ----------------------------------------------------------------------------------------

// This version of add_tag_layer handles the special case where the subnetwork being given
// is just an input layer object.
    template <unsigned long ID, typename INPUT_LAYER, typename enabled>
    class add_tag_layer
    {
    public:
        typedef INPUT_LAYER subnet_type;
        typedef typename subnet_type::input_type input_type;
        const static size_t num_layers = 1;
        const static unsigned int sample_expansion_factor = subnet_type::sample_expansion_factor;
        static_assert(sample_expansion_factor >= 1,
            "The input layer can't produce fewer output tensors than there are inputs.");

        add_tag_layer() = default;
        add_tag_layer(const add_tag_layer&) = default;
        add_tag_layer& operator=(const add_tag_layer&) = default;
        add_tag_layer(add_tag_layer&& item) : add_tag_layer() { swap(item); }
        add_tag_layer& operator=(add_tag_layer&& item) { swap(item); return *this; }

        template <typename T, typename E>
        add_tag_layer(
            const add_tag_layer<ID,T,E>& item
        ) : input_layer(item.subnet())
        {}

        template <typename ...T>
        add_tag_layer(
            T ...args
        ) : 
            input_layer(std::move(args)...) 
        {
        }

        template <typename input_iterator>
        void to_tensor (
            input_iterator ibegin,
            input_iterator iend,
            resizable_tensor& data
        ) const
        {
            input_layer.to_tensor(ibegin,iend,data);
        }

        template <typename input_iterator>
        const tensor& operator() (
            input_iterator ibegin, 
            input_iterator iend
        )
        {
            input_layer.to_tensor(ibegin,iend,cached_output);
            return get_output();
        }

        const tensor& operator() (const input_type& x)
        {
            return (*this)(&x, &x+1);
        }

        const tensor& forward(const tensor& x)
        {
            cached_output = x;
            return get_output();
        }

        const tensor& get_output() const 
        { 
            return cached_output; 
        }

        tensor& get_gradient_input() 
        { 
            if (!have_same_dimensions(cached_output, grad_final_ignored))
            {
                grad_final_ignored.copy_size(get_output());
                grad_final_ignored = 0;
            }
            return grad_final_ignored; 
        }

        template <typename solver_type>
        void update(const tensor& /*x*/, sstack<solver_type,num_layers>& /*solvers*/)
        {
            // nothing to update
        }

        const subnet_type& subnet() const { return input_layer; }
        subnet_type& subnet() { return input_layer; }

        void clean()
        {
            grad_final_ignored.clear();
            cached_output.clear();
        }

        friend void serialize(const add_tag_layer& item, std::ostream& out)
        {
            int version = 1;
            serialize(version, out);
            serialize(item.input_layer, out);
            serialize(item.cached_output, out);
            serialize(item.grad_final_ignored, out);
        }

        friend void deserialize(add_tag_layer& item, std::istream& in)
        {
            int version = 0;
            deserialize(version, in);
            if (version != 1)
                throw serialization_error("Unexpected version found while deserializing dlib::add_tag_layer.");
            deserialize(item.input_layer, in);
            deserialize(item.cached_output, in);
            deserialize(item.grad_final_ignored, in);
        }

    private:

        void swap(add_tag_layer& item)
        {
            std::swap(input_layer, item.input_layer);
            std::swap(cached_output, item.cached_output);
            std::swap(grad_final_ignored, item.grad_final_ignored);
        }

        subnet_type input_layer;
        resizable_tensor cached_output;
        resizable_tensor grad_final_ignored;
    };

    template <unsigned long ID, typename U, typename E>
    struct is_nonloss_layer_type<add_tag_layer<ID,U,E>> : std::true_type {};
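    // Note (illustrative only): an add_tag_layer<ID,...> adds no computation of its own.
    // It simply marks a point in the network with the ID so that an add_skip_layer can
    // later refer back to that point.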


// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------

    template <typename LOSS_DETAILS, typename SUBNET>
    class add_loss_layer;

    class no_label_type
    {
    private:
        // We don't want anyone making these no_label_type objects.  They are here only to
        // allow add_loss_layer::label_type and dnn_trainer::label_type to exist which avoids
        // needing to overload add_loss_layer and dnn_trainer for supervised and unsupervised
        // losses.  It also can be a type to use in template metaprogramming to indicate
        // "no label".  So here we make the constructor private with the exception that
        // add_loss_layer objects can make it (again, just to simplify add_loss_layer's
        // implementation).
        no_label_type(){};
        template <typename LOSS_DETAILS, typename SUBNET> friend class add_loss_layer;
        template < typename net_type, typename solver_type > friend class dnn_trainer; 
    };

// ----------------------------------------------------------------------------------------

    template <typename LOSS_DETAILS, typename SUBNET>
    class add_loss_layer
    {
        template <typename T, typename enabled=void>
        struct get_loss_layer_label_type
        {
            typedef no_label_type type;
        };
        template <typename T>
        struct get_loss_layer_label_type<T,typename std::enable_if<sizeof(typename T::label_type)!=0>::type>
        {
            typedef typename T::label_type type;
        };

    public:
        typedef LOSS_DETAILS loss_details_type;
        typedef SUBNET subnet_type;
        typedef typename subnet_type::input_type input_type;
        // Note that the loss layer doesn't count as an additional layer.
        const static size_t num_layers = subnet_type::num_layers;
        const static unsigned int sample_expansion_factor = subnet_type::sample_expansion_factor;
        typedef typename get_loss_layer_label_type<LOSS_DETAILS>::type label_type;

        static_assert(is_nonloss_layer_type<SUBNET>::value, 
            "SUBNET must be of type add_layer, add_skip_layer, or add_tag_layer."); 
        static_assert(sample_expansion_factor == LOSS_DETAILS::sample_expansion_factor,
            "The loss layer and input layer must agree on the sample_expansion_factor.");


        add_loss_layer() {};
        add_loss_layer(const add_loss_layer&) = default;
        add_loss_layer& operator=(const add_loss_layer&) = default;
        add_loss_layer(add_loss_layer&& item) : add_loss_layer() { swap(item); }
        add_loss_layer& operator=(add_loss_layer&& item) { swap(item); return *this; }

        template <typename T, typename U>
        add_loss_layer(
            const add_loss_layer<T,U>& item
        ) : 
            loss(item.loss_details()),
            subnetwork(item.subnet())
        {}

        template <typename ...T>
        add_loss_layer(
            const LOSS_DETAILS& layer_det, 
            T&& ...args
        ) : 
            loss(layer_det), 
            subnetwork(std::forward<T>(args)...)
        {
        }

        template <typename ...T>
        add_loss_layer(
            LOSS_DETAILS&& layer_det, 
            T&& ...args
        ) : 
            loss(std::move(layer_det)), 
            subnetwork(std::forward<T>(args)...)
        {
        }

        template <typename ...T>
        add_loss_layer(
            T ...args
        ) : 
            subnetwork(std::move(args)...)
        {
        }

        template <typename input_iterator>
        void to_tensor (
            input_iterator ibegin,
            input_iterator iend,
            resizable_tensor& data
        ) const
        {
            subnetwork.to_tensor(ibegin,iend,data);
        }

        template <typename output_iterator>
        void operator() (
            const tensor& x, 
            output_iterator obegin
        )
        {
            subnetwork.forward(x);
            const dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
            loss.to_label(x, wsub, obegin);
        }

        template <typename input_iterator, typename output_iterator>
        void operator() (
            input_iterator ibegin,
            input_iterator iend,
            output_iterator obegin
        )
        {
            to_tensor(ibegin,iend,temp_tensor);
            (*this)(temp_tensor, obegin);
        }

        const label_type& operator() (const input_type& x)
        {
            (*this)(&x, &x+1, &temp_label);
            return temp_label;
        }

        template <typename label_iterator>
        double compute_loss (
            const tensor& x,
            label_iterator lbegin 
        )
        {
            subnetwork.forward(x);
            dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
            return loss.compute_loss(x, lbegin, wsub);
        }

        template <typename input_iterator, typename label_iterator>
        double compute_loss (
            input_iterator ibegin,
            input_iterator iend,
            label_iterator lbegin 
        )
        {
            to_tensor(ibegin,iend,temp_tensor);
            return compute_loss(temp_tensor, lbegin);
        }

        double compute_loss (
            const tensor& x
        )
        {
            subnetwork.forward(x);
            dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
            return loss.compute_loss(x, wsub);
        }

        template <typename input_iterator>
        double compute_loss (
            input_iterator ibegin,
            input_iterator iend
        )
        {
            to_tensor(ibegin,iend,temp_tensor);
            return compute_loss(temp_tensor);
        }

        template <typename label_iterator, typename solver_type>
        double update (
            const tensor& x,
            label_iterator lbegin,
            sstack<solver_type,num_layers>& solvers
        )
        {
            subnetwork.forward(x);
            dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
            double l = loss.compute_loss(x, lbegin, wsub);
            subnetwork.update(x, solvers);
            return l;
        }

        template <typename input_iterator, typename label_iterator, typename solver_type>
        double update (
            input_iterator ibegin,
            input_iterator iend,
            label_iterator lbegin,
            sstack<solver_type,num_layers>& solvers
        )
        {
            to_tensor(ibegin,iend,temp_tensor);
            return update(temp_tensor, lbegin, solvers);
        }

        template <typename solver_type>
        double update (
            const tensor& x,
            sstack<solver_type,num_layers>& solvers
        )
        {
            subnetwork.forward(x);
            dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
            double l = loss.compute_loss(x, wsub);
            subnetwork.update(x, solvers);
            return l;
        }

        template <typename input_iterator, typename solver_type>
        double update (
            input_iterator ibegin,
            input_iterator iend,
            sstack<solver_type,num_layers>& solvers
        )
        {
            to_tensor(ibegin,iend,temp_tensor);
            return update(temp_tensor, solvers);
        }

        const subnet_type& subnet() const { return subnetwork; }
        subnet_type& subnet() { return subnetwork; }
        const loss_details_type& loss_details() const { return loss; }
        loss_details_type& loss_details() { return loss; }

        void clean (
        )
        {
            temp_tensor.clear();
            subnetwork.clean();
        }

        friend void serialize(const add_loss_layer& item, std::ostream& out)
        {
            int version = 1;
            serialize(version, out);
            serialize(item.loss, out);
            serialize(item.subnetwork, out);
        }

        friend void deserialize(add_loss_layer& item, std::istream& in)
        {
            int version = 0;
            deserialize(version, in);
            if (version != 1)
                throw serialization_error("Unexpected version found while deserializing dlib::add_loss_layer.");
            deserialize(item.loss, in);
            deserialize(item.subnetwork, in);
        }

    private:

        void swap(add_loss_layer& item)
        {
            std::swap(loss, item.loss);
            std::swap(subnetwork, item.subnetwork);
        }

        loss_details_type loss;
        subnet_type subnetwork;

        // These two objects don't logically contribute to the state of this object.  They
        // are here to prevent them from being reallocated over and over.
        label_type temp_label;
        resizable_tensor temp_tensor;
    };


    template <typename T, typename U>
    struct is_loss_layer_type<add_loss_layer<T,U>> : std::true_type {};
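
    // For illustration only (a sketch using hypothetical names, none of which are
    // defined in this header): a full network type is a stack of computational
    // layers around an input layer, wrapped in add_loss_layer, e.g.
    //
    //     using net_type = add_loss_layer<my_loss, my_layer<my_input>>;
    //     net_type net;
    //     my_sample x;
    //     auto label = net(x);   // runs forward() then my_loss::to_label(), returning
    //                            // net_type::label_type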

// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------

    namespace impl
    {
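        // layer_helper<i,T> walks i levels down a network by repeatedly calling
        // subnet(), while layer_helper_match walks down until it finds a layer whose
        // type matches the given Match template (and can then descend i further
        // levels).  They implement the layer() accessor functions defined after this
        // namespace.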
        template <unsigned int i, typename T>
        struct layer_helper
        {
            static T& makeT();
            using next_type = typename std::remove_reference<decltype(makeT().subnet())>::type;
            using type = typename layer_helper<i-1,next_type>::type;
            static type& layer(T& n)
            {
                return layer_helper<i-1,next_type>::layer(n.subnet());
            }
        };
        template <typename T>
        struct layer_helper<0,T>
        {
            using type = T;
            static type& layer(T& n)
            {
                return n;
            }
        };

        template <template<typename> class Match, typename T, unsigned int i, typename enabled = void>
        struct layer_helper_match
        {
            static T& makeT();
            using next_type = typename std::remove_reference<decltype(makeT().subnet())>::type;
            using type = typename layer_helper_match<Match,next_type,i>::type;
            static type& layer(T& n)
            {
                return layer_helper_match<Match,next_type,i>::layer(n.subnet());
            }
        };
        // This overload catches add_layer and add_loss_layer templates.
        template <template<typename> class Match, typename T, unsigned int i>
        struct layer_helper_match<Match,T,i,
            typename std::enable_if<std::is_same<const T,const  Match<typename T::subnet_type>>::value>::type>
        {
            using type = typename layer_helper<i,T>::type;
            static type& layer(T& n)
            {
                return layer_helper<i,T>::layer(n);
            }
        };
        // This overload catches input templates.
        template <template<typename> class Match, typename T, unsigned int i>
        struct layer_helper_match<Match,T,i,
            typename std::enable_if<std::is_same<const T,const  Match<typename T::input_type>>::value>::type>
        {
            using type = typename layer_helper<i,T>::type;
            static type& layer(T& n)
            {
                return layer_helper<i,T>::layer(n);
            }
        };
        // This overload catches subnet_wrapper templates.
        template <template<typename> class Match, typename T, unsigned int i>
        struct layer_helper_match<Match,T,i,
            typename std::enable_if<std::is_same<const typename T::wrapped_type, 
                                                 const Match<typename T::wrapped_type::subnet_type>>::value>::type>
        {
            using type = typename layer_helper<i,T>::type;
            static type& layer(T& n)
            {
                return layer_helper<i,T>::layer(n);
            }
        };
    }

    template <unsigned int i, typename T>
    typename impl::layer_helper<i,T>::type& layer (T& n) 
    {
        return impl::layer_helper<i,T>::layer(n);
    }

    template <template<typename> class Match, typename T>
    typename impl::layer_helper_match<Match,T,0>::type& layer (T& n) 
    {
        return impl::layer_helper_match<Match,T,0>::layer(n);
    }

    template <template<typename> class Match, unsigned int i, typename T>
    typename impl::layer_helper_match<Match,T,i>::type& layer (T& n) 
    {
        return impl::layer_helper_match<Match,T,i>::layer(n);
    }
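
    // For illustration only (a usage sketch, assuming net is an object of some
    // network type built from the templates in this file):
    //
    //     auto& l3     = layer<3>(net);       // the layer 3 levels below the top
    //     auto& tagged = layer<tag1>(net);    // the first tag1 layer found
    //     auto& below  = layer<tag1,2>(net);  // the layer 2 levels below that tag1 layer
    //
    // Each call returns a reference to the layer object itself, so e.g. its output
    // tensor can be inspected via get_output() where that member exists.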

// ----------------------------------------------------------------------------------------

    template <template<typename> class TAG_TYPE, typename SUBNET>
    class add_skip_layer
    {
    public:
        typedef SUBNET subnet_type;
        typedef typename subnet_type::input_type input_type;
        const static size_t num_layers = subnet_type::num_layers + 1;
        const static unsigned int sample_expansion_factor = subnet_type::sample_expansion_factor;
        static_assert(sample_expansion_factor >= 1,
            "The input layer can't produce fewer output tensors than there are inputs.");

        add_skip_layer() = default;
        add_skip_layer(const add_skip_layer&) = default;
        add_skip_layer(add_skip_layer&&) = default;
        add_skip_layer& operator=(add_skip_layer&&) = default;
        add_skip_layer& operator=(const add_skip_layer&) = default;

        template <typename T>
        add_skip_layer(
            const add_skip_layer<TAG_TYPE,T>& item
        ) : subnetwork(item.subnet())
        {}

        template <typename ...T>
        add_skip_layer(
            T ...args
        ) : 
            subnetwork(std::move(args)...) 
        {
        }

        template <typename input_iterator>
        void to_tensor (
            input_iterator ibegin,
            input_iterator iend,
            resizable_tensor& data
        ) const
        {
            subnetwork.to_tensor(ibegin,iend,data);
        }

        template <typename input_iterator>
        const tensor& operator() (
            input_iterator ibegin,
            input_iterator iend
        )
        {
            subnetwork(ibegin,iend);
            return layer<TAG_TYPE>(subnetwork).get_output();
        }

        const tensor& operator() (const input_type& x)
        {
            subnetwork(x);
            return layer<TAG_TYPE>(subnetwork).get_output();
        }

        const tensor& forward(const tensor& x)
        {
            subnetwork.forward(x);
            return layer<TAG_TYPE>(subnetwork).get_output();
        }

        const tensor& get_output() const 
        { 
            return layer<TAG_TYPE>(subnetwork).get_output();
        }

        tensor& get_gradient_input() 
        { 
            return layer<TAG_TYPE>(subnetwork).get_gradient_input();
        }

        template <typename solver_type>
        void update(const tensor& x, sstack<solver_type,num_layers>& solvers)
        {
            subnetwork.update(x,solvers.pop());
        }

        const subnet_type& subnet() const 
        { 
            return subnetwork; 
        }

        subnet_type& subnet() 
        { 
            return subnetwork; 
        }

        void clean()
        {
            subnetwork.clean();
        }

        friend void serialize(const add_skip_layer& item, std::ostream& out)
        {
            int version = 1;
            serialize(version, out);
            serialize(item.subnetwork, out);
        }

        friend void deserialize(add_skip_layer& item, std::istream& in)
        {
            int version = 0;
            deserialize(version, in);
            if (version != 1)
                throw serialization_error("Unexpected version found while deserializing dlib::add_skip_layer.");
            deserialize(item.subnetwork, in);
        }

    private:

        subnet_type subnetwork;
    };
    template <template<typename> class T, typename U>
    struct is_nonloss_layer_type<add_skip_layer<T,U>> : std::true_type {};

    template <typename SUBNET> using tag1  = add_tag_layer< 1, SUBNET>;
    template <typename SUBNET> using tag2  = add_tag_layer< 2, SUBNET>;
    template <typename SUBNET> using tag3  = add_tag_layer< 3, SUBNET>;
    template <typename SUBNET> using tag4  = add_tag_layer< 4, SUBNET>;
    template <typename SUBNET> using tag5  = add_tag_layer< 5, SUBNET>;
    template <typename SUBNET> using tag6  = add_tag_layer< 6, SUBNET>;
    template <typename SUBNET> using tag7  = add_tag_layer< 7, SUBNET>;
    template <typename SUBNET> using tag8  = add_tag_layer< 8, SUBNET>;
    template <typename SUBNET> using tag9  = add_tag_layer< 9, SUBNET>;
    template <typename SUBNET> using tag10 = add_tag_layer<10, SUBNET>;

    template <typename SUBNET> using skip1  = add_skip_layer< tag1, SUBNET>;
    template <typename SUBNET> using skip2  = add_skip_layer< tag2, SUBNET>;
    template <typename SUBNET> using skip3  = add_skip_layer< tag3, SUBNET>;
    template <typename SUBNET> using skip4  = add_skip_layer< tag4, SUBNET>;
    template <typename SUBNET> using skip5  = add_skip_layer< tag5, SUBNET>;
    template <typename SUBNET> using skip6  = add_skip_layer< tag6, SUBNET>;
    template <typename SUBNET> using skip7  = add_skip_layer< tag7, SUBNET>;
    template <typename SUBNET> using skip8  = add_skip_layer< tag8, SUBNET>;
    template <typename SUBNET> using skip9  = add_skip_layer< tag9, SUBNET>;
    template <typename SUBNET> using skip10 = add_skip_layer<tag10, SUBNET>;
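
    // For illustration only (a sketch with a hypothetical computational layer
    // template my_layer): tag layers mark a spot in a network and skip layers route
    // their output from the nearest matching tag rather than from the layer directly
    // beneath them, which is how branching architectures are written as one nested
    // type, e.g.
    //
    //     using branched = my_layer<skip1<my_layer<tag1<my_layer<my_input>>>>>;
    //
    // Here the outermost my_layer sees the output of the tag1-marked layer because
    // skip1's get_output() forwards to layer<tag1>(subnet()).get_output().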

// ----------------------------------------------------------------------------------------

    namespace timpl
    {
        inline void fill_with_gassuan_random_numbers (
            tensor& t,
            dlib::rand& rnd,
            double sigma = 1
        )
        {
            float* data = t.host();
            for (size_t i = 0; i < t.size(); ++i)
                data[i] = rnd.get_random_gaussian()*sigma;
        }

        class test_layer_subnet 
        {
        public:
            test_layer_subnet (
                dlib::rand& rnd_
            ) : rnd(rnd_) 
            {
                // Output and gradient_input have to have the same dimensions in each
                // layer.
                const long num_samples = rnd.get_random_32bit_number()%4+3;
                const long k  = rnd.get_random_32bit_number()%4+2;
                const long nr = rnd.get_random_32bit_number()%4+2;
                const long nc = rnd.get_random_32bit_number()%4+2;

                output.set_size(num_samples, k, nr, nc);
                gradient_input.set_size(num_samples, k, nr, nc);

                // Use a non-zero initial gradient to make sure the layers add to it
                // rather than assign and blow away the initial value.
                fill_with_gassuan_random_numbers(gradient_input, rnd, 0.01);

                fill_with_gassuan_random_numbers(output, rnd);
            }


            tensor& get_mutable_output() { return output; }
            const tensor& get_output() const { return output; }
            const tensor& private_get_output() const { return get_output(); }
            const test_layer_subnet& subnet() const { init_sub(); return *subnetwork; }

            tensor& get_gradient_input() { return gradient_input; }
            tensor& private_get_gradient_input() { return get_gradient_input(); }
            test_layer_subnet& subnet() { init_sub(); return *subnetwork; }



            unsigned long count_outputs() const
            {
                if (subnetwork)
                    return subnetwork->count_outputs() + output.size();
                else
                    return output.size();
            }

            float& get_output_element(unsigned long i)
            {
                if (i < output.size())
                    return output.host()[i];
                else
                    return subnet().get_output_element(i-output.size());
            }

            float get_gradient_input_element(unsigned long i) const
            {
                if (i < gradient_input.size())
                    return gradient_input.host()[i];
                else
                    return subnet().get_gradient_input_element(i-gradient_input.size());
            }


        private:
            // We lazily initialize sub-layers as needed when someone tries to call
            // subnet()
            void init_sub() const
            {
                if (!subnetwork)
                    subnetwork.reset(new test_layer_subnet(rnd));
            }

            dlib::rand& rnd;
            mutable std::unique_ptr<test_layer_subnet> subnetwork;
            resizable_tensor output;
            resizable_tensor gradient_input;
        };

    }

    struct layer_test_results
    {
        layer_test_results() : was_good(true) {}
        explicit layer_test_results(const std::string& l) : log(l),was_good(false) {}

        std::string log;
        bool was_good;

        operator bool() const { return was_good; }
    };

    inline std::ostream& operator<< (std::ostream& out, const layer_test_results& item)
    {
        out << item.log;
        return out;
    }

    template <
        typename layer_details_type
        >
    layer_test_results test_layer (
        layer_details_type l
    )
    {
        const float base_eps = 0.01;
        using namespace timpl;
        // Do some setup
        running_stats<double> rs_data, rs_params;
        dlib::rand rnd;
        std::ostringstream sout;
        for (int iter = 0; iter < 10; ++iter)
        {
            test_layer_subnet subnetwork(rnd);
            resizable_tensor output, out2, out3;
            // Run setup() and forward() as well to make sure any calls to subnet() have
            // happened before we start assuming we know how many data elements there are
            // (since we do a lazy layer creation thing based on calls to subnet() inside
            // test_layer_subnet).
            l.setup(subnetwork);
            impl::call_layer_forward(l, subnetwork, output);

            resizable_tensor input_grad;
            input_grad.copy_size(output);
            fill_with_gassuan_random_numbers(input_grad, rnd);


            // The f() we are computing gradients of is this thing.  Its value at the current
            // parameter and data values is:
            //sout << "f(data,params): " << dot(output, input_grad) << std::endl;

            // We are going to save a copy of the subnetwork.get_gradient_input() data before we do
            // backpropagation since the backward() function is supposed to *add* to the
            // gradients rather than overwrite them.  We will use this saved data to check if
            // that is the case.
            const unsigned long num_data_inputs = subnetwork.count_outputs();
            std::vector<float> initial_gradient_input(num_data_inputs);
            for (unsigned long i = 0; i < num_data_inputs; ++i)
                initial_gradient_input[i] = subnetwork.get_gradient_input_element(i);


            // Now tell the layer to compute all the gradients.  In the rest of this function
            // we will just be checking that these gradients were computed correctly by
            // comparing them to a central differences approximation.
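            // (A central difference approximates df/dx as (f(x+eps) - f(x-eps))/(2*eps);
            // here f is dot(layer_output, input_grad) viewed as a function of one
            // parameter or one data element at a time.)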
            resizable_tensor params_grad;
            params_grad.copy_size(l.get_layer_params());
            // But first, set the params grad to something crazy so that it's very obvious if
            // it doesn't get fully assigned.
            params_grad = std::numeric_limits<float>::infinity();
            impl::call_layer_backward(l, output, input_grad, subnetwork, params_grad);

            static_assert(impl::is_inplace_layer(l, subnetwork) == impl::has_inplace_backward(l, subnetwork),
                "Layer not defined correctly.  forward and backward methods must either both be in-place or both out-of-place. ");

            // Make sure the outputs of forward() and backward() are the same when they are run
            // in in-place mode.
            if (impl::is_inplace_layer(l, subnetwork))
            {
                test_layer_subnet subnetwork2(rnd);
                layer_details_type ll(l);
                ll.setup(subnetwork2);
                resizable_tensor ip_out;
                impl::call_layer_forward(ll, subnetwork2, ip_out);
                impl::call_layer_forward(ll, subnetwork2, subnetwork2.get_mutable_output());
                const auto forward_error = max(abs(mat(ip_out) - mat(subnetwork2.get_output())));
                if (forward_error > 0.00001)
                {
                    using namespace std;
                    sout << "This layer is supposed to support in-place computations but the output of forward_inplace()\n";
                    sout << "changes when invoked in-place vs. out-of-place. The error was: " << forward_error << endl;
                    return layer_test_results(sout.str()); 
                }

                resizable_tensor params_grad;
                params_grad.copy_size(ll.get_layer_params());
                params_grad = std::numeric_limits<float>::infinity();

                resizable_tensor input_grad;
                input_grad.copy_size(ip_out);
                fill_with_gassuan_random_numbers(input_grad, rnd);
                resizable_tensor params_grad1, params_grad2, data_grad1, data_grad2;
                params_grad1 = params_grad;
                params_grad2 = params_grad;
                // Now call backward() and make sure it works as well.
                subnetwork2.get_gradient_input() = 9999;
                impl::call_layer_backward(ll, ip_out, input_grad, subnetwork2, params_grad1);
                data_grad1 = subnetwork2.get_gradient_input();

                subnetwork2.get_gradient_input() = mat(input_grad);
                impl::call_layer_backward(ll, ip_out, subnetwork2.get_gradient_input(), subnetwork2, params_grad2);
                data_grad2 = subnetwork2.get_gradient_input();
                if (params_grad.size() != 0)
                {
                    const auto backward_param_error = max(abs(mat(params_grad1) - mat(params_grad2)));
                    if (backward_param_error > 0.00001)
                    {
                        using namespace std;
                        sout << "This layer is supposed to support in-place computations but the output of backward_inplace()\n";
                        sout << "changes when invoked in-place vs. out-of-place. The error was: " << backward_param_error << endl;
                        return layer_test_results(sout.str()); 
                    }
                }
                const auto backward_data_error = max(abs(mat(data_grad1) - mat(data_grad2)));
                if (backward_data_error > 0.00001)
                {
                    using namespace std;
                    sout << "This layer is supposed to support in-place computations but the output of backward_inplace()\n";
                    sout << "changes when invoked in-place vs. out-of-place. The error was: " << backward_data_error << endl;
                    return layer_test_results(sout.str()); 
                }
            }

            // ==================================================================
            // first validate the way the parameter gradients are computed
            for (unsigned long i = 0; i < params_grad.size(); ++i)
            {
                layer_details_type l1(l);

                float eps = l1.get_layer_params().host()[i]*base_eps;
                if (eps == 0)
                    eps = base_eps;
                const float oldval = l1.get_layer_params().host()[i];
                l1.get_layer_params().host()[i] = oldval+eps;
                impl::call_layer_forward(l1, subnetwork, out2);
                l1.get_layer_params().host()[i] = oldval-eps;
                impl::call_layer_forward(l1, subnetwork, out3);
                l1.get_layer_params().host()[i] = oldval;

                // Compute a reference derivative via a central differences approximation and
                // compare it to the one output by the layer and make sure they match.
                double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps);
                double output_derivative = params_grad.host()[i];
                double relative_error = (reference_derivative - output_derivative)/(reference_derivative + 1e-100);
                double absolute_error = (reference_derivative - output_derivative);
                rs_params.add(std::abs(relative_error));
                if (std::abs(relative_error) > 0.05 && std::abs(absolute_error) > 0.005)
                {
                    using namespace std;
                    sout << "Gradient error in parameter #" << i <<".  Relative error: "<< relative_error << endl;
                    sout << "expected derivative: " << reference_derivative << endl;
                    sout << "output derivative:   " << output_derivative << endl;
                    return layer_test_results(sout.str()); 
                }
            }

            // ==================================================================
            // now validate the data gradients
            for (unsigned long i = 0; i < num_data_inputs; ++i)
            {
                const float oldval = subnetwork.get_output_element(i);
                float eps = oldval*base_eps;
                if (eps == 0)
                    eps = base_eps;
                subnetwork.get_output_element(i) = oldval+eps;
                impl::call_layer_forward(l, subnetwork, out2);
                subnetwork.get_output_element(i) = oldval-eps;
                impl::call_layer_forward(l, subnetwork, out3);
                subnetwork.get_output_element(i) = oldval;

                // Compute a reference derivative via a central differences approximation and
                // compare it to the one output by the layer and make sure they match.
                double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps);
                double output_derivative = subnetwork.get_gradient_input_element(i);
                if (!impl::is_inplace_layer(l,subnetwork))
                    output_derivative -= initial_gradient_input[i];
                double relative_error = (reference_derivative - output_derivative)/(reference_derivative + 1e-100);
                double absolute_error = (reference_derivative - output_derivative);
                rs_data.add(std::abs(relative_error));
                if (std::abs(relative_error) > 0.05 && std::abs(absolute_error) > 0.005)
                {
                    using namespace std;
                    sout << "Gradient error in data variable #" << i <<".  Relative error: "<< relative_error << endl;
                    sout << "expected derivative: " << reference_derivative << endl;
                    sout << "output derivative:   " << output_derivative << endl;
                    return layer_test_results(sout.str()); 
                }
            }

        } // end for (int iter = 0; iter < 10; ++iter)

        if (rs_params.mean() > 0.003)
        {
            using namespace std;
            sout << "Average parameter gradient error is somewhat large at: "<< rs_params.mean() << endl;
            return layer_test_results(sout.str()); 
        }
        if (rs_data.mean() > 0.003)
        {
            using namespace std;
            sout << "Average data gradient error is somewhat large at: "<< rs_data.mean() << endl;
            return layer_test_results(sout.str()); 
        }

        return layer_test_results();
    }
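
    // For illustration only (a usage sketch): test_layer() numerically checks a
    // layer's gradients against central differences.  Assuming my_layer_details is
    // some user defined layer details type (hypothetical), a unit test might do:
    //
    //     my_layer_details l;
    //     auto res = test_layer(l);
    //     if (!res)
    //         std::cout << "layer test failed:\n" << res.log << std::endl;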

// ----------------------------------------------------------------------------------------

}

#endif // DLIB_DNn_CORE_H_