// Copyright (C) 2015  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_DNn_CORE_H_
#define DLIB_DNn_CORE_H_

#include "core_abstract.h"
#include "tensor.h"
#include <iterator>
#include <memory>
#include <sstream>
#include <type_traits>
#include "../statistics.h"
#include "../rand.h"
#include "../algs.h"
#include <utility>
#include <tuple>


namespace dlib
{

// ----------------------------------------------------------------------------------------

    // Tell us if T is one of the special layer types (i.e. add_layer, add_tag_layer, or
    // add_skip_layer).
    template <typename T> struct is_nonloss_layer_type : std::false_type {};
    // Tell us if T is an instance of add_loss_layer.
    template <typename T> struct is_loss_layer_type : std::false_type {};
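
    // For example, later in this file add_layer and add_tag_layer specialize
    // is_nonloss_layer_type to inherit from std::true_type.  As an illustrative sketch
    // (not part of dlib's API), client code can therefore write compile time checks such
    // as:
    //   static_assert(!is_nonloss_layer_type<int>::value, "int is not a layer stack");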

    namespace impl
    {
        template <size_t... n>
        struct ct_integers_list {
            template <size_t m>
            struct push_back
            {
                typedef ct_integers_list<n..., m> type;
            };
        };

        template <size_t max>
        struct ct_make_integer_range
        {
            // Recursively call push_back on ct_integers_list to build the range of
            // integers from 1 to max inclusive (ct_make_integer_range<0> is the empty
            // list).
            typedef typename ct_make_integer_range<max-1>::type::template push_back<max>::type type;
        };

        template <>
        struct ct_make_integer_range<0>
        {
            typedef ct_integers_list<> type;
        };

        template <size_t... indices, typename Tuple>
        auto tuple_subset(
            const Tuple& item, 
            ct_integers_list<indices...>
        ) -> decltype(std::make_tuple(std::get<indices>(item)...))
        {
            return std::make_tuple(std::get<indices>(item)...);
        }

        template <typename T> struct alwaysbool { typedef bool type; };

        resizable_tensor& rt();

        // The significance of a layer's backward method requiring forward's outputs is
        // that such a layer can't have an in-place layer stacked on top of it, because
        // in-place layers overwrite the output of the layer they sit on top of.
        template <typename layer_type, typename SUBNET>
        constexpr auto backward_requires_forward_output(
            layer_type& layer,
            SUBNET& sub
        ) -> typename alwaysbool<decltype(layer.backward(rt(),rt(),sub,rt()))>::type
        {
            return true;
        }

        template <typename layer_type, typename SUBNET>
        constexpr auto backward_requires_forward_output(
            layer_type& layer,
            SUBNET& sub
        ) -> typename alwaysbool<decltype(layer.backward(rt(),sub,rt()))>::type
        {
            return false;
        }

        template <typename layer_type, typename SUBNET>
        constexpr auto backward_requires_forward_output(
            layer_type& layer,
            SUBNET& sub
        ) -> typename alwaysbool<decltype(layer.backward_inplace(rt(),rt(),sub.get_gradient_input(),rt()))>::type
        {
            return true;
        }

        template <typename layer_type, typename SUBNET>
        constexpr auto has_inplace_backward(
            layer_type& layer,
            SUBNET& sub
        ) -> typename alwaysbool<decltype(layer.backward(rt(),rt(),sub,rt()))>::type
        {
            return false;
        }

        template <typename layer_type, typename SUBNET>
        constexpr auto has_inplace_backward(
            layer_type& layer,
            SUBNET& sub
        ) -> typename alwaysbool<decltype(layer.backward(rt(),sub,rt()))>::type
        {
            return false;
        }

        template <typename layer_type, typename SUBNET>
        constexpr auto has_inplace_backward(
            layer_type& layer,
            SUBNET& sub
        ) -> typename alwaysbool<decltype(layer.backward_inplace(rt(),rt(),sub.get_gradient_input(),rt()))>::type
        {
            return true;
        }

        template <typename layer_type, typename SUBNET>
        constexpr auto is_inplace_layer(
            layer_type& layer,
            const SUBNET& sub 
        ) -> typename alwaysbool<decltype(layer.forward(sub,rt()))>::type
        {
            return false;
        }

        template <typename layer_type, typename SUBNET>
        constexpr auto is_inplace_layer(
            layer_type& layer,
            const SUBNET& sub
        ) -> typename alwaysbool<decltype(layer.forward_inplace(sub.get_output(),rt()))>::type
        {
            return true;
        }
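
        // To summarize the detection logic above (an explanatory note, not dlib API): a
        // layer that defines the four argument
        //   backward(computed_output, gradient_input, sub, params_grad)
        // makes backward_requires_forward_output() return true, a layer that only defines
        // the three argument backward(gradient_input, sub, params_grad) makes it return
        // false, and a layer that defines backward_inplace() makes both
        // backward_requires_forward_output() and has_inplace_backward() return true.
        // Likewise, defining forward_inplace() rather than forward() is what makes
        // is_inplace_layer() return true.  Overload resolution on the decltype()
        // expressions in the trailing return types selects which overload is viable for a
        // given layer type.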

        template <typename layer_type, typename SUBNET>
        auto call_layer_backward(
            layer_type& layer,
            const tensor& computed_output, 
            const tensor& gradient_input, 
            SUBNET& sub, 
            tensor& params_grad
        ) -> decltype(layer.backward(computed_output,gradient_input,sub,params_grad))
        {
            layer.backward(computed_output,gradient_input,sub,params_grad);
        }

        template <typename layer_type, typename SUBNET>
        auto call_layer_backward(
            layer_type& layer,
            const tensor& , 
            const tensor& gradient_input, 
            SUBNET& sub, 
            tensor& params_grad
        ) -> decltype(layer.backward(gradient_input,sub,params_grad))
        {
            layer.backward(gradient_input,sub,params_grad);
        }

        template <typename layer_type, typename SUBNET>
        auto call_layer_backward(
            layer_type& layer,
            const tensor& computed_output, 
            const tensor& gradient_input, 
            SUBNET& sub, 
            tensor& params_grad
        ) -> decltype(layer.backward_inplace(computed_output,gradient_input,sub.get_gradient_input(),params_grad))
        {
            layer.backward_inplace(computed_output,gradient_input,sub.get_gradient_input(),params_grad);
        }


        template <typename layer_type, typename SUBNET>
        auto call_layer_forward(
            layer_type& layer,
            const SUBNET& sub, 
            tensor& data_output
        ) -> decltype(layer.forward(sub,rt()))
        {
            // This overload of call_layer_forward() is here because this template
            // naturally gets instantiated but only on code paths that never get executed.
            // So rather than writing a bunch of hard to read template magic around call
            // sites we just have this overload that doesn't do anything (and an assert to
            // make sure that's the case).
            DLIB_CASSERT(false, "This should never happen");
        }

        template <typename layer_type, typename SUBNET>
        auto call_layer_forward(
            layer_type& layer,
            const SUBNET& sub, 
            resizable_tensor& data_output
        ) -> decltype(layer.forward(sub,data_output))
        {
            layer.forward(sub,data_output);
        }

        template <typename layer_type, typename SUBNET>
        auto call_layer_forward(
            layer_type& layer,
            const SUBNET& sub, 
            tensor& data_output
        ) -> decltype(layer.forward_inplace(sub.get_output(),data_output))
        {
            layer.forward_inplace(sub.get_output(),data_output);
        }

        template <typename layer_type, typename SUBNET>
        auto call_layer_forward(
            layer_type& layer,
            const SUBNET& sub, 
            resizable_tensor& data_output
        ) -> decltype(layer.forward_inplace(sub.get_output(),data_output))
        {
            if (!have_same_dimensions(data_output, sub.get_output()))
                data_output.copy_size(sub.get_output());
            layer.forward_inplace(sub.get_output(),data_output);
        }


    } // end namespace impl

    template <typename Head, typename... Tail>
    std::tuple<Tail...> tuple_tail(
        const std::tuple<Head, Tail...>& item
    )
    {
        return impl::tuple_subset(item, typename impl::ct_make_integer_range<sizeof...(Tail)>::type());
    }
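
    // As a usage sketch (illustrative only), tuple_tail() returns a copy of a tuple with
    // its first element removed:
    //
    //   auto t    = std::make_tuple(1, 2.5, 'c');
    //   auto tail = tuple_tail(t);   // tail is std::tuple<double,char>(2.5, 'c')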

// ----------------------------------------------------------------------------------------

    inline void randomize_parameters (
        tensor& params,
        unsigned long num_inputs_and_outputs,
        dlib::rand& rnd
    )
    {
        float* data = params.host();
        for (size_t i = 0; i < params.size(); ++i)
        {
            // Draw a random number to initialize the layer according to formula (16)
            // from Understanding the difficulty of training deep feedforward neural
            // networks by Xavier Glorot and Yoshua Bengio.
            float val = 2*rnd.get_random_float()-1;
            val *= std::sqrt(6.0/(num_inputs_and_outputs));

            data[i] = val;
        }
    }
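
    // Concretely, formula (16) is the uniform Xavier/Glorot initialization
    //   W ~ U[ -sqrt(6/(fan_in+fan_out)), +sqrt(6/(fan_in+fan_out)) ]
    // As a usage sketch (illustrative values only), a fully connected layer with 100
    // inputs and 50 outputs would call randomize_parameters(params, 100+50, rnd), which
    // draws every parameter uniformly from [-0.2, 0.2] since sqrt(6/150) == 0.2.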

// ----------------------------------------------------------------------------------------

    template <typename T, size_t N>
    class sstack
    {
    public:
        static_assert(N > 0, "You can't create an empty sstack.");
        typedef T value_type;
        const static size_t num_elements = N;

        sstack() {}
        sstack(const T& item_) : item(item_), data(item_) {}

        const T& top() const { return item; }
        T& top() { return item; }

        size_t size() const { return N; }

        const sstack<T,N-1>& pop() const { return data; }
        sstack<T,N-1>& pop() { return data; }

        friend void serialize(const sstack& item, std::ostream& out)
        {
            serialize(item.top(), out);
            serialize(item.pop(), out);
        }

        friend void deserialize(sstack& item, std::istream& in)
        {
            deserialize(item.top(), in);
            deserialize(item.pop(), in);
        }

    private:
        T item;
        sstack<T,N-1> data;
    };

    template <typename T>
    class sstack<T,1> // base case of recursive definition.
    {
    public:
        sstack() {}
        explicit sstack(const T& item_) : item(item_) {}

        const T& top() const { return item; }
        T& top() { return item; }

        size_t size() const { return 1; }

        friend void serialize(const sstack& item, std::ostream& out)
        {
            serialize(item.top(), out);
        }

        friend void deserialize(sstack& item, std::istream& in)
        {
            deserialize(item.top(), in);
        }

    private:
        T item;
    };
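
    // As a usage sketch (illustrative only), an sstack<int,3> behaves like a fixed size
    // stack of three elements addressed recursively:
    //
    //   sstack<int,3> s(42);      // all three elements initialized to 42
    //   s.top() = 1;              // the first element
    //   s.pop().top() = 2;        // the second element
    //   s.pop().pop().top() = 3;  // the third (and last) element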

// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------

    namespace dimpl
    {
        template <typename T, bool is_first = true, typename enabled=void>
        class subnet_wrapper
        {
            /*!
                WHAT THIS OBJECT REPRESENTS
                    This is a tool that makes an add_layer or add_loss_layer object
                    expose only the part of its interface defined by the SUBNET
                    type in layers_abstract.h.  This way, when we pass subnetwork
                    objects to the layer callbacks those callbacks won't be able to
                    interact with the subnetworks in a way other than specified
                    by the SUBNET interface spec.

                    We also allow the top layer of a subnet_wrapper stack to call the
                    private_get_output() and private_get_gradient_input() functions.  This
                    way, layers that have had their output/gradient overwritten by in-place
                    layers can only be accessed from the in-place layers that sit directly
                    on top of them since those in-place layers are the only layers that
                    know how to interact with them properly.
            !*/

        public:
            subnet_wrapper(const subnet_wrapper&) = delete;
            subnet_wrapper& operator=(const subnet_wrapper&) = delete;

            subnet_wrapper(T& l_) {}
            // Nothing here because in this case T is one of the input layer types 
            // that doesn't have anything in it.
        };

        template <typename T>
        class subnet_wrapper<T,true, typename std::enable_if<is_nonloss_layer_type<T>::value>::type>
        {

        public:
            subnet_wrapper(const subnet_wrapper&) = delete;
            subnet_wrapper& operator=(const subnet_wrapper&) = delete;

            typedef T wrapped_type;
            const static size_t num_layers = T::num_layers;

            subnet_wrapper(T& l_) : l(l_),subnetwork(l.subnet()) {}

            const tensor& get_output() const { return l.private_get_output(); }
            tensor& get_gradient_input() { return l.private_get_gradient_input(); }

            const subnet_wrapper<typename T::subnet_type,false>& subnet() const { return subnetwork; }
            subnet_wrapper<typename T::subnet_type,false>& subnet() { return subnetwork; }

        private:
            T& l;
            subnet_wrapper<typename T::subnet_type,false> subnetwork;
        };

        template <typename T>
        class subnet_wrapper<T,false, typename std::enable_if<is_nonloss_layer_type<T>::value>::type>
        {

        public:
            subnet_wrapper(const subnet_wrapper&) = delete;
            subnet_wrapper& operator=(const subnet_wrapper&) = delete;

            typedef T wrapped_type;
            const static size_t num_layers = T::num_layers;

            subnet_wrapper(T& l_) : l(l_),subnetwork(l.subnet()) {}

            const tensor& get_output() const { return l.get_output(); }
            tensor& get_gradient_input() { return l.get_gradient_input(); }

            const subnet_wrapper<typename T::subnet_type,false>& subnet() const { return subnetwork; }
            subnet_wrapper<typename T::subnet_type,false>& subnet() { return subnetwork; }

        private:
            T& l;
            subnet_wrapper<typename T::subnet_type,false> subnetwork;
        };
    }

// ----------------------------------------------------------------------------------------

    template <typename LAYER_DETAILS, typename SUBNET, typename enabled = void>
    class add_layer;

    template <typename T, typename U>
    struct is_nonloss_layer_type<add_layer<T,U>> : std::true_type {};

    template <typename LAYER_DETAILS, typename SUBNET>
    class add_layer<LAYER_DETAILS,SUBNET, 
            typename std::enable_if<is_nonloss_layer_type<SUBNET>::value>::type>
    {
    public:
        typedef LAYER_DETAILS layer_details_type;
        typedef SUBNET subnet_type;
        typedef typename subnet_type::input_type input_type;
        const static size_t num_layers = subnet_type::num_layers + 1;
        const static unsigned int sample_expansion_factor = subnet_type::sample_expansion_factor;

        add_layer(
        ):
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {
            if (this_layer_operates_inplace())
                subnetwork.disable_output_and_gradient_getters();
        }

        add_layer(const add_layer&) = default;
        add_layer& operator=(const add_layer&) = default;
        add_layer(add_layer&& item) : add_layer() { swap(item); }
        add_layer& operator=(add_layer&& item) { swap(item); return *this; }

        template <typename T, typename U, typename E>
        friend class add_layer;
        template <typename T, bool is_first, typename E>
        friend class dimpl::subnet_wrapper;

        // Allow copying networks from one to another as long as their corresponding 
        // layers can be constructed from each other.
        template <typename T, typename U, typename E>
        add_layer(
            const add_layer<T,U,E>& item
        ) :
            subnetwork(item.subnet()),
            details(item.layer_details()), 
            this_layer_setup_called(item.this_layer_setup_called),
            gradient_input_is_stale(item.gradient_input_is_stale),
            get_output_and_gradient_input_disabled(item.get_output_and_gradient_input_disabled),
            x_grad(item.x_grad),
            cached_output(item.cached_output)
        {
            if (this_layer_operates_inplace())
                subnetwork.disable_output_and_gradient_getters();
        }

        template <typename ...T>
        add_layer(
            const LAYER_DETAILS& layer_det, 
            T&& ...args
        ) : 
            details(layer_det), 
            subnetwork(std::forward<T>(args)...),
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {
            if (this_layer_operates_inplace())
                subnetwork.disable_output_and_gradient_getters();
        }

        template <typename ...T>
        add_layer(
            LAYER_DETAILS&& layer_det, 
            T&& ...args
        ) : 
            details(std::move(layer_det)), 
            subnetwork(std::forward<T>(args)...),
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {
            if (this_layer_operates_inplace())
                subnetwork.disable_output_and_gradient_getters();
        }

        template <typename ...T, typename ...U>
        add_layer(
            const std::tuple<LAYER_DETAILS,U...>& layer_det, 
            T&& ...args
        ) : 
            details(std::get<0>(layer_det)), 
            subnetwork(tuple_tail(layer_det),std::forward<T>(args)...),
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {
            if (this_layer_operates_inplace())
                subnetwork.disable_output_and_gradient_getters();
        }

        template <typename ...T, typename ...U>
        add_layer(
            std::tuple<>,
            const std::tuple<LAYER_DETAILS,U...>& layer_det, 
            T&& ...args
        ) : add_layer(layer_det,args...) { }

        template <typename ...T>
        add_layer(
            std::tuple<>, 
            LAYER_DETAILS&& layer_det, 
            T&& ...args
        ) : add_layer(layer_det, args...) { }

        template <typename input_iterator>
        void to_tensor (
            input_iterator ibegin,
            input_iterator iend,
            resizable_tensor& data
        ) const
        {
            subnetwork.to_tensor(ibegin,iend,data);
        }

        template <typename input_iterator>
        const tensor& operator() (
            input_iterator ibegin,
            input_iterator iend
        )
        {
            to_tensor(ibegin,iend,temp_tensor);
            return forward(temp_tensor);
        }


        const tensor& operator() (const input_type& x)
        {
            return (*this)(&x, &x+1);
        }

        const tensor& forward(const tensor& x)
        {
            subnetwork.forward(x);
            const dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
            if (!this_layer_setup_called)
            {
                details.setup(wsub);
                this_layer_setup_called = true;
            }
            if (this_layer_operates_inplace())
                impl::call_layer_forward(details, wsub, private_get_output());
            else
                impl::call_layer_forward(details, wsub, cached_output);

            gradient_input_is_stale = true;
            return private_get_output();
        }

    private:
        tensor& private_get_output() const
        { 
            if (const_cast<add_layer&>(*this).this_layer_operates_inplace())
                return subnetwork.private_get_output();
            else
                return const_cast<resizable_tensor&>(cached_output); 
        }
        tensor& private_get_gradient_input() 
        { 
            if (this_layer_operates_inplace())
            {
                return subnetwork.private_get_gradient_input();
            }
            else
            {
                if (gradient_input_is_stale)
                {
                    gradient_input_is_stale = false;
                    x_grad.copy_size(private_get_output());
                    x_grad = 0;
                }
                return x_grad; 
            }
        }
        void disable_output_and_gradient_getters (
        ) { get_output_and_gradient_input_disabled = true; }
    public:
        const tensor& get_output() const 
        { 
            if (get_output_and_gradient_input_disabled)
                throw dlib::error("Accessing this layer's get_output() is disabled because an in-place layer has been stacked on top of it.");
            return private_get_output(); 
        }
        tensor& get_gradient_input() 
        { 
            if (get_output_and_gradient_input_disabled)
                throw dlib::error("Accessing this layer's get_gradient_input() is disabled because an in-place layer has been stacked on top of it.");
            return private_get_gradient_input();
        }

        template <typename solver_type>
        void update(const tensor& x, sstack<solver_type,num_layers>& solvers)
        {
            dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
            params_grad.copy_size(details.get_layer_params());
            impl::call_layer_backward(details, private_get_output(),
                private_get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
            // Don't try to adjust the parameters if this layer doesn't have any.
            if (params_grad.size() != 0)
                solvers.top()(details, static_cast<const tensor&>(params_grad));
            subnetwork.update(x, solvers.pop());
            gradient_input_is_stale = true;
        }

        const subnet_type& subnet() const { return subnetwork; }
        subnet_type& subnet() { return subnetwork; }

        const layer_details_type& layer_details() const { return details; } 
        layer_details_type& layer_details() { return details; } 

        void clean()
        {
            x_grad.clear();
            cached_output.clear();
            params_grad.clear();
            temp_tensor.clear();
            gradient_input_is_stale = true;
            subnetwork.clean();
        }

        friend void serialize(const add_layer& item, std::ostream& out)
        {
            int version = 1;
            serialize(version, out);
            serialize(item.subnetwork, out);
            serialize(item.details, out);
            serialize(item.this_layer_setup_called, out);
            serialize(item.gradient_input_is_stale, out);
            serialize(item.get_output_and_gradient_input_disabled, out);
            serialize(item.x_grad, out);
            serialize(item.cached_output, out);
        }

        friend void deserialize(add_layer& item, std::istream& in)
        {
            int version = 0;
            deserialize(version, in);
            if (version != 1)
                throw serialization_error("Unexpected version found while deserializing dlib::add_layer.");
            deserialize(item.subnetwork, in);
            deserialize(item.details, in);
            deserialize(item.this_layer_setup_called, in);
            deserialize(item.gradient_input_is_stale, in);
            deserialize(item.get_output_and_gradient_input_disabled, in);
            deserialize(item.x_grad, in);
            deserialize(item.cached_output, in);
        }

    private:

        bool this_layer_operates_inplace(
        ) 
        {
            // This layer can run in-place if it's an in-place capable layer and also if
            // the layer it's on top of doesn't need its own output tensor (since in-place
            // layers overwrite that tensor)
            return impl::is_inplace_layer(details, subnetwork) && !subnetwork.this_layer_requires_forward_output();
        }
        bool this_layer_requires_forward_output(
        ) 
        {
            return impl::backward_requires_forward_output(details, subnetwork);
        }

        void swap(add_layer& item)
        {
            std::swap(subnetwork,item.subnetwork);
            std::swap(details, item.details);
            std::swap(this_layer_setup_called, item.this_layer_setup_called);
            std::swap(gradient_input_is_stale, item.gradient_input_is_stale);
            std::swap(get_output_and_gradient_input_disabled, item.get_output_and_gradient_input_disabled);
            std::swap(x_grad, item.x_grad);
            std::swap(cached_output, item.cached_output);
        }

        subnet_type subnetwork;
        LAYER_DETAILS details;
        bool this_layer_setup_called;
        bool gradient_input_is_stale;
        bool get_output_and_gradient_input_disabled;
        // Note that if this_layer_operates_inplace()==true then x_grad and cached_output
        // are not used at all.  Instead, this layer uses these variables from the lower
        // layer.
        resizable_tensor x_grad;
        resizable_tensor cached_output; 

        // The following 2 objects don't logically contribute to the state of this class.
        // They are only here to prevent them from being reallocated over and over in
        // member functions.
        resizable_tensor params_grad; 
        resizable_tensor temp_tensor;

    };
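
    // As an illustrative sketch (the layer and input types named here are hypothetical,
    // not defined in this file), a network is built by nesting add_layer templates around
    // an input layer and then calling it like a function:
    //
    //   using net_type = add_layer<my_details, add_layer<my_details, my_input_layer>>;
    //   net_type net;
    //   my_input_layer::input_type sample;
    //   net(sample);   // converts the sample to a tensor and runs every layer's forward()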

// ----------------------------------------------------------------------------------------

// This version of add_layer handles the special case where the subnetwork being given is
// just an input layer object.
    template <typename LAYER_DETAILS, typename INPUT_LAYER, typename enabled>
    class add_layer
    {
    public:
        typedef LAYER_DETAILS layer_details_type;
        typedef INPUT_LAYER subnet_type;
        typedef typename INPUT_LAYER::input_type input_type;
        const static unsigned int sample_expansion_factor = INPUT_LAYER::sample_expansion_factor;
        const static size_t num_layers = 1;
        static_assert(sample_expansion_factor >= 1,
            "The input layer can't produce fewer output tensors than there are inputs.");

        add_layer(
        ): 
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {}

        add_layer(const add_layer&) = default;
        add_layer(add_layer&& item) : add_layer() { swap(item); }
        add_layer& operator=(const add_layer&) = default;
        add_layer& operator=(add_layer&& item) { swap(item); return *this; }

        template <typename T, typename U, typename E>
        friend class add_layer;
        template <typename T, bool is_first, typename E>
        friend class dimpl::subnet_wrapper;

        // Allow copying networks from one to another as long as their corresponding 
        // layers can be constructed from each other.
        template <typename T, typename U, typename E>
        add_layer(
            const add_layer<T,U,E>& item
        ):
            input_layer(item.subnet()),
            details(item.layer_details()),
            this_layer_setup_called(item.this_layer_setup_called),
            gradient_input_is_stale(item.gradient_input_is_stale),
            get_output_and_gradient_input_disabled(false),
            x_grad(item.x_grad),
            cached_output(item.cached_output)
        {
        }

        add_layer(
            const LAYER_DETAILS& layer_det
        ) : 
            details(layer_det), 
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {}

        add_layer(
            LAYER_DETAILS&& layer_det
        ) : 
            details(std::move(layer_det)), 
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {}

        add_layer(
            LAYER_DETAILS layer_det, 
            INPUT_LAYER il
        ) : 
            details(std::move(layer_det)),
            input_layer(std::move(il)),
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {}

        add_layer(
            std::tuple<>,
            const LAYER_DETAILS& layer_det
        ) : add_layer(layer_det) {}

        add_layer(
            std::tuple<>,
            LAYER_DETAILS&& layer_det
        ) : add_layer(layer_det) {}

        add_layer(
            std::tuple<>,
            LAYER_DETAILS layer_det, 
            INPUT_LAYER il
        ) : add_layer(layer_det,il) {}

        add_layer(
            const std::tuple<LAYER_DETAILS>& layer_det
        ) : add_layer(std::get<0>(layer_det)) {}

        add_layer(
            const std::tuple<LAYER_DETAILS>& layer_det,
            INPUT_LAYER il
        ) : add_layer(std::get<0>(layer_det),il) {}

        template <typename input_iterator>
        void to_tensor (
            input_iterator ibegin,
            input_iterator iend,
            resizable_tensor& data
        ) const
        {
            input_layer.to_tensor(ibegin, iend, data);
            // make sure the input layer's to_tensor() function is implemented properly.
            DLIB_CASSERT(std::distance(ibegin,iend)*sample_expansion_factor == data.num_samples(),"");
            data.async_copy_to_device();
        }


        template <typename input_iterator>
        const tensor& operator() (
            input_iterator ibegin,
            input_iterator iend
        )
        {
            to_tensor(ibegin,iend,temp_tensor);
            return forward(temp_tensor);
        }


        const tensor& operator() (const input_type& x)
        {
            return (*this)(&x, &x+1);
        }

        const tensor& forward (const tensor& x)
        {
            DLIB_CASSERT(x.num_samples()%sample_expansion_factor == 0,"");
            subnet_wrapper wsub(x, grad_final_ignored);
            if (!this_layer_setup_called)
            {
                details.setup(wsub);
                this_layer_setup_called = true;
            }
            impl::call_layer_forward(details, wsub, cached_output);
            gradient_input_is_stale = true;
            return private_get_output();
        }

    private:
        tensor& private_get_output() const { return const_cast<resizable_tensor&>(cached_output); }
        tensor& private_get_gradient_input() 
        { 
            if (gradient_input_is_stale)
            {
                gradient_input_is_stale = false;
                x_grad.copy_size(private_get_output());
                x_grad = 0;
            }
            return x_grad; 
        }
        void disable_output_and_gradient_getters (
        ) { get_output_and_gradient_input_disabled = true; }
    public:
        const tensor& get_output() const 
        { 
            if (get_output_and_gradient_input_disabled)
                throw dlib::error("Accessing this layer's get_output() is disabled because an in-place layer has been stacked on top of it.");
            return private_get_output(); 
        }
        tensor& get_gradient_input() 
        { 
            if (get_output_and_gradient_input_disabled)
                throw dlib::error("Accessing this layer's get_gradient_input() is disabled because an in-place layer has been stacked on top of it.");
            return private_get_gradient_input();
        }

        template <typename solver_type>
        void update(const tensor& x, sstack<solver_type,num_layers>& solvers)
        {
            subnet_wrapper wsub(x, grad_final_ignored);
            params_grad.copy_size(details.get_layer_params());
            impl::call_layer_backward(details, private_get_output(),
                private_get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
            // Don't try to adjust the parameters if this layer doesn't have any.
            if (params_grad.size() != 0)
                solvers.top()(details, static_cast<const tensor&>(params_grad));
            gradient_input_is_stale = true;
        }

        const subnet_type& subnet() const { return input_layer; } 
        subnet_type& subnet() { return input_layer; } 

        const layer_details_type& layer_details() const { return details; } 
        layer_details_type& layer_details() { return details; } 

        void clean()
        {
            x_grad.clear();
            grad_final_ignored.clear();
            cached_output.clear();
            params_grad.clear();
            temp_tensor.clear();
            gradient_input_is_stale = true;
        }

        friend void serialize(const add_layer& item, std::ostream& out)
        {
            int version = 1;
            serialize(version, out);
            serialize(item.input_layer, out);
            serialize(item.details, out);
            serialize(item.this_layer_setup_called, out);
            serialize(item.gradient_input_is_stale, out);
            serialize(item.get_output_and_gradient_input_disabled, out);
            serialize(item.x_grad, out);
            serialize(item.cached_output, out);
        }

        friend void deserialize(add_layer& item, std::istream& in)
        {
            int version = 0;
            deserialize(version, in);
            if (version != 1)
                throw serialization_error("Unexpected version found while deserializing dlib::add_layer.");
            deserialize(item.input_layer, in);
            deserialize(item.details, in);
            deserialize(item.this_layer_setup_called, in);
            deserialize(item.gradient_input_is_stale, in);
            deserialize(item.get_output_and_gradient_input_disabled, in);
            deserialize(item.x_grad, in);
            deserialize(item.cached_output, in);
        }

    private:

        bool this_layer_requires_forward_output(
        ) 
        {
            subnet_wrapper wsub(grad_final_ignored, grad_final_ignored);
            return impl::backward_requires_forward_output(details, wsub);
        }

        class subnet_wrapper
        {
        public:
            subnet_wrapper(const tensor& x_, resizable_tensor& grad_final_ignored_) :
                x(x_), grad_final_ignored(grad_final_ignored_) {}

            subnet_wrapper(const subnet_wrapper&) = delete;
            subnet_wrapper& operator=(const subnet_wrapper&) = delete;

            const tensor& get_output() const { return x; }
            tensor& get_gradient_input() 
            { 
                // It doesn't matter what values are in this tensor but client code will
                // always assume it's the same dimension as the output so make sure that is
                // the case.  Note that we do set it to a non-crazy value though to avoid
                // it being full of NaN and slowing the processing down.
                if (!have_same_dimensions(x, grad_final_ignored))
                {
                    grad_final_ignored.copy_size(x);
                    grad_final_ignored = 0;  
                }
                return grad_final_ignored; 
            }

        private:
            const tensor& x;
            resizable_tensor& grad_final_ignored;
        };

        void swap(add_layer& item)
        {
            std::swap(input_layer, item.input_layer);
            std::swap(details, item.details);
            std::swap(this_layer_setup_called, item.this_layer_setup_called);
            std::swap(gradient_input_is_stale, item.gradient_input_is_stale);
            std::swap(get_output_and_gradient_input_disabled, item.get_output_and_gradient_input_disabled);
            std::swap(x_grad, item.x_grad); 
            std::swap(cached_output, item.cached_output); 
        }

        subnet_type input_layer;
        LAYER_DETAILS details;
        bool this_layer_setup_called;
        bool gradient_input_is_stale;
        bool get_output_and_gradient_input_disabled;
        resizable_tensor x_grad; 
        resizable_tensor cached_output; 

        // The following 3 objects don't logically contribute to the state of this class.
        // They are only here to prevent them from being reallocated over and over in
        // member functions.
        resizable_tensor params_grad; 
        resizable_tensor temp_tensor; 
        resizable_tensor grad_final_ignored;
    };

// ----------------------------------------------------------------------------------------

    template <unsigned long ID, typename SUBNET, typename enabled=void>
    class add_tag_layer;

    template <unsigned long ID, typename SUBNET>
    class add_tag_layer<ID,SUBNET,
            typename std::enable_if<is_nonloss_layer_type<SUBNET>::value>::type>
    {
    public:
        typedef SUBNET subnet_type;
        typedef typename subnet_type::input_type input_type;
        const static size_t num_layers = subnet_type::num_layers + 1;
        const static unsigned int sample_expansion_factor = subnet_type::sample_expansion_factor;
        static_assert(sample_expansion_factor >= 1,
            "The input layer can't produce fewer output tensors than there are inputs.");

        add_tag_layer() = default;
        add_tag_layer(const add_tag_layer&) = default;
        add_tag_layer(add_tag_layer&&) = default;
        add_tag_layer& operator=(add_tag_layer&&) = default;
        add_tag_layer& operator=(const add_tag_layer&) = default;

        template <typename T>
        add_tag_layer(
            const add_tag_layer<ID,T>& item
        ) : subnetwork(item.subnet())
        {}

        template <typename ...T>
        add_tag_layer(
            T ...args
        ) : 
            subnetwork(std::move(args)...) 
        {
        }

        template <typename input_iterator>
        void to_tensor (
            input_iterator ibegin,
            input_iterator iend,
            resizable_tensor& data
        ) const
        {
            subnetwork.to_tensor(ibegin,iend,data);
        }

        template <typename input_iterator>
        const tensor& operator() (
            input_iterator ibegin,
            input_iterator iend
        )
        {
            return subnetwork(ibegin,iend);
        }

        const tensor& operator() (const input_type& x)
        {
            return subnetwork(x);
        }

        const tensor& forward(const tensor& x)
        {
            return subnetwork.forward(x);
        }

        const tensor& get_output() const { return subnetwork.get_output(); }

        tensor& get_gradient_input() 
        { 
            return subnetwork.get_gradient_input();
        }

        template <typename solver_type>
        void update(const tensor& x, sstack<solver_type,num_layers>& solvers)
        {
            subnetwork.update(x,solvers.pop());
        }

        const subnet_type& subnet() const { return subnetwork; }
        subnet_type& subnet() { return subnetwork; }

        void clean()
        {
            subnetwork.clean();
        }

        friend void serialize(const add_tag_layer& item, std::ostream& out)
        {
            int version = 1;
            serialize(version, out);
            serialize(item.subnetwork, out);
        }

        friend void deserialize(add_tag_layer& item, std::istream& in)
        {
            int version = 0;
            deserialize(version, in);
            if (version != 1)
                throw serialization_error("Unexpected version found while deserializing dlib::add_tag_layer.");
            deserialize(item.subnetwork, in);
        }

    private:

        subnet_type subnetwork;
    };

// ----------------------------------------------------------------------------------------

// This version of add_tag_layer handles the special case where the subnetwork being given
// is just an input layer object.
    template <unsigned long ID, typename INPUT_LAYER, typename enabled>
    class add_tag_layer
    {
    public:
        typedef INPUT_LAYER subnet_type;
        typedef typename subnet_type::input_type input_type;
        const static size_t num_layers = 1;
        const static unsigned int sample_expansion_factor = subnet_type::sample_expansion_factor;
        static_assert(sample_expansion_factor >= 1,
            "The input layer can't produce fewer output tensors than there are inputs.");

        add_tag_layer() = default;
        add_tag_layer(const add_tag_layer&) = default;
        add_tag_layer& operator=(const add_tag_layer&) = default;
        add_tag_layer(add_tag_layer&& item) : add_tag_layer() { swap(item); }
        add_tag_layer& operator=(add_tag_layer&& item) { swap(item); return *this; }

        template <typename T, typename E>
        add_tag_layer(
            const add_tag_layer<ID,T,E>& item
        ) : input_layer(item.subnet())
        {}

        template <typename ...T>
        add_tag_layer(
            T ...args
        ) : 
            input_layer(std::move(args)...) 
        {
        }

        template <typename input_iterator>
        void to_tensor (
            input_iterator ibegin,
            input_iterator iend,
            resizable_tensor& data
        ) const
        {
            input_layer.to_tensor(ibegin,iend,data);
        }

        template <typename input_iterator>
        const tensor& operator() (
            input_iterator ibegin, 
            input_iterator iend
        )
        {
            input_layer.to_tensor(ibegin,iend,cached_output);
            return get_output();
        }

        const tensor& operator() (const input_type& x)
        {
            return (*this)(&x, &x+1);
        }

        const tensor& forward(const tensor& x)
        {
            cached_output = x;
            return get_output();
        }

        const tensor& get_output() const 
        { 
            return cached_output; 
        }

        tensor& get_gradient_input() 
        { 
            if (!have_same_dimensions(cached_output, grad_final_ignored))
            {
                grad_final_ignored.copy_size(get_output());
                grad_final_ignored = 0;
            }
            return grad_final_ignored; 
        }

        template <typename solver_type>
        void update(const tensor& /*x*/, sstack<solver_type,num_layers>& /*solvers*/)
        {
            // nothing to update
        }

        const subnet_type& subnet() const { return input_layer; }
        subnet_type& subnet() { return input_layer; }

        void clean()
        {
            grad_final_ignored.clear();
            cached_output.clear();
        }

        friend void serialize(const add_tag_layer& item, std::ostream& out)
        {
            int version = 1;
            serialize(version, out);
            serialize(item.input_layer, out);
            serialize(item.cached_output, out);
            serialize(item.grad_final_ignored, out);
        }

        friend void deserialize(add_tag_layer& item, std::istream& in)
        {
            int version = 0;
            deserialize(version, in);
            if (version != 1)
                throw serialization_error("Unexpected version found while deserializing dlib::add_tag_layer.");
            deserialize(item.input_layer, in);
            deserialize(item.cached_output, in);
            deserialize(item.grad_final_ignored, in);
        }

    private:

        void swap(add_tag_layer& item)
        {
            std::swap(input_layer, item.input_layer);
            std::swap(cached_output, item.cached_output);
            std::swap(grad_final_ignored, item.grad_final_ignored);
        }

        subnet_type input_layer;
        resizable_tensor cached_output;
        resizable_tensor grad_final_ignored;
    };

    template <unsigned long ID, typename U, typename E>
    struct is_nonloss_layer_type<add_tag_layer<ID,U,E>> : std::true_type {};
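
    // Tag layers add no computation; they simply give a compile time name (the ID) to a
    // point in the network so that other layers, such as add_skip_layer, can refer back
    // to the tensor produced there.  As a hedged sketch (layer names hypothetical):
    //
    //   using net = add_layer<my_details, add_tag_layer<1, add_layer<my_details, my_input_layer>>>;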


// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------

    template <typename LOSS_DETAILS, typename SUBNET>
    class add_loss_layer;

    class no_label_type
    {
    private:
        // We don't want anyone making these no_label_type objects.  They are here only to
        // allow add_loss_layer::label_type and dnn_trainer::label_type to exist which avoids
        // needing to overload add_loss_layer and dnn_trainer for supervised and unsupervised
        // losses.  It also can be a type to use in template metaprogramming to indicate
        // "no label".  So here we make the constructor private with the exception that
        // add_loss_layer objects can make it (again, just to simplify add_loss_layer's
        // implementation).
        no_label_type()=default;
        template <typename LOSS_DETAILS, typename SUBNET> friend class add_loss_layer;
    };

// ----------------------------------------------------------------------------------------

    template <typename LOSS_DETAILS, typename SUBNET>
    class add_loss_layer
    {
        template <typename T, typename enabled=void>
        struct get_loss_layer_label_type
        {
            typedef no_label_type type;
        };
        template <typename T>
        struct get_loss_layer_label_type<T,typename std::enable_if<sizeof(typename T::label_type)!=0>::type>
        {
            typedef typename T::label_type type;
        };
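
        // In other words: if LOSS_DETAILS declares a nested label_type then that type is
        // used as this network's label_type, otherwise label_type falls back to
        // no_label_type, marking the loss as unsupervised.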

    public:
        typedef LOSS_DETAILS loss_details_type;
        typedef SUBNET subnet_type;
        typedef typename subnet_type::input_type input_type;
        // Note that the loss layer doesn't count as an additional layer.
        const static size_t num_layers = subnet_type::num_layers;
        const static unsigned int sample_expansion_factor = subnet_type::sample_expansion_factor;
        typedef typename get_loss_layer_label_type<LOSS_DETAILS>::type label_type;

        static_assert(is_nonloss_layer_type<SUBNET>::value, 
            "SUBNET must be of type add_layer, add_skip_layer, or add_tag_layer."); 
        static_assert(sample_expansion_factor == LOSS_DETAILS::sample_expansion_factor,
            "The loss layer and input layer must agree on the sample_expansion_factor.");


        add_loss_layer() {}
        add_loss_layer(const add_loss_layer&) = default;
        add_loss_layer& operator=(const add_loss_layer&) = default;
        add_loss_layer(add_loss_layer&& item) : add_loss_layer() { swap(item); }
        add_loss_layer& operator=(add_loss_layer&& item) { swap(item); return *this; }

        template <typename T, typename U>
        add_loss_layer(
            const add_loss_layer<T,U>& item
        ) : 
            loss(item.loss_details()),
            subnetwork(item.subnet())
        {}

        template <typename ...T>
        add_loss_layer(
            const LOSS_DETAILS& layer_det, 
            T&& ...args
        ) : 
            loss(layer_det), 
            subnetwork(std::forward<T>(args)...)
        {
        }

        template <typename ...T>
        add_loss_layer(
            LOSS_DETAILS&& layer_det, 
            T&& ...args
        ) : 
            loss(std::move(layer_det)), 
            subnetwork(std::forward<T>(args)...)
        {
        }

        template <typename ...T>
        add_loss_layer(
            T ...args
        ) : 
            subnetwork(std::move(args)...)
        {
        }

        template <typename input_iterator>
        void to_tensor (
            input_iterator ibegin,
            input_iterator iend,
            resizable_tensor& data
        ) const
        {
            subnetwork.to_tensor(ibegin,iend,data);
        }

        template <typename output_iterator>
        void operator() (
            const tensor& x, 
            output_iterator obegin
        )
        {
            subnetwork.forward(x);
            const dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
            loss.to_label(x, wsub, obegin);
        }

        template <typename input_iterator, typename output_iterator>
        void operator() (
            input_iterator ibegin,
            input_iterator iend,
            output_iterator obegin
        )
        {
            to_tensor(ibegin,iend,temp_tensor);
            (*this)(temp_tensor, obegin);
        }

        const label_type& operator() (const input_type& x)
        {
            (*this)(&x, &x+1, &temp_label);
            return temp_label;
        }

        template <typename label_iterator>
        double compute_loss (
            const tensor& x,
            label_iterator lbegin 
        )
        {
            subnetwork.forward(x);
            dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
            return loss.compute_loss(x, lbegin, wsub);
        }

        template <typename input_iterator, typename label_iterator>
        double compute_loss (
            input_iterator ibegin,
            input_iterator iend,
            label_iterator lbegin 
        )
        {
            to_tensor(ibegin,iend,temp_tensor);
            return compute_loss(temp_tensor, lbegin);
        }

        double compute_loss (
            const tensor& x
        )
        {
            subnetwork.forward(x);
            dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
            return loss.compute_loss(x, wsub);
        }

        template <typename input_iterator>
        double compute_loss (
            input_iterator ibegin,
            input_iterator iend
        )
        {
            to_tensor(ibegin,iend,temp_tensor);
            return compute_loss(temp_tensor);
        }

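        // The update() overloads below run a forward pass, compute the loss, backpropagate
        // through the subnetwork, and let the given solvers update each layer's parameters.
        // They return the loss measured on the given data.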
        template <typename label_iterator, typename solver_type>
        double update (
            const tensor& x,
            label_iterator lbegin,
            sstack<solver_type,num_layers>& solvers
        )
        {
            subnetwork.forward(x);
            dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
            double l = loss.compute_loss(x, lbegin, wsub);
            subnetwork.update(x, solvers);
            return l;
        }

        template <typename input_iterator, typename label_iterator, typename solver_type>
        double update (
            input_iterator ibegin,
            input_iterator iend,
            label_iterator lbegin,
            sstack<solver_type,num_layers>& solvers
        )
        {
            to_tensor(ibegin,iend,temp_tensor);
            return update(temp_tensor, lbegin, solvers);
        }

        template <typename solver_type>
        double update (
            const tensor& x,
            sstack<solver_type,num_layers>& solvers
        )
        {
            subnetwork.forward(x);
            dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
            double l = loss.compute_loss(x, wsub);
            subnetwork.update(x, solvers);
            return l;
        }

        template <typename input_iterator, typename solver_type>
        double update (
            input_iterator ibegin,
            input_iterator iend,
            sstack<solver_type,num_layers>& solvers
        )
        {
            to_tensor(ibegin,iend,temp_tensor);
            return update(temp_tensor, solvers);
        }

        const subnet_type& subnet() const { return subnetwork; }
        subnet_type& subnet() { return subnetwork; }

        const loss_details_type& loss_details() const { return loss; }
        loss_details_type& loss_details() { return loss; }

        void clean (
        )
        {
            temp_tensor.clear();
            subnetwork.clean();
        }

        friend void serialize(const add_loss_layer& item, std::ostream& out)
        {
            int version = 1;
            serialize(version, out);
            serialize(item.loss, out);
            serialize(item.subnetwork, out);
        }

        friend void deserialize(add_loss_layer& item, std::istream& in)
        {
            int version = 0;
            deserialize(version, in);
            if (version != 1)
                throw serialization_error("Unexpected version found while deserializing dlib::add_loss_layer.");
            deserialize(item.loss, in);
            deserialize(item.subnetwork, in);
        }

    private:

        void swap(add_loss_layer& item)
        {
            std::swap(loss, item.loss);
            std::swap(subnetwork, item.subnetwork);
        }

        loss_details_type loss;
        subnet_type subnetwork;

        // These two objects don't logically contribute to the state of this object.  They
        // are here to prevent them from being reallocated over and over.
        label_type temp_label;
        resizable_tensor temp_tensor;
    };


    template <typename T, typename U>
    struct is_loss_layer_type<add_loss_layer<T,U>> : std::true_type {};

// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------

    namespace impl
    {
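        // layer_helper peels off i layers from a network by recursing through successive
        // calls to subnet().  The i==0 specialization below terminates the recursion and
        // simply returns the layer it was given.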
        template <unsigned int i, typename T>
        struct layer_helper
        {
            static T& makeT();
            using next_type = typename std::remove_reference<decltype(makeT().subnet())>::type;
            using type = typename layer_helper<i-1,next_type>::type;
            static type& layer(T& n)
            {
                return layer_helper<i-1,next_type>::layer(n.subnet());
            }
        };
        template <typename T>
        struct layer_helper<0,T>
        {
            using type = T;
            static type& layer(T& n)
            {
                return n;
            }
        };

        template <template<typename> class Match, typename T, unsigned int i, typename enabled = void>
        struct layer_helper_match
        {
            static T& makeT();
            using next_type = typename std::remove_reference<decltype(makeT().subnet())>::type;
            using type = typename layer_helper_match<Match,next_type,i>::type;
            static type& layer(T& n)
            {
                return layer_helper_match<Match,next_type,i>::layer(n.subnet());
            }
        };
        // This overload catches add_layer and add_loss_layer templates.
        template <template<typename> class Match, typename T, unsigned int i>
        struct layer_helper_match<Match,T,i,
            typename std::enable_if<std::is_same<const T,const  Match<typename T::subnet_type>>::value>::type>
        {
            using type = typename layer_helper<i,T>::type;
            static type& layer(T& n)
            {
                return layer_helper<i,T>::layer(n);
            }
        };
        // This overload catches input templates.
        template <template<typename> class Match, typename T, unsigned int i>
        struct layer_helper_match<Match,T,i,
            typename std::enable_if<std::is_same<const T,const  Match<typename T::input_type>>::value>::type>
        {
            using type = typename layer_helper<i,T>::type;
            static type& layer(T& n)
            {
                return layer_helper<i,T>::layer(n);
            }
        };
        // This overload catches subnet_wrapper templates.
        template <template<typename> class Match, typename T, unsigned int i>
        struct layer_helper_match<Match,T,i,
            typename std::enable_if<std::is_same<const typename T::wrapped_type, 
                                                 const Match<typename T::wrapped_type::subnet_type>>::value>::type>
        {
            using type = typename layer_helper<i,T>::type;
            static type& layer(T& n)
            {
                return layer_helper<i,T>::layer(n);
            }
        };
    }

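    // The layer() global functions give access to the individual layers inside a network
    // object.  A sketch of typical use (net is assumed to be some network object built
    // from the layer templates defined elsewhere in dlib):
    //   layer<2>(net)       // walk down 2 layers from the top of net (2 calls to subnet())
    //   layer<tag1>(net)    // jump to the layer tagged with tag1
    //   layer<tag1,2>(net)  // jump to tag1, then walk down 2 more layers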
    template <unsigned int i, typename T>
    typename impl::layer_helper<i,T>::type& layer (T& n) 
    {
        return impl::layer_helper<i,T>::layer(n);
    }

    template <template<typename> class Match, typename T>
    typename impl::layer_helper_match<Match,T,0>::type& layer (T& n) 
    {
        return impl::layer_helper_match<Match,T,0>::layer(n);
    }

    template <template<typename> class Match, unsigned int i, typename T>
    typename impl::layer_helper_match<Match,T,i>::type& layer (T& n) 
    {
        return impl::layer_helper_match<Match,T,i>::layer(n);
    }

// ----------------------------------------------------------------------------------------

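    // add_skip_layer passes its input straight through SUBNET, but the tensor it exposes
    // as its own output is the output of the layer inside SUBNET tagged with TAG_TYPE.
    // This is what implements the skip1, skip2, ... aliases defined further below.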
    template <template<typename> class TAG_TYPE, typename SUBNET>
    class add_skip_layer
    {
    public:
        typedef SUBNET subnet_type;
        typedef typename subnet_type::input_type input_type;
        const static size_t num_layers = subnet_type::num_layers + 1;
        const static unsigned int sample_expansion_factor = subnet_type::sample_expansion_factor;
        static_assert(sample_expansion_factor >= 1,
            "The input layer can't produce fewer output tensors than there are inputs.");

        add_skip_layer() = default;
        add_skip_layer(const add_skip_layer&) = default;
        add_skip_layer(add_skip_layer&&) = default;
        add_skip_layer& operator=(add_skip_layer&&) = default;
        add_skip_layer& operator=(const add_skip_layer&) = default;

        template <typename T>
        add_skip_layer(
            const add_skip_layer<TAG_TYPE,T>& item
        ) : subnetwork(item.subnet())
        {}

        template <typename ...T>
        add_skip_layer(
            T ...args
        ) : 
            subnetwork(std::move(args)...) 
        {
        }

        template <typename input_iterator>
        void to_tensor (
            input_iterator ibegin,
            input_iterator iend,
            resizable_tensor& data
        ) const
        {
            subnetwork.to_tensor(ibegin,iend,data);
        }

        template <typename input_iterator>
        const tensor& operator() (
            input_iterator ibegin,
            input_iterator iend
        )
        {
            subnetwork(ibegin,iend);
            return layer<TAG_TYPE>(subnetwork).get_output();
        }

        const tensor& operator() (const input_type& x)
        {
            subnetwork(x);
            return layer<TAG_TYPE>(subnetwork).get_output();
        }

        const tensor& forward(const tensor& x)
        {
            subnetwork.forward(x);
            return layer<TAG_TYPE>(subnetwork).get_output();
        }

        const tensor& get_output() const 
        { 
            return layer<TAG_TYPE>(subnetwork).get_output();
        }

        tensor& get_gradient_input() 
        { 
            return layer<TAG_TYPE>(subnetwork).get_gradient_input();
        }

        template <typename solver_type>
        void update(const tensor& x, sstack<solver_type,num_layers>& solvers)
        {
            subnetwork.update(x,solvers.pop());
        }

        const subnet_type& subnet() const 
        { 
            return subnetwork; 
        }

        subnet_type& subnet() 
        { 
            return subnetwork; 
        }

        void clean()
        {
            subnetwork.clean();
        }

        friend void serialize(const add_skip_layer& item, std::ostream& out)
        {
            int version = 1;
            serialize(version, out);
            serialize(item.subnetwork, out);
        }

        friend void deserialize(add_skip_layer& item, std::istream& in)
        {
            int version = 0;
            deserialize(version, in);
            if (version != 1)
                throw serialization_error("Unexpected version found while deserializing dlib::add_skip_layer.");
            deserialize(item.subnetwork, in);
        }

    private:

        subnet_type subnetwork;
    };
    template <template<typename> class T, typename U>
    struct is_nonloss_layer_type<add_skip_layer<T,U>> : std::true_type {};

    template <typename SUBNET> using tag1  = add_tag_layer< 1, SUBNET>;
    template <typename SUBNET> using tag2  = add_tag_layer< 2, SUBNET>;
    template <typename SUBNET> using tag3  = add_tag_layer< 3, SUBNET>;
    template <typename SUBNET> using tag4  = add_tag_layer< 4, SUBNET>;
    template <typename SUBNET> using tag5  = add_tag_layer< 5, SUBNET>;
    template <typename SUBNET> using tag6  = add_tag_layer< 6, SUBNET>;
    template <typename SUBNET> using tag7  = add_tag_layer< 7, SUBNET>;
    template <typename SUBNET> using tag8  = add_tag_layer< 8, SUBNET>;
    template <typename SUBNET> using tag9  = add_tag_layer< 9, SUBNET>;
    template <typename SUBNET> using tag10 = add_tag_layer<10, SUBNET>;

    template <typename SUBNET> using skip1  = add_skip_layer< tag1, SUBNET>;
    template <typename SUBNET> using skip2  = add_skip_layer< tag2, SUBNET>;
    template <typename SUBNET> using skip3  = add_skip_layer< tag3, SUBNET>;
    template <typename SUBNET> using skip4  = add_skip_layer< tag4, SUBNET>;
    template <typename SUBNET> using skip5  = add_skip_layer< tag5, SUBNET>;
    template <typename SUBNET> using skip6  = add_skip_layer< tag6, SUBNET>;
    template <typename SUBNET> using skip7  = add_skip_layer< tag7, SUBNET>;
    template <typename SUBNET> using skip8  = add_skip_layer< tag8, SUBNET>;
    template <typename SUBNET> using skip9  = add_skip_layer< tag9, SUBNET>;
    template <typename SUBNET> using skip10 = add_skip_layer<tag10, SUBNET>;
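    // For example (a sketch; layer templates like relu<> are defined elsewhere in dlib),
    // a network can tag a layer with tag1 and later refer back to it with skip1:
    //   ...skip1<relu<relu<tag1<SOME_SUBNET>>>>...
    // Here skip1 still forwards its input through the whole stack, but the tensor it
    // exposes as its output is the output of the layer tagged with tag1.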

// ----------------------------------------------------------------------------------------

    namespace timpl
    {
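        // Fill the tensor t with zero mean Gaussian noise with standard deviation sigma.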
        inline void fill_with_gassuan_random_numbers (
            tensor& t,
            dlib::rand& rnd,
            double sigma = 1
        )
        {
            float* data = t.host();
            for (size_t i = 0; i < t.size(); ++i)
                data[i] = rnd.get_random_gaussian()*sigma;
        }

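        // test_layer_subnet is a stand-in for a real subnetwork.  It hands the layer under
        // test a randomly sized, randomly filled output tensor and a matching
        // gradient_input tensor so test_layer() can numerically check the layer's
        // gradients.  Deeper sub-layers are created lazily via calls to subnet().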
        class test_layer_subnet 
        {
        public:
            test_layer_subnet (
                dlib::rand& rnd_
            ) : rnd(rnd_) 
            {
                // Output and gradient_input have to have the same dimensions in each
                // layer.
                const long num_samples = rnd.get_random_32bit_number()%4+3;
                const long k  = rnd.get_random_32bit_number()%4+2;
                const long nr = rnd.get_random_32bit_number()%4+2;
                const long nc = rnd.get_random_32bit_number()%4+2;

                output.set_size(num_samples, k, nr, nc);
                gradient_input.set_size(num_samples, k, nr, nc);

                // Use a non-zero initial gradient to make sure the layers add to it
                // rather than assign and blow away the initial value.
                fill_with_gassuan_random_numbers(gradient_input, rnd, 0.01);

                fill_with_gassuan_random_numbers(output, rnd);
            }


            tensor& get_mutable_output() { return output; }
            const tensor& get_output() const { return output; }
            const tensor& private_get_output() const { return get_output(); }
            const test_layer_subnet& subnet() const { init_sub(); return *subnetwork; }

            tensor& get_gradient_input() { return gradient_input; }
            tensor& private_get_gradient_input() { return get_gradient_input(); }
            test_layer_subnet& subnet() { init_sub(); return *subnetwork; }



            unsigned long count_outputs() const
            {
                if (subnetwork)
                    return subnetwork->count_outputs() + output.size();
                else
                    return output.size();
            }

            float& get_output_element(unsigned long i)
            {
                if (i < output.size())
                    return output.host()[i];
                else
                    return subnet().get_output_element(i-output.size());
            }

            float get_gradient_input_element(unsigned long i) const
            {
                if (i < gradient_input.size())
                    return gradient_input.host()[i];
                else
                    return subnet().get_gradient_input_element(i-gradient_input.size());
            }


        private:
            // We lazily initialize sub-layers as needed when someone tries to call
            // subnet()
            void init_sub() const
            {
                if (!subnetwork)
                    subnetwork.reset(new test_layer_subnet(rnd));
            }

            dlib::rand& rnd;
            mutable std::unique_ptr<test_layer_subnet> subnetwork;
            resizable_tensor output;
            resizable_tensor gradient_input;
        };

    }

    struct layer_test_results
    {
        layer_test_results() : was_good(true) {}
        explicit layer_test_results(const std::string& l) : log(l),was_good(false) {}

        std::string log;
        bool was_good;

        operator bool() const { return was_good; }
    };

    inline std::ostream& operator<< (std::ostream& out, const layer_test_results& item)
    {
        out << item.log;
        return out;
    }

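    // test_layer() numerically checks a layer's gradients by comparing the output of
    // backward() against a central differences approximation.  A sketch of typical use
    // (my_layer is assumed to be an instance of some layer details object):
    //   auto res = test_layer(my_layer);
    //   if (!res) std::cout << res << std::endl;  // print the error log on failure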
    template <
        typename layer_details_type
        >
    layer_test_results test_layer (
        layer_details_type l
    )
    {
        const float base_eps = 0.01;
        using namespace timpl;
        // Do some setup
        dlib::rand rnd;
        test_layer_subnet subnetwork(rnd);
        resizable_tensor output, out2, out3;
        // Run setup() and forward() as well to make sure any calls to subnet() have
        // happened before we start assuming we know how many data elements there are
        // (since we do a lazy layer creation thing based on calls to subnet() inside
        // test_layer_subnet).
        l.setup(subnetwork);
        impl::call_layer_forward(l, subnetwork, output);

        resizable_tensor input_grad;
        input_grad.copy_size(output);
        fill_with_gassuan_random_numbers(input_grad, rnd);

        std::ostringstream sout;

        // The f() we are computing gradients of is this thing.  Its value at the current
        // parameter and data values is:
        //sout << "f(data,params): " << dot(output, input_grad) << std::endl;

        // We are going to save a copy of the subnetwork.get_gradient_input() data before we do
        // backpropagation since the backward() function is supposed to *add* to the
        // gradients rather than overwrite them.  We will use this saved data to check if
        // that is the case.
        const unsigned long num_data_inputs = subnetwork.count_outputs();
        std::vector<float> initial_gradient_input(num_data_inputs);
        for (unsigned long i = 0; i < num_data_inputs; ++i)
            initial_gradient_input[i] = subnetwork.get_gradient_input_element(i);


        // Now tell the layer to compute all the gradients.  In the rest of this function
        // we will just be checking that these gradients were computed correctly by
        // comparing them to a central differences approximation.
        resizable_tensor params_grad;
        params_grad.copy_size(l.get_layer_params());
        // But first, set the params grad to something crazy so that it's very obvious if
        // it doesn't get fully assigned.
        params_grad = std::numeric_limits<float>::infinity();
        impl::call_layer_backward(l, output, input_grad, subnetwork, params_grad);

        static_assert(impl::is_inplace_layer(l, subnetwork) == impl::has_inplace_backward(l, subnetwork),
            "Layer not defined correctly.  forward and backward methods must either both be in-place or both out-of-place. ");

        // Make sure the outputs of forward() and backward() are the same when they are run
        // in in-place mode.
        if (impl::is_inplace_layer(l, subnetwork))
        {
            test_layer_subnet subnetwork2(rnd);
            layer_details_type ll(l);
            ll.setup(subnetwork2);
            resizable_tensor ip_out;
            impl::call_layer_forward(ll, subnetwork2, ip_out);
            impl::call_layer_forward(ll, subnetwork2, subnetwork2.get_mutable_output());
            const auto forward_error = max(abs(mat(ip_out) - mat(subnetwork2.get_output())));
            if (forward_error > 0.00001)
            {
                using namespace std;
                sout << "This layer is supposed to support in-place computations but the output of forward_inplace()\n";
                sout << "changes when invoked in-place vs. out-of-place. The error was: " << forward_error << endl;
                return layer_test_results(sout.str()); 
            }

            resizable_tensor params_grad;
            params_grad.copy_size(ll.get_layer_params());
            params_grad = std::numeric_limits<float>::infinity();

            resizable_tensor input_grad;
            input_grad.copy_size(ip_out);
            fill_with_gassuan_random_numbers(input_grad, rnd);
            resizable_tensor params_grad1, params_grad2, data_grad1, data_grad2;
            params_grad1 = params_grad;
            params_grad2 = params_grad;
            // Now call backward() and make sure it works as well.
            subnetwork2.get_gradient_input() = 9999;
            impl::call_layer_backward(ll, ip_out, input_grad, subnetwork2, params_grad1);
            data_grad1 = subnetwork2.get_gradient_input();

            subnetwork2.get_gradient_input() = mat(input_grad);
            impl::call_layer_backward(ll, ip_out, subnetwork2.get_gradient_input(), subnetwork2, params_grad2);
            data_grad2 = subnetwork2.get_gradient_input();
            if (params_grad.size() != 0)
            {
                const auto backward_param_error = max(abs(mat(params_grad1) - mat(params_grad2)));
                if (backward_param_error > 0.00001)
                {
                    using namespace std;
                    sout << "This layer is supposed to support in-place computations but the output of backward_inplace()\n";
                    sout << "changes when invoked in-place vs. out-of-place. The error was: " << backward_param_error << endl;
                    return layer_test_results(sout.str()); 
                }
            }
            const auto backward_data_error = max(abs(mat(data_grad1) - mat(data_grad2)));
            if (backward_data_error > 0.00001)
            {
                using namespace std;
                sout << "This layer is supposed to support in-place computations but the output of backward_inplace()\n";
                sout << "changes when invoked in-place vs. out-of-place. The error was: " << backward_data_error << endl;
                return layer_test_results(sout.str()); 
            }
        }

        // ==================================================================
        // first validate the way the parameter gradients are computed
        for (unsigned long i = 0; i < params_grad.size(); ++i)
        {
            layer_details_type l1(l);

            float eps = l1.get_layer_params().host()[i]*base_eps;
            if (eps == 0)
                eps = base_eps;
            const float oldval = l1.get_layer_params().host()[i];
            l1.get_layer_params().host()[i] = oldval+eps;
            impl::call_layer_forward(l1, subnetwork, out2);
            l1.get_layer_params().host()[i] = oldval-eps;
            impl::call_layer_forward(l1, subnetwork, out3);
            l1.get_layer_params().host()[i] = oldval;

            // Compute a reference derivative via a central differences approximation and
            // compare it to the one output by the layer and make sure they match.
            double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps);
            double output_derivative = params_grad.host()[i];
            double relative_error = (reference_derivative - output_derivative)/(reference_derivative + 1e-100);
            if (std::abs(relative_error) > 0.01)
            {
                using namespace std;
                sout << "Gradient error in parameter #" << i <<".  Relative error: "<< relative_error << endl;
                sout << "expected derivative: " << reference_derivative << endl;
                sout << "output derivative:   " << output_derivative << endl;
                return layer_test_results(sout.str()); 
            }

        }

        // ==================================================================
        // now validate the data gradients
        for (unsigned long i = 0; i < num_data_inputs; ++i)
        {
            const float oldval = subnetwork.get_output_element(i);
            float eps = oldval*base_eps;
            if (eps == 0)
                eps = base_eps;
            subnetwork.get_output_element(i) = oldval+eps;
            impl::call_layer_forward(l, subnetwork, out2);
            subnetwork.get_output_element(i) = oldval-eps;
            impl::call_layer_forward(l, subnetwork, out3);
            subnetwork.get_output_element(i) = oldval;

            // Compute a reference derivative via a central differences approximation and
            // compare it to the one output by the layer and make sure they match.
            double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps);
            double output_derivative = subnetwork.get_gradient_input_element(i);
            if (!impl::is_inplace_layer(l,subnetwork))
                output_derivative -= initial_gradient_input[i];
            double relative_error = (reference_derivative - output_derivative)/(reference_derivative + 1e-100);
            if (std::abs(relative_error) > 0.01)
            {
                using namespace std;
                sout << "Gradient error in data variable #" << i <<".  Relative error: "<< relative_error << endl;
                sout << "expected derivative: " << reference_derivative << endl;
                sout << "output derivative:   " << output_derivative << endl;
                return layer_test_results(sout.str()); 
            }
        }

        return layer_test_results();
    }

// ----------------------------------------------------------------------------------------

}

#endif // DLIB_DNn_CORE_H_