// Copyright (C) 2015  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_DNn_CORE_H_
#define DLIB_DNn_CORE_H_

#include "core_abstract.h"
#include "tensor.h"
#include <iterator>
#include <memory>
#include <sstream>
#include <type_traits>
#include "../statistics.h"
#include "../rand.h"
#include "../algs.h"
#include <utility>
#include <tuple>
#include <cmath>
#include <vector>
#include "tensor_tools.h"
#include <type_traits>



namespace dlib
{

// ----------------------------------------------------------------------------------------

    namespace impl
    {
        class repeat_input_layer 
        {
            /*!
                None of the declarations in this object are really used. The only reason it
                exists is to allow the repeat object to use a special input layer in its
                internal networks which will cause add_tag_layer objects that happen to be
                right at the input to not create copies of their input tensors.  So
                introducing the repeat_input_layer object allows us to optimize the
                implementation of add_tag_layer for a special case that arises when it's
                used in the context of the repeat layer.
            !*/
        public:
            typedef int input_type;
            const static unsigned int sample_expansion_factor = 1;

            template <typename input_iterator>
            void to_tensor (
                input_iterator ,
                input_iterator ,
                resizable_tensor& 
            ) const
            {
                DLIB_CASSERT(false,"This function should never be called");
            }

            friend void serialize(const repeat_input_layer&, std::ostream&){}
            friend void deserialize(repeat_input_layer&, std::istream&){}
            friend std::ostream& operator<<(std::ostream& out, const repeat_input_layer&) { out << "FUCK"; return out; }
        };
    }

// ----------------------------------------------------------------------------------------

    inline double log1pexp(double x)
    {
        using std::exp;
        using namespace std; // Do this instead of using std::log1p because some compilers
                             // error out otherwise (E.g. gcc 4.9 in cygwin)
        if (x <= -37)
            return exp(x);
        else if (-37 < x && x <= 18)
            return log1p(exp(x));
        else if (18 < x && x <= 33.3)
            return x + exp(-x);
        else
            return x;
    }
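
    // Illustrative note (added for exposition, not part of the original code):
    // log1pexp(x) computes log(1+exp(x)) without overflowing for large x or losing
    // precision for very negative x.  For example:
    //   log1pexp(0.0)    == std::log(2.0)   // the log1p(exp(x)) branch
    //   log1pexp(1000.0) == 1000.0          // exp(1000) would overflow, so just return x
    //   log1pexp(-50.0)  == std::exp(-50.0) // log(1+exp(x)) ~= exp(x) for very negative x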
    
// ----------------------------------------------------------------------------------------

    // Tell us if T is one of the special layer types (i.e. add_layer, repeat, add_tag_layer, or
    // add_skip_layer).
    template <typename T> struct is_nonloss_layer_type : std::false_type {};
    // Tell us if T is an instance of add_loss_layer.
    template <typename T> struct is_loss_layer_type : std::false_type {};
    // Tell us if T is an instance of add_layer
    template <typename T> struct is_add_layer : std::false_type {};

    namespace impl
    {
        template <size_t... n>
        struct ct_integers_list {
            template <size_t m>
            struct push_back
            {
                typedef ct_integers_list<n..., m> type;
            };
        };

        template <size_t max>
        struct ct_make_integer_range
        {
            // Recursively call push_back on ct_integers_list to build a range of
            // integers from 1 to max, inclusive.
            typedef typename ct_make_integer_range<max-1>::type::template push_back<max>::type type;
        };

        template <>
        struct ct_make_integer_range<0>
        {
            typedef ct_integers_list<> type;
        };
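
        // Illustrative check (added for exposition): ct_make_integer_range<N>::type
        // expands to ct_integers_list<1,2,...,N>.
        static_assert(std::is_same<ct_make_integer_range<3>::type,
                                   ct_integers_list<1,2,3>>::value,
            "ct_make_integer_range<3> should yield the compile time list 1,2,3");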

        template <size_t... indices, typename Tuple>
        auto tuple_subset(
            const Tuple& item, 
            ct_integers_list<indices...>
        ) -> decltype(std::make_tuple(std::get<indices>(item)...))
        {
            return std::make_tuple(std::get<indices>(item)...);
        }

        template <typename Head, typename... Tail>
        std::tuple<Tail...> basic_tuple_tail(
            const std::tuple<Head, Tail...>& item
        )
        {
            return tuple_subset(item, typename ct_make_integer_range<sizeof...(Tail)>::type());
        }

        template <typename T>
        std::tuple<T> tuple_flatten(const T& t) 
        {
            return std::make_tuple(t);
        }

        template <typename... T>
        auto tuple_flatten(
            const std::tuple<T...>& item
        ) -> decltype(tuple_flatten(item, typename ct_make_integer_range<sizeof...(T)>::type()))
        {
            return tuple_flatten(item, typename ct_make_integer_range<sizeof...(T)>::type());
        }

        template <size_t... indices, typename... T>
        auto tuple_flatten(
            const std::tuple<T...>& item, 
            ct_integers_list<indices...>
        ) -> decltype(std::tuple_cat(tuple_flatten(std::get<indices-1>(item))...))
        {
            return std::tuple_cat(tuple_flatten(std::get<indices-1>(item))...);
        }

        template <typename T>
        struct tuple_head_helper
        {
            typedef T type;
            static const type& get(const T& item) 
            {
                return item;
            }
        };

        template <typename T, typename... U>
        struct tuple_head_helper<std::tuple<T, U...>>
        {
            typedef typename tuple_head_helper<T>::type type;
            static const type& get(const std::tuple<T,U...>& item) 
            {
                return tuple_head_helper<T>::get(std::get<0>(item));
            }
        };
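
        // Illustrative check (added for exposition): tuple_head_helper recursively
        // unwraps nested tuples, so the "head" of std::tuple<std::tuple<int,float>,char>
        // is the int sitting at the front.  tuple_flatten() above applies the same idea
        // to produce a single flat tuple.
        static_assert(std::is_same<
            tuple_head_helper<std::tuple<std::tuple<int,float>,char>>::type, int>::value,
            "the head of a nested tuple is its first non-tuple element");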

        template <typename T> struct alwaysbool { typedef bool type; };

        resizable_tensor& rt();

        // The significance of a layer's backward method requiring forward's outputs is
        // that such a layer can't have an in-place layer stacked on top of it because
        // in-place layers overwrite the output of the layer they sit on top of.
        template <typename layer_type, typename SUBNET>
        constexpr auto backward_requires_forward_output(
            layer_type& layer,
            SUBNET& sub
        ) -> typename alwaysbool<decltype(layer.backward(rt(),rt(),sub,rt()))>::type
        {
            return true;
        }

        template <typename layer_type, typename SUBNET>
        constexpr auto backward_requires_forward_output(
            layer_type& layer,
            SUBNET& sub
        ) -> typename alwaysbool<decltype(layer.backward(rt(),sub,rt()))>::type
        {
            return false;
        }

        template <typename layer_type, typename SUBNET>
        constexpr auto backward_requires_forward_output(
            layer_type& layer,
            SUBNET& sub
        ) -> typename alwaysbool<decltype(layer.backward_inplace(rt(),rt(),sub.get_gradient_input(),rt()))>::type
        {
            return true;
        }

        template <typename layer_type, typename SUBNET>
        constexpr auto backward_requires_forward_output(
            layer_type& layer,
            SUBNET& sub
        ) -> typename alwaysbool<decltype(layer.backward_inplace(rt(),sub.get_gradient_input(),rt()))>::type
        {
            return false;
        }
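
        // Usage sketch (added for exposition): the overload set above resolves purely on
        // which backward() signature layer_type provides, so the result is a compile-time
        // constant.  add_layer uses it further below roughly like this:
        //
        //   bool needs_forward = impl::backward_requires_forward_output(details, *subnetwork);
        //
        // A true result means the layer's backward pass reads the tensor produced by its
        // own forward pass, so no in-place layer may be stacked on top of it.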

        template <typename layer_type, typename SUBNET>
        constexpr auto has_inplace_backward(
            layer_type& layer,
            SUBNET& sub
        ) -> typename alwaysbool<decltype(layer.backward(rt(),rt(),sub,rt()))>::type
        {
            return false;
        }

        template <typename layer_type, typename SUBNET>
        constexpr auto has_inplace_backward(
            layer_type& layer,
            SUBNET& sub
        ) -> typename alwaysbool<decltype(layer.backward(rt(),sub,rt()))>::type
        {
            return false;
        }

        template <typename layer_type, typename SUBNET>
        constexpr auto has_inplace_backward(
            layer_type& layer,
            SUBNET& sub
        ) -> typename alwaysbool<decltype(layer.backward_inplace(rt(),rt(),sub.get_gradient_input(),rt()))>::type
        {
            return true;
        }

        template <typename layer_type, typename SUBNET>
        constexpr auto has_inplace_backward(
            layer_type& layer,
            SUBNET& sub
        ) -> typename alwaysbool<decltype(layer.backward_inplace(rt(),sub.get_gradient_input(),rt()))>::type
        {
            return true;
        }

        template <typename layer_type, typename SUBNET>
        constexpr auto is_inplace_layer(
            layer_type& layer,
            const SUBNET& sub 
        ) -> typename alwaysbool<decltype(layer.forward(sub,rt()))>::type
        {
            return false;
        }

        template <typename layer_type, typename SUBNET>
        constexpr auto is_inplace_layer(
            layer_type& layer,
            const SUBNET& sub
        ) -> typename alwaysbool<decltype(layer.forward_inplace(sub.get_output(),rt()))>::type
        {
            return true;
        }
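
        // Note (added for exposition): is_inplace_layer() uses the same expression-SFINAE
        // idiom.  A layer providing forward_inplace(sub.get_output(), output) is treated
        // as in-place, while one providing forward(sub, output) is not.
        // add_layer::this_layer_operates_inplace() below combines this with
        // backward_requires_forward_output() to decide whether a layer may overwrite the
        // output tensor of the layer beneath it.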

        template <typename layer_type, typename SUBNET>
        auto call_layer_backward(
            layer_type& layer,
            const tensor& computed_output, 
            const tensor& gradient_input, 
            SUBNET& sub, 
            tensor& params_grad
        ) -> decltype(layer.backward(computed_output,gradient_input,sub,params_grad))
        {
            layer.backward(computed_output,gradient_input,sub,params_grad);
        }

        template <typename layer_type, typename SUBNET>
        auto call_layer_backward(
            layer_type& layer,
            const tensor& , 
            const tensor& gradient_input, 
            SUBNET& sub, 
            tensor& params_grad
        ) -> decltype(layer.backward(gradient_input,sub,params_grad))
        {
            layer.backward(gradient_input,sub,params_grad);
        }

        template <typename layer_type, typename SUBNET>
        auto call_layer_backward(
            layer_type& layer,
            const tensor& computed_output, 
            const tensor& gradient_input, 
            SUBNET& sub, 
            tensor& params_grad
        ) -> decltype(layer.backward_inplace(computed_output,gradient_input,sub.get_gradient_input(),params_grad))
        {
            layer.backward_inplace(computed_output,gradient_input,sub.get_gradient_input(),params_grad);
        }

        template <typename layer_type, typename SUBNET>
        auto call_layer_backward(
            layer_type& layer,
            const tensor& , 
            const tensor& gradient_input, 
            SUBNET& sub, 
            tensor& params_grad
        ) -> decltype(layer.backward_inplace(gradient_input,sub.get_gradient_input(),params_grad))
        {
            layer.backward_inplace(gradient_input,sub.get_gradient_input(),params_grad);
        }


        template <typename layer_type, typename SUBNET>
        auto call_layer_forward(
            layer_type& layer,
            const SUBNET& sub, 
            tensor& /*data_output*/
        ) -> decltype(layer.forward(sub,rt()))
        {
            // This overload of call_layer_forward() is here because this template
            // naturally gets instantiated but only on code paths that never get executed.
            // So rather than writing a bunch of hard to read template magic around call
            // sites we just have this overload that doesn't do anything (and an assert to
            // make sure that's the case).
            DLIB_CASSERT(false, "This should never happen");
        }

        template <typename layer_type, typename SUBNET>
        auto call_layer_forward(
            layer_type& layer,
            const SUBNET& sub, 
            resizable_tensor& data_output
        ) -> decltype(layer.forward(sub,data_output))
        {
            layer.forward(sub,data_output);
        }

        template <typename layer_type, typename SUBNET>
        auto call_layer_forward(
            layer_type& layer,
            const SUBNET& sub, 
            tensor& data_output
        ) -> decltype(layer.forward_inplace(sub.get_output(),data_output))
        {
            layer.forward_inplace(sub.get_output(),data_output);
        }

        template <typename layer_type, typename SUBNET>
        auto call_layer_forward(
            layer_type& layer,
            const SUBNET& sub, 
            resizable_tensor& data_output
        ) -> decltype(layer.forward_inplace(sub.get_output(),data_output))
        {
            if (!have_same_dimensions(data_output, sub.get_output()))
                data_output.copy_size(sub.get_output());
            layer.forward_inplace(sub.get_output(),data_output);
        }


    } // end namespace impl

    template <typename... T>
    typename impl::tuple_head_helper<std::tuple<T...>>::type tuple_head (
        const std::tuple<T...>& item
    ) 
    {
        return impl::tuple_head_helper<std::tuple<T...>>::get(item);
    }

    template <typename... T>
    auto tuple_tail(
        const std::tuple<T...>& item
    ) -> decltype(impl::basic_tuple_tail(impl::tuple_flatten(item)))
    {
        return impl::basic_tuple_tail(impl::tuple_flatten(item));
    }

    inline std::tuple<> tuple_tail(
        const std::tuple<>& item
    ) 
    {
        return item;
    }
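
    // Usage sketch (added for exposition, values are illustrative):
    //   std::tuple<int,float,char> t(1, 2.5f, 'c');
    //   int  h    = tuple_head(t);  // 1, the first (possibly nested) element
    //   auto rest = tuple_tail(t);  // std::tuple<float,char>(2.5f,'c')
    // add_layer uses these helpers to peel one layer_details object off a tuple of layer
    // settings and forward the remainder to its subnetwork's constructor.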
// ----------------------------------------------------------------------------------------

    inline void randomize_parameters (
        tensor& params,
        unsigned long num_inputs_and_outputs,
        dlib::rand& rnd
    )
    {
        for (auto& val : params)
        {
            // Draw a random number to initialize the layer according to formula (16)
            // from Understanding the difficulty of training deep feedforward neural
            // networks by Xavier Glorot and Yoshua Bengio.
            val = 2*rnd.get_random_float()-1;
            val *= std::sqrt(6.0/(num_inputs_and_outputs));
        }
    }
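
    // Illustrative note (added for exposition): the loop above draws each parameter
    // uniformly from [-sqrt(6/n), sqrt(6/n)] with n = num_inputs_and_outputs, i.e. the
    // Glorot/Xavier scheme cited in the comment.  A hypothetical call site:
    //
    //   dlib::rand rnd;
    //   resizable_tensor params(1, 50*100);         // parameters of some 50-to-100 mapping
    //   randomize_parameters(params, 50+100, rnd);  // n = fan_in + fan_out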

// ----------------------------------------------------------------------------------------

    template <typename T>
    class sstack
    {
    public:
        typedef T value_type;

        sstack() = delete;

        sstack (
            T* data_,
            size_t s
        ) : data(data_), mysize(s) {}

        const T& top() const 
        { 
            DLIB_CASSERT(size() != 0, "You can't call top() on an empty stack");
            return *data;
        }
        T& top()  
        { 
            DLIB_CASSERT(size() != 0, "You can't call top() on an empty stack");
            return *data;
        }

        size_t size() const { return mysize; }

        sstack pop(size_t num=1) 
        { 
            DLIB_CASSERT(num < size(), "You can't pop more things from the stack than it has in it.");
            return sstack(data+num, mysize-num);
        }

    private:

        T* data;
        size_t mysize;
    };

    template <typename T>
    sstack<T> make_sstack(std::vector<T>& item)
    {
        return sstack<T>(item.data(), item.size());
    }
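
    // Usage sketch (added for exposition): sstack is a non-owning view over a
    // std::vector and pop() simply advances that view.  update_parameters() below hands
    // each layer the top solver and passes the rest down to its subnetwork:
    //
    //   std::vector<solver_type> solvers(net_type::num_computational_layers); // hypothetical types
    //   auto s = make_sstack(solvers);
    //   s.top();   // solver for the current layer
    //   s.pop();   // view over the remaining solvers, given to the subnetwork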

// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------

    namespace dimpl
    {
        template <typename T, bool is_first = true, typename enabled=void>
        class subnet_wrapper
        {
            /*!
                WHAT THIS OBJECT REPRESENTS
                    This is a tool that makes an add_layer or add_loss_layer object
                    expose only the part of its interface defined by the SUBNET
                    type in layers_abstract.h.  This way, when we pass subnetwork
                    objects to the layer callbacks those callbacks won't be able to 
                    interact with the subnetworks in a way other than specified 
                    by the SUBNET interface spec.

                    We also allow the top layer of a subnet_wrapper stack to call the
                    private_get_output() and private_get_gradient_input() functions.  This
                    way, layers that have had their output/gradient overwritten by in-place
                    layers can only be accessed from the in-place layers that sit directly
                    on top of them since those in-place layers are the only layers that
                    know how to interact with them properly.
            !*/

        public:
            subnet_wrapper(const subnet_wrapper&) = delete;
            subnet_wrapper& operator=(const subnet_wrapper&) = delete;

            subnet_wrapper(T& l_) {}
            // Nothing here because in this case T is one of the input layer types 
            // that doesn't have anything in it.
        };

        template <typename T>
        class subnet_wrapper<T,true, typename std::enable_if<is_nonloss_layer_type<T>::value>::type>
        {

        public:
            subnet_wrapper(const subnet_wrapper&) = delete;
            subnet_wrapper& operator=(const subnet_wrapper&) = delete;

            typedef T wrapped_type;
            const static size_t num_computational_layers = T::num_computational_layers;

            subnet_wrapper(T& l_) : l(l_),subnetwork(l.subnet()) {}

            const tensor& get_output() const { return l.private_get_output(); }
            tensor& get_gradient_input() { return l.private_get_gradient_input(); }

            const subnet_wrapper<typename T::subnet_type,false>& subnet() const { return subnetwork; }
            subnet_wrapper<typename T::subnet_type,false>& subnet() { return subnetwork; }

        private:
            T& l;
            subnet_wrapper<typename T::subnet_type,false> subnetwork;
        };

        template <typename T>
        class subnet_wrapper<T,false, typename std::enable_if<is_nonloss_layer_type<T>::value>::type>
        {

        public:
            subnet_wrapper(const subnet_wrapper&) = delete;
            subnet_wrapper& operator=(const subnet_wrapper&) = delete;

            typedef T wrapped_type;
            const static size_t num_computational_layers = T::num_computational_layers;

            subnet_wrapper(T& l_) : l(l_),subnetwork(l.subnet()) {}

            const tensor& get_output() const { return l.get_output(); }
            tensor& get_gradient_input() { return l.get_gradient_input(); }

            const subnet_wrapper<typename T::subnet_type,false>& subnet() const { return subnetwork; }
            subnet_wrapper<typename T::subnet_type,false>& subnet() { return subnetwork; }

        private:
            T& l;
            subnet_wrapper<typename T::subnet_type,false> subnetwork;
        };
    }

// ----------------------------------------------------------------------------------------

    template <typename LAYER_DETAILS, typename SUBNET, typename enabled = void>
    class add_layer;


    template <typename T, typename U>
    struct is_nonloss_layer_type<add_layer<T,U>> : std::true_type {};

    template <typename LAYER_DETAILS, typename SUBNET>
    class add_layer<LAYER_DETAILS,SUBNET, 
            typename std::enable_if<is_nonloss_layer_type<SUBNET>::value>::type>
    {
    public:
        typedef LAYER_DETAILS layer_details_type;
        typedef SUBNET subnet_type;
        typedef typename subnet_type::input_type input_type;
        const static size_t num_layers = subnet_type::num_layers + 1;
        const static size_t num_computational_layers = subnet_type::num_computational_layers + 1;
        const static unsigned int sample_expansion_factor = subnet_type::sample_expansion_factor;

        add_layer(
        ):
            subnetwork(new subnet_type()),
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {
            if (this_layer_operates_inplace())
                subnetwork->disable_output_and_gradient_getters();
        }

        add_layer(const add_layer& item)
        {
            details = item.details;
            subnetwork.reset(new subnet_type(*item.subnetwork));
            this_layer_setup_called = item.this_layer_setup_called;
            gradient_input_is_stale = item.gradient_input_is_stale;
            get_output_and_gradient_input_disabled = item.get_output_and_gradient_input_disabled;
            x_grad = item.x_grad;
            cached_output = item.cached_output; 
            params_grad = item.params_grad; 
            temp_tensor = item.temp_tensor;
        }
        add_layer& operator=(const add_layer& item) { add_layer(item).swap(*this); return *this;}
        add_layer(add_layer&& item) : add_layer() { swap(item); }
        add_layer& operator=(add_layer&& item) { swap(item); return *this; }

        template <typename T, typename U, typename E>
        friend class add_layer;
        template <typename T, bool is_first, typename E>
        friend class dimpl::subnet_wrapper;
        template <unsigned long T, typename U, typename E>
        friend class add_tag_layer;
        template <template<typename> class T, typename U>
        friend class add_skip_layer;
        template <size_t N, template<typename> class L, typename S>
        friend class repeat;

        // Allow copying networks from one to another as long as their corresponding 
        // layers can be constructed from each other.
        template <typename T, typename U, typename E>
        add_layer(
            const add_layer<T,U,E>& item
        ) :
            details(item.layer_details()), 
            subnetwork(new subnet_type(item.subnet())),
            this_layer_setup_called(item.this_layer_setup_called),
            gradient_input_is_stale(item.gradient_input_is_stale),
            get_output_and_gradient_input_disabled(item.get_output_and_gradient_input_disabled),
            x_grad(item.x_grad),
            cached_output(item.cached_output)
        {
            if (this_layer_operates_inplace())
                subnetwork->disable_output_and_gradient_getters();
        }

        template <typename ...T>
        add_layer(
            const LAYER_DETAILS& layer_det, 
            T&& ...args
        ) : 
            details(layer_det), 
            subnetwork(new subnet_type(std::forward<T>(args)...)),
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {
            if (this_layer_operates_inplace())
                subnetwork->disable_output_and_gradient_getters();
        }

        template <typename T, typename ...U>
        struct disable_forwarding_constr 
        {
            const static bool value = std::is_constructible<LAYER_DETAILS,T>::value;
        };
        template <typename ...T, typename ...U>
        struct disable_forwarding_constr<std::tuple<T...>,U...>
        {
            const static bool value = disable_forwarding_constr<typename std::remove_reference<T>::type...>::value;
        };
        template <typename T, typename ...U>
        struct disable_forwarding_constr<std::tuple<T>,U...>
        {
            const static bool value = disable_forwarding_constr<typename std::remove_reference<T>::type>::value;
        };
        template <typename ...U>
        struct disable_forwarding_constr<std::tuple<>,U...>
        {
            const static bool value = true;
        };
        template <typename ...T>
        struct disable_forwarding_constr<add_layer<T...>>
        {
            const static bool value = true;
        };
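
        // Exposition-only note: disable_forwarding_constr disables the perfect-forwarding
        // constructor below whenever the first argument could itself construct a
        // LAYER_DETAILS object, is a tuple of layer details, or is another add_layer.  In
        // those cases the more specific constructors above must be selected instead of
        // forwarding everything to the subnetwork.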

        template <
            typename ...T,
            typename = typename std::enable_if<!disable_forwarding_constr<typename std::remove_reference<T>::type...>::value>::type
            >
        add_layer(
            T&& ...args
        ) : 
            subnetwork(new subnet_type(std::forward<T>(args)...)),
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {
            if (this_layer_operates_inplace())
                subnetwork->disable_output_and_gradient_getters();
        }

        template <typename ...T>
        add_layer(
            LAYER_DETAILS&& layer_det, 
            T&& ...args
        ) : 
            details(std::move(layer_det)), 
            subnetwork(new subnet_type(std::forward<T>(args)...)),
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {
            if (this_layer_operates_inplace())
                subnetwork->disable_output_and_gradient_getters();
        }

        template <typename ...T, typename LD, typename ...U>
        add_layer(
            const std::tuple<LD,U...>& layer_det, 
            T&& ...args
        ) : 
            details(tuple_head(layer_det)), 
            subnetwork(new subnet_type(tuple_tail(layer_det),std::forward<T>(args)...)),
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {
            if (this_layer_operates_inplace())
                subnetwork->disable_output_and_gradient_getters();
        }

        template <typename ...T, typename LD, typename ...U>
        add_layer(
            std::tuple<>,
            const std::tuple<LD,U...>& layer_det, 
            T&& ...args
        ) : add_layer(layer_det,args...) { }

        add_layer (
            std::tuple<>
        ) : add_layer() {}

        template <typename ...T>
        add_layer(
            std::tuple<>, 
            LAYER_DETAILS&& layer_det, 
            T&& ...args
        ) : add_layer(layer_det, args...) { }

        template <typename input_iterator>
        void to_tensor (
            input_iterator ibegin,
            input_iterator iend,
            resizable_tensor& data
        ) const
        {
            subnetwork->to_tensor(ibegin,iend,data);
        }

        template <typename input_iterator>
        const tensor& operator() (
            input_iterator ibegin,
            input_iterator iend
        )
        {
            to_tensor(ibegin,iend,temp_tensor);
            return forward(temp_tensor);
        }


        const tensor& operator() (const input_type& x)
        {
            return (*this)(&x, &x+1);
        }

        const tensor& forward(const tensor& x)
        {
            subnetwork->forward(x);
            const dimpl::subnet_wrapper<subnet_type> wsub(*subnetwork);
            if (!this_layer_setup_called)
            {
                details.setup(wsub);
                this_layer_setup_called = true;
            }
            if (this_layer_operates_inplace())
                impl::call_layer_forward(details, wsub, private_get_output());
            else
                impl::call_layer_forward(details, wsub, cached_output);

            gradient_input_is_stale = true;
            return private_get_output();
        }

    private:
        tensor& private_get_output() const
        { 
            if (const_cast<add_layer&>(*this).this_layer_operates_inplace())
                return subnetwork->private_get_output();
            else
                return const_cast<resizable_tensor&>(cached_output); 
        }
        tensor& private_get_gradient_input() 
        { 
            if (this_layer_operates_inplace())
            {
                return subnetwork->private_get_gradient_input();
            }
            else
            {
                if (gradient_input_is_stale)
                {
                    gradient_input_is_stale = false;
                    x_grad.copy_size(private_get_output());
                    x_grad = 0;
                }
                return x_grad; 
            }
        }
        void disable_output_and_gradient_getters (
        ) { get_output_and_gradient_input_disabled = true; }
    public:
        const tensor& get_output() const 
        { 
            if (get_output_and_gradient_input_disabled)
                throw dlib::error("Accessing this layer's get_output() is disabled because an in-place layer has been stacked on top of it.");
            return private_get_output(); 
        }
        tensor& get_gradient_input() 
        { 
            if (get_output_and_gradient_input_disabled)
                throw dlib::error("Accessing this layer's get_gradient_input() is disabled because an in-place layer has been stacked on top of it.");
            return private_get_gradient_input();
        }

        const tensor& get_final_data_gradient(
        ) const { return subnetwork->get_final_data_gradient(); }

        void back_propagate_error(const tensor& x)
        {
            back_propagate_error(x, private_get_gradient_input());
        }
        void back_propagate_error(const tensor& x, const tensor& gradient_input)
        {
            dimpl::subnet_wrapper<subnet_type> wsub(*subnetwork);
            params_grad.copy_size(details.get_layer_params());
            impl::call_layer_backward(details, private_get_output(),
                gradient_input, wsub, static_cast<tensor&>(params_grad));

            subnetwork->back_propagate_error(x); 

            // zero out get_gradient_input()
            gradient_input_is_stale = true;
        }

        template <typename solver_type>
        void update_parameters(sstack<solver_type> solvers, double step_size)
        {
            DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
            // Don't try to adjust the parameters if this layer doesn't have any.
            if (params_grad.size() != 0)
            {
                const tensor& step = solvers.top()(details.get_layer_params(), static_cast<const tensor&>(params_grad));
                tt::add(1,details.get_layer_params(), step_size, step);
            }
            subnetwork->update_parameters(solvers.pop(), step_size);
        }

        const tensor& get_parameter_gradient(
        ) const { return params_grad; }

        tensor& get_parameter_gradient (
        ) { return params_grad; }

        const subnet_type& subnet() const { return *subnetwork; }
        subnet_type& subnet() { return *subnetwork; }

        const layer_details_type& layer_details() const { return details; } 
        layer_details_type& layer_details() { return details; } 

        void clean()
        {
            x_grad.clear();
            cached_output.clear();
            params_grad.clear();
            temp_tensor.clear();
            gradient_input_is_stale = true;
            subnetwork->clean();
        }

        friend void serialize(const add_layer& item, std::ostream& out)
        {
            int version = 2;
            serialize(version, out);
            serialize(*item.subnetwork, out);
            serialize(item.details, out);
            serialize(item.this_layer_setup_called, out);
            serialize(item.gradient_input_is_stale, out);
            serialize(item.get_output_and_gradient_input_disabled, out);
            serialize(item.x_grad, out);
            serialize(item.cached_output, out);
            serialize(item.params_grad, out);
        }

        friend void deserialize(add_layer& item, std::istream& in)
        {
            int version = 0;
            deserialize(version, in);
            if (!(1 <= version && version <= 2))
                throw serialization_error("Unexpected version found while deserializing dlib::add_layer.");
            deserialize(*item.subnetwork, in);
            deserialize(item.details, in);
            deserialize(item.this_layer_setup_called, in);
            deserialize(item.gradient_input_is_stale, in);
            deserialize(item.get_output_and_gradient_input_disabled, in);
            deserialize(item.x_grad, in);
            deserialize(item.cached_output, in);
            if (version == 2)
                deserialize(item.params_grad, in);
        }

        friend std::ostream& operator<< (std::ostream& out, const add_layer& item)
        {
            item.print(out, 0);
            return out;
        }

        void print (std::ostream& out, unsigned long idx=0) const
        {
            out << "layer<" << idx << ">\t" << layer_details() << "\n";
            subnet().print(out, idx+1);
        }

    private:

        bool this_layer_operates_inplace(
        ) 
        {
            // This layer can run in-place if it's an in-place capable layer and also if
            // the layer it's on top of doesn't need its own output tensor (since in-place
            // layers overwrite that tensor)
            return impl::is_inplace_layer(details, *subnetwork) && !subnetwork->this_layer_requires_forward_output();
        }
        bool this_layer_requires_forward_output(
        ) 
        {
            return impl::backward_requires_forward_output(details, *subnetwork);
        }

        void swap(add_layer& item)
        {
            std::swap(subnetwork,item.subnetwork);
            std::swap(details, item.details);
            std::swap(this_layer_setup_called, item.this_layer_setup_called);
            std::swap(gradient_input_is_stale, item.gradient_input_is_stale);
            std::swap(get_output_and_gradient_input_disabled, item.get_output_and_gradient_input_disabled);
            std::swap(x_grad, item.x_grad);
            std::swap(cached_output, item.cached_output);
            std::swap(params_grad, item.params_grad);
        }


        LAYER_DETAILS details;
        std::unique_ptr<subnet_type> subnetwork;
        bool this_layer_setup_called;
        bool gradient_input_is_stale;
        bool get_output_and_gradient_input_disabled;
        // Note that if this_layer_operates_inplace()==true then x_grad and cached_output
        // are not used at all.  Instead, this layer uses these variables from the lower
        // layer.
        resizable_tensor x_grad;
        resizable_tensor cached_output; 

        resizable_tensor params_grad; 

        // temp_tensor doesn't logically contribute to the state of this object.  
        // It is here only to prevent it from being reallocated over and over.
        resizable_tensor temp_tensor;

    };

    template <typename T, typename U, typename E>
    struct is_add_layer<add_layer<T,U,E>> : std::true_type {};
    template <typename T, typename U, typename E>
    struct is_add_layer<const add_layer<T,U,E>> : std::true_type {};
    template <typename T, typename U, typename E>
    struct is_add_layer<add_layer<T,U,E>&> : std::true_type {};
    template <typename T, typename U, typename E>
    struct is_add_layer<const add_layer<T,U,E>&> : std::true_type {};

// ----------------------------------------------------------------------------------------

// This version of add_layer handles the special case where the subnetwork being given is
// just an input layer object.
    template <typename LAYER_DETAILS, typename INPUT_LAYER, typename enabled>
    class add_layer
    {
    public:
        typedef LAYER_DETAILS layer_details_type;
        typedef INPUT_LAYER subnet_type;
        typedef typename INPUT_LAYER::input_type input_type;
        const static unsigned int sample_expansion_factor = INPUT_LAYER::sample_expansion_factor;
        const static size_t num_layers = 2;
        const static size_t num_computational_layers = 1;
        static_assert(sample_expansion_factor >= 1,
            "The input layer can't produce fewer output tensors than there are inputs.");

        add_layer(
        ): 
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {}

        add_layer(const add_layer&) = default;
        add_layer(add_layer&& item) : add_layer() { swap(item); }
        add_layer& operator=(const add_layer&) = default;
        add_layer& operator=(add_layer&& item) { swap(item); return *this; }

        template <typename T, typename U, typename E>
        friend class add_layer;
        template <typename T, bool is_first, typename E>
        friend class dimpl::subnet_wrapper;
        template <unsigned long T, typename U, typename E>
        friend class add_tag_layer;
        template <template<typename> class T, typename U>
        friend class add_skip_layer;
        template <size_t N, template<typename> class L, typename S>
        friend class repeat;

        // Allow copying networks from one to another as long as their corresponding 
        // layers can be constructed from each other.
        template <typename T, typename U, typename E>
        add_layer(
            const add_layer<T,U,E>& item
        ):
            input_layer(item.subnet()),
            details(item.layer_details()),
            this_layer_setup_called(item.this_layer_setup_called),
            gradient_input_is_stale(item.gradient_input_is_stale),
            get_output_and_gradient_input_disabled(false),
            x_grad(item.x_grad),
            cached_output(item.cached_output),
            grad_final(item.grad_final)
        {
        }

        add_layer(
            const LAYER_DETAILS& layer_det
        ) : 
            details(layer_det), 
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {}

        add_layer(
            const INPUT_LAYER& il 
        ) : 
            input_layer(il), 
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {}

        add_layer(
            LAYER_DETAILS&& layer_det
        ) : 
            details(std::move(layer_det)), 
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {}

        add_layer(
            LAYER_DETAILS layer_det, 
            INPUT_LAYER il
        ) : 
            details(std::move(layer_det)),
            input_layer(std::move(il)),
            this_layer_setup_called(false),
            gradient_input_is_stale(true),
            get_output_and_gradient_input_disabled(false)
        {}

        add_layer(
            std::tuple<>,
            const LAYER_DETAILS& layer_det
        ) : add_layer(layer_det) {}

        add_layer(
            std::tuple<>,
            LAYER_DETAILS&& layer_det
        ) : add_layer(layer_det) {}

        add_layer(
            std::tuple<>,
            LAYER_DETAILS layer_det, 
            INPUT_LAYER il
        ) : add_layer(layer_det,il) {}

        add_layer(
            const std::tuple<LAYER_DETAILS>& layer_det
        ) : add_layer(tuple_head(layer_det)) {}

        add_layer(
            const std::tuple<LAYER_DETAILS>& layer_det,
            INPUT_LAYER il
        ) : add_layer(tuple_head(layer_det),il) {}

        template <typename input_iterator>
        void to_tensor (
            input_iterator ibegin,
            input_iterator iend,
            resizable_tensor& data
        ) const
        {
            input_layer.to_tensor(ibegin, iend, data);
            // make sure the input layer's to_tensor() function is implemented properly.
            DLIB_CASSERT(std::distance(ibegin,iend)*sample_expansion_factor == data.num_samples(),"");
            data.async_copy_to_device();
        }


        template <typename input_iterator>
        const tensor& operator() (
            input_iterator ibegin,
            input_iterator iend
        )
        {
            to_tensor(ibegin,iend,temp_tensor);
            return forward(temp_tensor);
        }


        const tensor& operator() (const input_type& x)
        {
            return (*this)(&x, &x+1);
        }

        const tensor& forward (const tensor& x)
        {
            DLIB_CASSERT(x.num_samples()%sample_expansion_factor == 0,"");
            subnet_wrapper wsub(x, grad_final);
            if (!this_layer_setup_called)
            {
                details.setup(wsub);
                this_layer_setup_called = true;
            }
            impl::call_layer_forward(details, wsub, cached_output);
            gradient_input_is_stale = true;
            return private_get_output();
        }

    private:
        tensor& private_get_output() const { return const_cast<resizable_tensor&>(cached_output); }
        tensor& private_get_gradient_input() 
        { 
            if (gradient_input_is_stale)
            {
                gradient_input_is_stale = false;
                x_grad.copy_size(private_get_output());
                x_grad = 0;
            }
            return x_grad; 
        }
        void disable_output_and_gradient_getters (
        ) { get_output_and_gradient_input_disabled = true; }
    public:
        const tensor& get_output() const 
        { 
            if (get_output_and_gradient_input_disabled)
                throw dlib::error("Accessing this layer's get_output() is disabled because an in-place layer has been stacked on top of it.");
            return private_get_output(); 
        }
        tensor& get_gradient_input() 
        { 
            if (get_output_and_gradient_input_disabled)
                throw dlib::error("Accessing this layer's get_gradient_input() is disabled because an in-place layer has been stacked on top of it.");
            return private_get_gradient_input();
        }

        const tensor& get_final_data_gradient(
        ) const { return grad_final; }

        void back_propagate_error(const tensor& x)
        {
            back_propagate_error(x, private_get_gradient_input());
        }
        void back_propagate_error(const tensor& x, const tensor& gradient_input)
        {
            // make sure grad_final is initialized to 0
            if (!have_same_dimensions(x, grad_final))
                grad_final.copy_size(x);
            grad_final = 0;  

            subnet_wrapper wsub(x, grad_final);
            params_grad.copy_size(details.get_layer_params());
            impl::call_layer_backward(details, private_get_output(),
                gradient_input, wsub, static_cast<tensor&>(params_grad));

            // zero out get_gradient_input()
            gradient_input_is_stale = true;
        }

        template <typename solver_type>
        void update_parameters(sstack<solver_type> solvers, double step_size)
        {
            DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
            // Don't try to adjust the parameters if this layer doesn't have any.
            if (params_grad.size() != 0) {
                const tensor& step = solvers.top()(details.get_layer_params(), static_cast<const tensor&>(params_grad));
                tt::add(1,details.get_layer_params(), step_size, step);
            }
        }

        const tensor& get_parameter_gradient(
        ) const { return params_grad; }

        tensor& get_parameter_gradient (
        )  { return params_grad; }

        const subnet_type& subnet() const { return input_layer; } 
        subnet_type& subnet() { return input_layer; } 

        const layer_details_type& layer_details() const { return details; } 
        layer_details_type& layer_details() { return details; } 

        void clean()
        {
            x_grad.clear();
            grad_final.clear();
            cached_output.clear();
            params_grad.clear();
            temp_tensor.clear();
            gradient_input_is_stale = true;
        }

        friend void serialize(const add_layer& item, std::ostream& out)
        {
            int version = 2;
            serialize(version, out);
            serialize(item.input_layer, out);
            serialize(item.details, out);
            serialize(item.this_layer_setup_called, out);
            serialize(item.gradient_input_is_stale, out);
            serialize(item.get_output_and_gradient_input_disabled, out);
            serialize(item.x_grad, out);
            serialize(item.cached_output, out);
            serialize(item.grad_final, out);
        }

        friend void deserialize(add_layer& item, std::istream& in)
        {
            int version = 0;
            deserialize(version, in);
            if (version != 2)
                throw serialization_error("Unexpected version found while deserializing dlib::add_layer.");
            deserialize(item.input_layer, in);
            deserialize(item.details, in);
            deserialize(item.this_layer_setup_called, in);
            deserialize(item.gradient_input_is_stale, in);
            deserialize(item.get_output_and_gradient_input_disabled, in);
            deserialize(item.x_grad, in);
            deserialize(item.cached_output, in);
            deserialize(item.grad_final, in);
        }

        friend std::ostream& operator<< (std::ostream& out, const add_layer& item)
        {
            item.print(out, 0);
            return out;
        }

        void print (std::ostream& out, unsigned long idx=0) const
        {
            out << "layer<" << idx << ">\t" << layer_details() << "\n";
            // Don't print the repeat_input_layer since it doesn't exist from the user's
            // point of view.  It's just an artifact of how repeat<> works.
            if (!std::is_same<subnet_type, impl::repeat_input_layer>::value)
                out << "layer<" << idx+1 << ">\t" << subnet() << "\n";
        }

    private:

        bool this_layer_requires_forward_output(
        ) 
        {
            subnet_wrapper wsub(grad_final, grad_final);
            return impl::backward_requires_forward_output(details, wsub);
        }

        class subnet_wrapper
        {
        public:
            subnet_wrapper(const tensor& x_, resizable_tensor& grad_final_) :
                x(x_), grad_final(grad_final_) {}

            subnet_wrapper(const subnet_wrapper&) = delete;
            subnet_wrapper& operator=(const subnet_wrapper&) = delete;

            const tensor& get_output() const { return x; }
            tensor& get_gradient_input() 
            { 
                if (!have_same_dimensions(x, grad_final))
                {
                    grad_final.copy_size(x);
                    grad_final = 0;  
                }
                return grad_final; 
            }

        private:
            const tensor& x;
            resizable_tensor& grad_final;
        };

        void swap(add_layer& item)
        {
            std::swap(input_layer, item.input_layer);
            std::swap(details, item.details);
            std::swap(this_layer_setup_called, item.this_layer_setup_called);
            std::swap(gradient_input_is_stale, item.gradient_input_is_stale);
            std::swap(get_output_and_gradient_input_disabled, item.get_output_and_gradient_input_disabled);
            std::swap(x_grad, item.x_grad); 
            std::swap(cached_output, item.cached_output); 
            std::swap(grad_final, item.grad_final); 
        }

        subnet_type input_layer;
        LAYER_DETAILS details;
        bool this_layer_setup_called;
        bool gradient_input_is_stale;
        bool get_output_and_gradient_input_disabled;
        resizable_tensor x_grad; 
        resizable_tensor cached_output; 
        resizable_tensor grad_final;

        // The following 2 objects don't logically contribute to the state of this class.
        // They are only here to prevent them from being reallocated over and over in
        // member functions.
        resizable_tensor params_grad; 
        resizable_tensor temp_tensor; 
    };

// ----------------------------------------------------------------------------------------

    template <unsigned long ID, typename SUBNET, typename enabled=void>
    class add_tag_layer;

    template <unsigned long ID, typename SUBNET>
    class add_tag_layer<ID,SUBNET,
            typename std::enable_if<is_nonloss_layer_type<SUBNET>::value>::type>
    {
    public:
        typedef SUBNET subnet_type;
        typedef typename subnet_type::input_type input_type;
        const static size_t num_layers = subnet_type::num_layers + 1;
        const static size_t num_computational_layers = subnet_type::num_computational_layers;
        const static unsigned int sample_expansion_factor = subnet_type::sample_expansion_factor;
        static_assert(sample_expansion_factor >= 1,
            "The input layer can't produce fewer output tensors than there are inputs.");

        add_tag_layer() = default;
        add_tag_layer(const add_tag_layer&) = default;
        add_tag_layer(add_tag_layer&&) = default;
        add_tag_layer& operator=(add_tag_layer&&) = default;
        add_tag_layer& operator=(const add_tag_layer&) = default;

        template <typename T>
        add_tag_layer(
            const add_tag_layer<ID,T>& item
        ) : subnetwork(item.subnet())
        {}

        template <typename ...T>
        add_tag_layer(
            T ...args
        ) : 
            subnetwork(std::move(args)...) 
        {
        }

        template <typename input_iterator>
        void to_tensor (
            input_iterator ibegin,
            input_iterator iend,
            resizable_tensor& data
        ) const
        {
            subnetwork.to_tensor(ibegin,iend,data);
        }

        template <typename input_iterator>
        const tensor& operator() (
            input_iterator ibegin,
            input_iterator iend
        )
        {
            return subnetwork(ibegin,iend);
        }

        const tensor& operator() (const input_type& x)
        {
            return subnetwork(x);
        }

        const tensor& forward(const tensor& x)
        {
            return subnetwork.forward(x);
        }

        const tensor& get_output() const { return subnetwork.get_output(); }

        tensor& get_gradient_input() 
        { 
            return subnetwork.get_gradient_input();
        }

        const tensor& get_final_data_gradient(
        ) const { return subnetwork.get_final_data_gradient(); }

        void back_propagate_error(const tensor& x)
        {
            subnetwork.back_propagate_error(x);
        }
        void back_propagate_error(const tensor& x, const tensor& gradient_input)
        {
            subnetwork.back_propagate_error(x,gradient_input);
        }

        template <typename solver_type>
        void update_parameters(sstack<solver_type> solvers, double step_size)
        {
            subnetwork.update_parameters(solvers, step_size);
        }

        const tensor& get_parameter_gradient(
        ) const { return params_grad; }

        tensor& get_parameter_gradient (
        ) { return params_grad; }

        const subnet_type& subnet() const { return subnetwork; }
        subnet_type& subnet() { return subnetwork; }

        void clean()
        {
            subnetwork.clean();
        }

        friend void serialize(const add_tag_layer& item, std::ostream& out)
        {
            int version = 1;
            serialize(version, out);
            serialize(item.subnetwork, out);
        }

        friend void deserialize(add_tag_layer& item, std::istream& in)
        {
            int version = 0;
            deserialize(version, in);
            if (version != 1)
                throw serialization_error("Unexpected version found while deserializing dlib::add_tag_layer.");
            deserialize(item.subnetwork, in);
        }

        friend std::ostream& operator<< (std::ostream& out, const add_tag_layer& item)
        {
            item.print(out, 0);
            return out;
        }

        void print (std::ostream& out, unsigned long idx=0) const
        {
            out << "layer<" << idx << ">\ttag" << ID << "\n";
            subnet().print(out, idx+1);
        }

    private:

        template <typename T, typename U, typename E>
        friend class add_layer;
        template <typename T, bool is_first, typename E>
        friend class dimpl::subnet_wrapper;
        template <unsigned long T, typename U, typename E>
        friend class add_tag_layer;
        template <template<typename> class T, typename U>
        friend class add_skip_layer;
        template <size_t N, template<typename> class L, typename S>
        friend class repeat;

        // You wouldn't put a tag on a layer if you didn't want to access its forward
        // outputs.  So this is always true.
        bool this_layer_requires_forward_output(
        ) { return true; } 

        void disable_output_and_gradient_getters (
        ) 
        { 
            // This should never happen because only inplace layers call
            // disable_output_and_gradient_getters(), however, putting a tag layer right
            // before an inplace layer basically means you don't want the following layer
            // to operate in place.  So the inplace layer should turn itself into an
            // out-of-place layer and not call disable_output_and_gradient_getters(). 
            DLIB_CASSERT(false,"This should never happen");
        }

        tensor& private_get_output() const
        { return subnetwork.private_get_output(); }
        tensor& private_get_gradient_input() 
        { return subnetwork.private_get_gradient_input(); }

        subnet_type subnetwork;

        // This member doesn't logically contribute to the state of the object since it is
        // always empty. It's just here so we can have the get_parameter_gradient() methods
        // which have to return something.  So they return this empty tensor.
        resizable_tensor params_grad;
    };

// ----------------------------------------------------------------------------------------

    template <typename ...T>
    struct decorator_repeat_group
    {
        decorator_repeat_group(
            T&& ...args
        ) : data(std::forward<T>(args)...) {}

        std::tuple<T...> data;
    };
    template <typename ...T>
    decorator_repeat_group<T...> repeat_group (
        T&& ...args
    )
    {
        return decorator_repeat_group<T...>(std::forward<T>(args)...);
    }
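
    /*!
        Usage sketch (added for clarity, not part of the original header): repeat_group()
        bundles a set of constructor arguments so that repeat<> can hand one copy of the
        bundle to every repetition of the repeated layer group.  The names my_block,
        my_input, arg1, and arg2 below are hypothetical placeholders.

            template <typename SUBNET> using my_block = ...;   // some stack of layers
            using net_type = repeat<4, my_block, my_input>;

            // Each of the 4 my_block groups is constructed from (arg1,arg2); any further
            // arguments are forwarded to the subnetwork:
            net_type net(repeat_group(arg1, arg2), my_input_args);
    !*/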

    template <
        size_t num,
        template<typename> class REPEATED_LAYER, 
        typename SUBNET
        >
    class repeat
    {
        static_assert(num > 0, "You can't have a layer repeated 0 times.");
    public:
        typedef SUBNET subnet_type;
        typedef typename SUBNET::input_type input_type;
        const static size_t comp_layers_in_each_group = (REPEATED_LAYER<SUBNET>::num_computational_layers-SUBNET::num_computational_layers);
        const static size_t comp_layers_in_repeated_group = comp_layers_in_each_group*num;
        const static size_t num_computational_layers = comp_layers_in_repeated_group + SUBNET::num_computational_layers;

        const static size_t layers_in_each_group = (REPEATED_LAYER<SUBNET>::num_layers-SUBNET::num_layers);
        const static size_t layers_in_repeated_group = layers_in_each_group*num;
        const static size_t num_layers = subnet_type::num_layers + layers_in_repeated_group;

        const static unsigned int sample_expansion_factor = SUBNET::sample_expansion_factor;

        typedef REPEATED_LAYER<impl::repeat_input_layer> repeated_layer_type;

        repeat(
        ) : 
            details(num)
        {
        }

        size_t num_repetitions (
        ) const { return num; }

        const repeated_layer_type& get_repeated_layer (
            size_t i 
        ) const
        { 
            DLIB_CASSERT(i < num_repetitions(), "");
            return details[i]; 
        }

        repeated_layer_type& get_repeated_layer (
            size_t i 
        ) 
        { 
            DLIB_CASSERT(i < num_repetitions(), "");
            return details[i]; 
        }

        repeat(const repeat&) = default;
        repeat(repeat&&) = default;
        repeat& operator=(repeat&&) = default;
        repeat& operator=(const repeat&) = default;

        template <template<typename> class T, typename U>
        repeat(
            const repeat<num,T,U>& item
        ) : 
            subnetwork(item.subnetwork)
        {
            for (auto&& d : item.details)
                details.emplace_back(d);
        }

        template <typename T, typename ...U>
        repeat(
            T arg1,
            U ...args2
        ): 
            details(num, std::move(arg1)),
            subnetwork(std::move(args2)...)
        {
        }

        template <typename ...T, typename ...U>
        repeat(
            decorator_repeat_group<T...>&& arg1,
            U ...args2
        ): 
            details(num, arg1.data),
            subnetwork(std::move(args2)...)
        {
        }

        template <typename T, typename ...U>
        repeat(
            std::tuple<>,
            T arg1,
            U ...args2
        ): 
            details(num, std::move(arg1)),
            subnetwork(std::move(args2)...)
        {
        }

        template <typename input_iterator>
        void to_tensor (
            input_iterator ibegin,
            input_iterator iend,
            resizable_tensor& data
        ) const
        {
            subnetwork.to_tensor(ibegin,iend,data);
        }

        template <typename input_iterator>
        const tensor& operator() (
            input_iterator ibegin,
            input_iterator iend
        )
        {
            to_tensor(ibegin,iend,temp_tensor);
            return forward(temp_tensor);
        }

        const tensor& operator() (const input_type& x)
        {
            return (*this)(&x, &x+1);
        }

        const tensor& forward(const tensor& x)
        {
            subnetwork.forward(x);
            details[details.size()-1].forward(subnetwork.get_output());
            for (long i = details.size()-2; i >= 0; --i)
                details[i].forward(details[i+1].get_output());
            return private_get_output();
        }

    private:
        tensor& private_get_output() const
        { 
            return details[0].private_get_output();
        }
        tensor& private_get_gradient_input() 
        { 
            return details[0].private_get_gradient_input();
        }
    public:
        const tensor& get_output() const 
        { 
            return details[0].get_output(); 
        }
        tensor& get_gradient_input() 
        { 
            return details[0].get_gradient_input();
        }

        const tensor& get_parameter_gradient(
        ) const { return details[0].get_parameter_gradient(); }

        tensor& get_parameter_gradient (
        ) { return details[0].get_parameter_gradient(); }

        void back_propagate_error(const tensor& x)
        {
            back_propagate_error(x, private_get_gradient_input());
        }
        void back_propagate_error(const tensor& x, const tensor& gradient_input)
        {
            if (details.size() > 1)
            {
                details[0].back_propagate_error(details[1].get_output(), gradient_input);
                for (size_t i = 1; i < details.size(); ++i)
                {
                    if (i+1 < details.size())
                        details[i].back_propagate_error(details[i+1].get_output(), details[i-1].get_final_data_gradient());
                    else
                        details[i].back_propagate_error(subnetwork.get_output(), details[i-1].get_final_data_gradient());
                }
            }
            else
            {
                details[0].back_propagate_error(subnetwork.get_output(), gradient_input);
            }
            subnetwork.back_propagate_error(x, details.back().get_final_data_gradient());
        }

        template <typename solver_type>
        void update_parameters(sstack<solver_type> solvers, double step_size)
        {
            for (size_t i = 0; i < details.size(); ++i)
                details[i].update_parameters(solvers.pop(comp_layers_in_each_group*i),step_size);
            subnetwork.update_parameters(solvers.pop(comp_layers_in_each_group*details.size()),step_size);
        }

        const subnet_type& subnet() const { return subnetwork; }
        subnet_type& subnet() { return subnetwork; }

        void clean()
        {
            temp_tensor.clear();
            subnetwork.clean();
            for (auto&& d : details)
                d.clean();
        }

        friend void serialize(const repeat& item, std::ostream& out)
        {
            int version = 1;
            serialize(version, out);
            serialize(item.details, out);
            serialize(item.subnetwork, out);
        }

        friend void deserialize(repeat& item, std::istream& in)
        {
            int version = 0;
            deserialize(version, in);
            if (version != 1)
                throw serialization_error("Unexpected version found while deserializing dlib::repeat.");
            deserialize(item.details, in);
            deserialize(item.subnetwork, in);
        }

        friend std::ostream& operator<< (std::ostream& out, const repeat& item)
        {
            item.print(out, 0);
            return out;
        }

        void print (std::ostream& out, unsigned long idx=0) const
        {
            for (size_t i = 0; i < num_repetitions(); ++i)
            {
                get_repeated_layer(i).print(out, idx);
                idx += layers_in_each_group;
            }
            subnet().print(out, idx);
        }
    private:


        template <typename T, typename U, typename E>
        friend class add_layer;
        template <typename T, bool is_first, typename E>
        friend class dimpl::subnet_wrapper;
        template <unsigned long T, typename U, typename E>
        friend class add_tag_layer;
        template <template<typename> class T, typename U>
        friend class add_skip_layer;
        template <size_t N, template<typename> class L, typename S>
        friend class repeat;

        bool this_layer_requires_forward_output(
        ) 
        { 
            return details[0].this_layer_requires_forward_output(); 
        } 

        void disable_output_and_gradient_getters (
        ) 
        { 
            details[0].disable_output_and_gradient_getters();
        }


        std::vector<repeated_layer_type> details; 
        subnet_type subnetwork;

        // temp_tensor doesn't logically contribute to the state of this class.
        // It is here only to avoid needing to reallocate it over and over.
        resizable_tensor temp_tensor;
    };

    template <
        size_t num,
        template<typename> class REPEATED_LAYER, 
        typename SUBNET
        >
    struct is_nonloss_layer_type<repeat<num,REPEATED_LAYER,SUBNET>> : std::true_type {};
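
    /*!
        Clarifying note (added, not part of the original header): repeat<N,L,S> describes
        the same network you would get by writing N nested copies of L around S, but it
        keeps the type shallow and lets you reach each copy at runtime through
        get_repeated_layer().  my_block and my_subnet are hypothetical placeholders.

            using a = repeat<3, my_block, my_subnet>;
            using b = my_block<my_block<my_block<my_subnet>>>;   // behaviorally equivalent
    !*/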

// ----------------------------------------------------------------------------------------

// This version of add_tag_layer handles the special case where the subnetwork being given
// is just an input layer object.
    template <unsigned long ID, typename INPUT_LAYER, typename enabled>
    class add_tag_layer
    {
    public:
        typedef INPUT_LAYER subnet_type;
        typedef typename subnet_type::input_type input_type;
        // This layer counts as a computational layer because it copies and stores the
        // inputs.
        const static size_t num_computational_layers = 1;
        const static size_t num_layers = 2;
        const static unsigned int sample_expansion_factor = subnet_type::sample_expansion_factor;
        static_assert(sample_expansion_factor >= 1,
            "The input layer can't produce fewer output tensors than there are inputs.");

        add_tag_layer():cached_output_ptr(nullptr),gradient_input_is_stale(true) {}

        add_tag_layer(const add_tag_layer&) = default;
        add_tag_layer& operator=(const add_tag_layer&) = default;
        add_tag_layer(add_tag_layer&& item) : add_tag_layer() { swap(item); }
        add_tag_layer& operator=(add_tag_layer&& item) { swap(item); return *this; }

        template <typename T, typename E>
        add_tag_layer(
            const add_tag_layer<ID,T,E>& item
        ) : input_layer(item.subnet()), 
            cached_output(item.cached_output),
            cached_output_ptr(nullptr),
            grad_final(item.grad_final),
            gradient_input_is_stale(item.gradient_input_is_stale)
        {}

        template <typename ...T>
        add_tag_layer(
            T ...args
        ) : 
            input_layer(std::move(args)...),
            cached_output_ptr(nullptr),
            gradient_input_is_stale(true)
        {
        }

        add_tag_layer (
            std::tuple<>
        ) : 
            cached_output_ptr(nullptr),
            gradient_input_is_stale(true)
        {}

        template <typename input_iterator>
        void to_tensor (
            input_iterator ibegin,
            input_iterator iend,
            resizable_tensor& data
        ) const
        {
            input_layer.to_tensor(ibegin,iend,data);
        }

        template <typename input_iterator>
        const tensor& operator() (
            input_iterator ibegin, 
            input_iterator iend
        )
        {
            input_layer.to_tensor(ibegin,iend,cached_output);
            cached_output_ptr = nullptr;
            return get_output();
        }

        const tensor& operator() (const input_type& x)
        {
            return (*this)(&x, &x+1);
        }

        const tensor& forward(const tensor& x)
        {
            // If this tag is the first layer in one of the sub networks inside a repeat
            // layer then we don't want it to be creating copies of x.  This is because we
            // can just hold a pointer to x, since the way repeat is constructed guarantees
            // that x will outlive this pointer. 
            if (is_same_type<INPUT_LAYER, impl::repeat_input_layer>::value)
                cached_output_ptr = const_cast<tensor*>(&x);
            else
                cached_output = x;
            gradient_input_is_stale = true;
            return get_output();
        }

        const tensor& get_output() const 
        { 
            if (cached_output_ptr)
                return *cached_output_ptr;
            else
                return cached_output; 
        }

        const tensor& get_final_data_gradient(
        ) const { return grad_final; }

        tensor& get_gradient_input() 
        { 
            if (!have_same_dimensions(get_output(), grad_final) ||
                gradient_input_is_stale)
            {
                grad_final.copy_size(get_output());
                grad_final = 0;
                gradient_input_is_stale = false;
            }
            return grad_final; 
        }

        void back_propagate_error(const tensor& /*x*/)
        {
            // nothing to do
        }
        void back_propagate_error(const tensor& /*x*/, const tensor& /*gradient_input*/)
        {
            // nothing to do
        }

        template <typename solver_type>
        void update_parameters(sstack<solver_type> /*solvers*/, double /*step_size*/)
        {
            // nothing to do
        }

        const subnet_type& subnet() const { return input_layer; }
        subnet_type& subnet() { return input_layer; }

        void clean()
        {
            grad_final.clear();
            cached_output.clear();
            cached_output_ptr = nullptr;
        }

        friend void serialize(const add_tag_layer& item, std::ostream& out)
        {
            int version = 1;
            serialize(version, out);
            serialize(item.input_layer, out);
            serialize(item.cached_output, out);
            serialize(item.grad_final, out);
            serialize(item.gradient_input_is_stale, out);
        }

        friend void deserialize(add_tag_layer& item, std::istream& in)
        {
            int version = 0;
            deserialize(version, in);
            if (version != 1)
                throw serialization_error("Unexpected version found while deserializing dlib::add_tag_layer.");
            deserialize(item.input_layer, in);
            deserialize(item.cached_output, in);
            deserialize(item.grad_final, in);
            deserialize(item.gradient_input_is_stale, in);
            item.cached_output_ptr = nullptr;
        }

        friend std::ostream& operator<< (std::ostream& out, const add_tag_layer& item)
        {
            item.print(out, 0);
            return out;
        }

        void print (std::ostream& out, unsigned long idx=0) const
        {
            out << "layer<"<<idx << ">\ttag" << ID << "\n";
            // Don't print the repeat_input_layer since it doesn't exist from the user's
            // point of view.  It's just an artifact of how repeat<> works.
            if (!std::is_same<subnet_type, impl::repeat_input_layer>::value)
                out << "layer<"<< idx+1 << ">\t" << subnet() << "\n";
        }

    private:

        template <typename T, typename U, typename E>
        friend class add_layer;
        template <typename T, bool is_first, typename E>
        friend class dimpl::subnet_wrapper;
        template <unsigned long T, typename U, typename E>
        friend class add_tag_layer;
        template <template<typename> class T, typename U>
        friend class add_skip_layer;
        template <size_t N, template<typename> class L, typename S>
        friend class repeat;

        // You wouldn't put a tag on a layer if you didn't want to access its forward
        // outputs.  So this is always true.
        bool this_layer_requires_forward_output(
        ) { return true; } 

        void disable_output_and_gradient_getters (
        ) 
        { 
            // This should never happen because only inplace layers call
            // disable_output_and_gradient_getters(), however, putting a tag layer right
            // before an inplace layer basically means you don't want the following layer
            // to operate in place.  So the inplace layer should turn itself into an
            // out-of-place layer and not call disable_output_and_gradient_getters(). 
            DLIB_CASSERT(false,"This should never happen");
        }

        tensor& private_get_output() const
        { return const_cast<tensor&>(get_output()); }
        tensor& private_get_gradient_input() 
        { return get_gradient_input(); }

        void swap(add_tag_layer& item)
        {
            std::swap(input_layer, item.input_layer);
            std::swap(cached_output, item.cached_output);
            std::swap(cached_output_ptr, item.cached_output_ptr);
            std::swap(grad_final, item.grad_final);
            std::swap(gradient_input_is_stale, item.gradient_input_is_stale);
        }

        subnet_type input_layer;
        resizable_tensor cached_output;
        tensor* cached_output_ptr;
        resizable_tensor grad_final;
        bool gradient_input_is_stale;
    };

    template <unsigned long ID, typename U, typename E>
    struct is_nonloss_layer_type<add_tag_layer<ID,U,E>> : std::true_type {};


// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------

    template <typename LOSS_DETAILS, typename SUBNET>
    class add_loss_layer;

    class no_label_type
    {
    private:
        // We don't want anyone making these no_label_type objects.  They are here only to
        // allow add_loss_layer::label_type and dnn_trainer::label_type to exist which avoids
        // needing to overload add_loss_layer and dnn_trainer for supervised and unsupervised
        // losses.  It also can be a type to use in template metaprogramming to indicate
        // "no label".  So here we make the constructor private with the exception that
        // add_loss_layer objects can make it (again, just to simplify add_loss_layer's
        // implementation).
        no_label_type(){};
        template <typename LOSS_DETAILS, typename SUBNET> friend class add_loss_layer;
        template < typename net_type, typename solver_type > friend class dnn_trainer; 
    };

// ----------------------------------------------------------------------------------------

    template <typename LOSS_DETAILS, typename SUBNET>
    class add_loss_layer
    {
        template <typename T, typename enabled=void>
        struct get_loss_layer_label_type
        {
            typedef no_label_type type;
        };
        template <typename T>
        struct get_loss_layer_label_type<T,typename std::enable_if<sizeof(typename T::label_type)!=0>::type>
        {
            typedef typename T::label_type type;
        };

    public:
        typedef LOSS_DETAILS loss_details_type;
        typedef SUBNET subnet_type;
        typedef typename subnet_type::input_type input_type;
        const static size_t num_layers = subnet_type::num_layers + 1;
        // Note that the loss layer doesn't count as an additional computational layer.
        const static size_t num_computational_layers = subnet_type::num_computational_layers;
        const static unsigned int sample_expansion_factor = subnet_type::sample_expansion_factor;
        typedef typename get_loss_layer_label_type<LOSS_DETAILS>::type label_type;

        static_assert(is_nonloss_layer_type<SUBNET>::value, 
            "SUBNET must be of type add_layer, add_skip_layer, or add_tag_layer."); 
        static_assert(sample_expansion_factor == LOSS_DETAILS::sample_expansion_factor,
            "The loss layer and input layer must agree on the sample_expansion_factor.");


        add_loss_layer() {};
        add_loss_layer(const add_loss_layer&) = default;
        add_loss_layer& operator=(const add_loss_layer&) = default;
        add_loss_layer(add_loss_layer&& item) : add_loss_layer() { swap(item); }
        add_loss_layer& operator=(add_loss_layer&& item) { swap(item); return *this; }

        template <typename T, typename U>
        add_loss_layer(
            const add_loss_layer<T,U>& item
        ) : 
            loss(item.loss_details()),
            subnetwork(item.subnet())
        {}

        template <typename ...T>
        add_loss_layer(
            const LOSS_DETAILS& layer_det, 
            T&& ...args
        ) : 
            loss(layer_det), 
            subnetwork(std::forward<T>(args)...)
        {
        }

        template <typename ...T>
        add_loss_layer(
            LOSS_DETAILS&& layer_det, 
            T&& ...args
        ) : 
            loss(std::move(layer_det)), 
            subnetwork(std::forward<T>(args)...)
        {
        }

        template <typename ...T>
        add_loss_layer(
            T ...args
        ) : 
            subnetwork(std::move(args)...)
        {
        }

        template <typename input_iterator>
        void to_tensor (
            input_iterator ibegin,
            input_iterator iend,
            resizable_tensor& data
        ) const
        {
            subnetwork.to_tensor(ibegin,iend,data);
        }

        template <typename output_iterator>
        void operator() (
            const tensor& x, 
            output_iterator obegin
        )
        {
            subnetwork.forward(x);
            const dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
            loss.to_label(x, wsub, obegin);
        }

        template <typename input_iterator, typename output_iterator>
        void operator() (
            input_iterator ibegin,
            input_iterator iend,
            output_iterator obegin
        )
        {
            to_tensor(ibegin,iend,temp_tensor);
            (*this)(temp_tensor, obegin);
        }

        const label_type& operator() (const input_type& x)
        {
            (*this)(&x, &x+1, &temp_label);
            return temp_label;
        }

        template <typename iterable_type>
        std::vector<label_type> operator() (
            const iterable_type& data,
            size_t batch_size = 128
        )
        {
            std::vector<label_type> results(std::distance(data.begin(), data.end()));
            auto o = results.begin();
            auto i = data.begin();
            while (i != data.end())
            {
                // Process at most batch_size samples at a time so we never advance an
                // iterator past the end of the input range.
                const auto len = std::min<size_t>(batch_size, std::distance(i, data.end()));
                (*this)(i, i+len, o);
                i += len;
                o += len;
            }
            return results;
        }

        template <typename label_iterator>
        double compute_loss (
            const tensor& x,
            label_iterator lbegin 
        )
        {
            subnetwork.forward(x);
            dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
            return loss.compute_loss(x, lbegin, wsub);
        }

        template <typename input_iterator, typename label_iterator>
        double compute_loss (
            input_iterator ibegin,
            input_iterator iend,
            label_iterator lbegin 
        )
        {
            to_tensor(ibegin,iend,temp_tensor);
            return compute_loss(temp_tensor, lbegin);
        }

        double compute_loss (
            const tensor& x
        )
        {
            subnetwork.forward(x);
            dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
            return loss.compute_loss(x, wsub);
        }

        template <typename input_iterator>
        double compute_loss (
            input_iterator ibegin,
            input_iterator iend
        )
        {
            to_tensor(ibegin,iend,temp_tensor);
            return compute_loss(temp_tensor);
        }

        template <typename label_iterator>
        double compute_parameter_gradients (
            const tensor& x,
            label_iterator lbegin
        )
        {
            subnetwork.forward(x);
            dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
            double l = loss.compute_loss(x, lbegin, wsub);
            subnetwork.back_propagate_error(x);
            return l;
        }
        template <typename input_iterator, typename label_iterator>
        double compute_parameter_gradients (
            input_iterator ibegin,
            input_iterator iend,
            label_iterator lbegin
        )
        {
            to_tensor(ibegin,iend,temp_tensor);
            return compute_parameter_gradients(temp_tensor, lbegin);
        }
        double compute_parameter_gradients (
            const tensor& x
        )
        {
            subnetwork.forward(x);
            dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
            double l = loss.compute_loss(x, wsub);
            subnetwork.back_propagate_error(x);
            return l;
        }
        template <typename input_iterator>
        double compute_parameter_gradients (
            input_iterator ibegin,
            input_iterator iend
        )
        {
            to_tensor(ibegin,iend,temp_tensor);
            return compute_parameter_gradients(temp_tensor);
        }

        template <typename solver_type>
        void update_parameters (
            sstack<solver_type> solvers,
            double step_size
        )
        {
            subnetwork.update_parameters(solvers, step_size);
        }

        const subnet_type& subnet() const { return subnetwork; }
        subnet_type& subnet() { return subnetwork; }
        const loss_details_type& loss_details() const { return loss; }
        loss_details_type& loss_details() { return loss; }

        void clean (
        )
        {
            temp_tensor.clear();
            subnetwork.clean();
        }

        friend void serialize(const add_loss_layer& item, std::ostream& out)
        {
            int version = 1;
            serialize(version, out);
            serialize(item.loss, out);
            serialize(item.subnetwork, out);
        }

        friend void deserialize(add_loss_layer& item, std::istream& in)
        {
            int version = 0;
            deserialize(version, in);
            if (version != 1)
                throw serialization_error("Unexpected version found while deserializing dlib::add_loss_layer.");
            deserialize(item.loss, in);
            deserialize(item.subnetwork, in);
        }

        friend std::ostream& operator<< (std::ostream& out, const add_loss_layer& item)
        {
            item.print(out, 0);
            return out;
        }

        void print (std::ostream& out, unsigned long idx=0) const
        {
            out << "layer<" << idx << ">\t" << loss_details() << "\n";
            subnet().print(out, idx+1);
        }

    private:


        void swap(add_loss_layer& item)
        {
            std::swap(loss, item.loss);
            std::swap(subnetwork, item.subnetwork);
        }

        loss_details_type loss;
        subnet_type subnetwork;

        // These two objects don't logically contribute to the state of this object.  They
        // are here to prevent them from being reallocated over and over.
        label_type temp_label;
        resizable_tensor temp_tensor;
    };


    template <typename T, typename U>
    struct is_loss_layer_type<add_loss_layer<T,U>> : std::true_type {};
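
    /*!
        Usage sketch (added for clarity, not part of the original header): the members of
        add_loss_layer above give the usual manual training step.  my_net_type, samples,
        labels, solvers, and step_size are hypothetical placeholders; solvers is an
        sstack<solver_type> holding one solver per computational layer.

            my_net_type net;                 // some add_loss_layer<...> network
            resizable_tensor x;
            net.to_tensor(samples.begin(), samples.end(), x);      // batch -> tensor
            double loss = net.compute_parameter_gradients(x, labels.begin());
            net.update_parameters(solvers, step_size);             // one solver step

        Inference goes through operator(), e.g. net(samples) returns a std::vector of
        label_type predictions computed in mini-batches.
    !*/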

// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------

    namespace impl
    {
        template <unsigned int i, typename T, typename enabled = void>
        struct layer_helper
        {
            static_assert(i < T::num_layers, "Call to layer() attempted to access non-existing layer in neural network.");
            static T& makeT();
            using next_type = typename std::remove_reference<decltype(makeT().subnet())>::type;
            using type = typename layer_helper<i-1,next_type>::type;
            static type& layer(T& n)
            {
                return layer_helper<i-1,next_type>::layer(n.subnet());
            }
        };
        template <
            unsigned int i,
            size_t N, template<typename> class L, typename S
        >
        struct layer_helper<i,repeat<N,L,S>, typename std::enable_if<(i!=0&&i>=repeat<N,L,S>::layers_in_repeated_group)>::type>
        {
            const static size_t layers_in_repeated_group = repeat<N,L,S>::layers_in_repeated_group;

            static repeat<N,L,S>& makeT();
            using next_type = typename std::remove_reference<decltype(makeT().subnet())>::type;
            using type = typename layer_helper<i-layers_in_repeated_group,next_type>::type;
            static type& layer(repeat<N,L,S>& n)
            {
                return layer_helper<i-layers_in_repeated_group,next_type>::layer(n.subnet());
            }
        };
        template <
            unsigned int i,
            size_t N, template<typename> class L, typename S
        >
        struct layer_helper<i,repeat<N,L,S>, typename std::enable_if<(i!=0&&i<repeat<N,L,S>::layers_in_repeated_group)>::type>
        {
            const static size_t layers_in_each_group = repeat<N,L,S>::layers_in_each_group;
            typedef typename repeat<N,L,S>::repeated_layer_type repeated_layer_type;
            using next_type = repeated_layer_type;
            using type = typename layer_helper<i%layers_in_each_group,next_type>::type;
            static type& layer(repeat<N,L,S>& n)
            {
                return layer_helper<i%layers_in_each_group,next_type>::layer(n.get_repeated_layer(i/layers_in_each_group));
            }
        };
        template <
            size_t N, template<typename> class L, typename S
        >
        struct layer_helper<0,repeat<N,L,S>, void>
        {
            typedef typename repeat<N,L,S>::repeated_layer_type repeated_layer_type;
            using type = repeated_layer_type;
            static type& layer(repeat<N,L,S>& n)
            {
                return n.get_repeated_layer(0);
            }
        };
        template <typename T>
        struct layer_helper<0,T,void>
        {
            using type = T;
            static type& layer(T& n)
            {
                return n;
            }
        };

        template <template<typename> class Match, typename T, unsigned int i, typename enabled = void>
        struct layer_helper_match
        {
            static T& makeT();
            using next_type = typename std::remove_reference<decltype(makeT().subnet())>::type;
            using type = typename layer_helper_match<Match,next_type,i>::type;
            static type& layer(T& n)
            {
                return layer_helper_match<Match,next_type,i>::layer(n.subnet());
            }
        };
        // This overload catches add_layer and add_loss_layer templates.
        template <template<typename> class Match, typename T, unsigned int i>
        struct layer_helper_match<Match,T,i,
            typename std::enable_if<std::is_same<const T,const  Match<typename T::subnet_type>>::value>::type>
        {
            using type = typename layer_helper<i,T>::type;
            static type& layer(T& n)
            {
                return layer_helper<i,T>::layer(n);
            }
        };
        // This overload catches input templates.
        template <template<typename> class Match, typename T, unsigned int i>
        struct layer_helper_match<Match,T,i,
            typename std::enable_if<std::is_same<const T,const  Match<typename T::input_type>>::value>::type>
        {
            using type = typename layer_helper<i,T>::type;
            static type& layer(T& n)
            {
                return layer_helper<i,T>::layer(n);
            }
        };
        // This overload catches subnet_wrapper templates.
        template <template<typename> class Match, typename T, unsigned int i>
        struct layer_helper_match<Match,T,i,
            typename std::enable_if<std::is_same<const typename T::wrapped_type, 
                                                 const Match<typename T::wrapped_type::subnet_type>>::value>::type>
        {
            using type = typename layer_helper<i,T>::type;
            static type& layer(T& n)
            {
                return layer_helper<i,T>::layer(n);
            }
        };
    }

    template <unsigned int i, typename T>
    typename impl::layer_helper<i,T>::type& layer (T& n) 
    {
        return impl::layer_helper<i,T>::layer(n);
    }

    template <template<typename> class Match, typename T>
    typename impl::layer_helper_match<Match,T,0>::type& layer (T& n) 
    {
        return impl::layer_helper_match<Match,T,0>::layer(n);
    }

    template <template<typename> class Match, unsigned int i, typename T>
    typename impl::layer_helper_match<Match,T,i>::type& layer (T& n) 
    {
        return impl::layer_helper_match<Match,T,i>::layer(n);
    }
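
    /*!
        Usage sketch (added for clarity, not part of the original header): layer() walks
        down the subnet() chain at compile time.  Given some network object net whose type
        contains a tag1 layer (net and its concrete type are hypothetical here):

            auto& third     = layer<3>(net);       // the layer 3 subnet() hops down
            auto& tagged    = layer<tag1>(net);    // the first tag1 layer in the chain
            auto& below_tag = layer<tag1,2>(net);  // 2 layers below that tag1 layer
            const tensor& out = tagged.get_output();
    !*/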

// ----------------------------------------------------------------------------------------

    template <template<typename> class TAG_TYPE, typename SUBNET>
    class add_skip_layer
    {
    public:
        typedef SUBNET subnet_type;
        typedef typename subnet_type::input_type input_type;
        const static size_t num_layers = subnet_type::num_layers + 1;
        const static size_t num_computational_layers = subnet_type::num_computational_layers;
        const static unsigned int sample_expansion_factor = subnet_type::sample_expansion_factor;
        static_assert(sample_expansion_factor >= 1,
            "The input layer can't produce fewer output tensors than there are inputs.");

        add_skip_layer() = default;
        add_skip_layer(const add_skip_layer&) = default;
        add_skip_layer(add_skip_layer&&) = default;
        add_skip_layer& operator=(add_skip_layer&&) = default;
        add_skip_layer& operator=(const add_skip_layer&) = default;

        template <typename T>
        add_skip_layer(
            const add_skip_layer<TAG_TYPE,T>& item
        ) : subnetwork(item.subnet())
        {}

        template <typename ...T>
        add_skip_layer(
            T ...args
        ) : 
            subnetwork(std::move(args)...) 
        {
        }

        template <typename input_iterator>
        void to_tensor (
            input_iterator ibegin,
            input_iterator iend,
            resizable_tensor& data
        ) const
        {
            subnetwork.to_tensor(ibegin,iend,data);
        }

        template <typename input_iterator>
        const tensor& operator() (
            input_iterator ibegin,
            input_iterator iend
        )
        {
            subnetwork(ibegin,iend);
            return layer<TAG_TYPE>(subnetwork).get_output();
        }

        const tensor& operator() (const input_type& x)
        {
            subnetwork(x);
            return layer<TAG_TYPE>(subnetwork).get_output();
        }

        const tensor& forward(const tensor& x)
        {
            subnetwork.forward(x);
            return layer<TAG_TYPE>(subnetwork).get_output();
        }

        const tensor& get_output() const 
        { 
            return layer<TAG_TYPE>(subnetwork).get_output();
        }

        tensor& get_gradient_input() 
        { 
            return layer<TAG_TYPE>(subnetwork).get_gradient_input();
        }

        const tensor& get_final_data_gradient(
        ) const 
        { 
            return subnetwork.get_final_data_gradient(); 
        }

        void back_propagate_error(const tensor& x)
        {
            subnetwork.back_propagate_error(x);
        }

        template <typename solver_type>
        void update_parameters(sstack<solver_type> solvers, double step_size)
        {
            subnetwork.update_parameters(solvers, step_size);
        }

        const tensor& get_parameter_gradient(
        ) const { return params_grad; }

        tensor& get_parameter_gradient (
        ) { return params_grad; }


        const subnet_type& subnet() const 
        { 
            return subnetwork; 
        }

        subnet_type& subnet() 
        { 
            return subnetwork; 
        }

        void clean()
        {
            subnetwork.clean();
        }

        friend void serialize(const add_skip_layer& item, std::ostream& out)
        {
            int version = 1;
            serialize(version, out);
            serialize(item.subnetwork, out);
        }

        friend void deserialize(add_skip_layer& item, std::istream& in)
        {
            int version = 0;
            deserialize(version, in);
            if (version != 1)
                throw serialization_error("Unexpected version found while deserializing dlib::add_skip_layer.");
            deserialize(item.subnetwork, in);
        }

        friend std::ostream& operator<< (std::ostream& out, const add_skip_layer& item)
        {
            item.print(out, 0);
            return out;
        }

        void print (std::ostream& out, unsigned long idx=0) const
        {
            out << "layer<" << idx << ">\tskip\n";
            subnet().print(out, idx+1);
        }

    private:


        template <typename T, typename U, typename E>
        friend class add_layer;
        template <typename T, bool is_first, typename E>
        friend class dimpl::subnet_wrapper;
        template <unsigned long T, typename U, typename E>
        friend class add_tag_layer;
        template <template<typename> class T, typename U>
        friend class add_skip_layer;
        template <size_t N, template<typename> class L, typename S>
        friend class repeat;

        bool this_layer_requires_forward_output(
        ) { return layer<TAG_TYPE>(subnetwork).this_layer_requires_forward_output(); } 

        void disable_output_and_gradient_getters (
        ) { layer<TAG_TYPE>(subnetwork).disable_output_and_gradient_getters(); }

        tensor& private_get_output() const
        { return layer<TAG_TYPE>(subnetwork).private_get_output(); }
        tensor& private_get_gradient_input() 
        { return layer<TAG_TYPE>(subnetwork).private_get_gradient_input(); }

        subnet_type subnetwork;

        // This member doesn't logically contribute to the state of the object since it is
        // always empty. It's just here so we can have the get_parameter_gradient() methods
        // which have to return something.  So they return this empty tensor.
        resizable_tensor params_grad;
    };
    template <template<typename> class T, typename U>
    struct is_nonloss_layer_type<add_skip_layer<T,U>> : std::true_type {};

    template <typename SUBNET> using tag1  = add_tag_layer< 1, SUBNET>;
    template <typename SUBNET> using tag2  = add_tag_layer< 2, SUBNET>;
    template <typename SUBNET> using tag3  = add_tag_layer< 3, SUBNET>;
    template <typename SUBNET> using tag4  = add_tag_layer< 4, SUBNET>;
    template <typename SUBNET> using tag5  = add_tag_layer< 5, SUBNET>;
    template <typename SUBNET> using tag6  = add_tag_layer< 6, SUBNET>;
    template <typename SUBNET> using tag7  = add_tag_layer< 7, SUBNET>;
    template <typename SUBNET> using tag8  = add_tag_layer< 8, SUBNET>;
    template <typename SUBNET> using tag9  = add_tag_layer< 9, SUBNET>;
    template <typename SUBNET> using tag10 = add_tag_layer<10, SUBNET>;

    template <typename SUBNET> using skip1  = add_skip_layer< tag1, SUBNET>;
    template <typename SUBNET> using skip2  = add_skip_layer< tag2, SUBNET>;
    template <typename SUBNET> using skip3  = add_skip_layer< tag3, SUBNET>;
    template <typename SUBNET> using skip4  = add_skip_layer< tag4, SUBNET>;
    template <typename SUBNET> using skip5  = add_skip_layer< tag5, SUBNET>;
    template <typename SUBNET> using skip6  = add_skip_layer< tag6, SUBNET>;
    template <typename SUBNET> using skip7  = add_skip_layer< tag7, SUBNET>;
    template <typename SUBNET> using skip8  = add_skip_layer< tag8, SUBNET>;
    template <typename SUBNET> using skip9  = add_skip_layer< tag9, SUBNET>;
    template <typename SUBNET> using skip10 = add_skip_layer<tag10, SUBNET>;
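
    /*!
        Usage sketch (added for clarity, not part of the original header): tagN marks a
        point in a network and skipN jumps back to the matching tag, which is how branched
        architectures are written as linear type chains.  The computational layers below
        (con, relu, add_prev1) are hypothetical placeholders defined outside this file.

            // A block that combines its output with the tensor remembered at tag1:
            template <typename SUBNET>
            using residual = add_prev1<relu<con<tag1<SUBNET>>>>;

            // A skip layer instead forwards exactly the tensor produced at the tag1
            // layer found somewhere below it in SUBNET, ignoring everything in between:
            template <typename SUBNET>
            using branch = skip1<relu<con<SUBNET>>>;
    !*/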

// ----------------------------------------------------------------------------------------

    namespace timpl
    {
        inline void fill_with_gassuan_random_numbers (
            tensor& t,
            dlib::rand& rnd,
            double sigma = 1
        )
        {
            float* data = t.host();
            for (size_t i = 0; i < t.size(); ++i)
                data[i] = rnd.get_random_gaussian()*sigma;
        }

        class test_layer_subnet 
        {
        public:
            test_layer_subnet (
                dlib::rand& rnd_
            ) : rnd(rnd_) 
            {
                // Output and gradient_input have to have the same dimensions in each
                // layer.
                const long num_samples = rnd.get_random_32bit_number()%4+3;
                const long k  = rnd.get_random_32bit_number()%4+2;
                const long nr = rnd.get_random_32bit_number()%4+2;
                const long nc = rnd.get_random_32bit_number()%4+2;

                output.set_size(num_samples, k, nr, nc);
                gradient_input.set_size(num_samples, k, nr, nc);

                // Use a non-zero initial gradient to make sure the layers add to it
                // rather than assign and blow away the initial value.
                fill_with_gassuan_random_numbers(gradient_input, rnd, 0.01);

                fill_with_gassuan_random_numbers(output, rnd);
            }


            tensor& get_mutable_output() { return output; }
            const tensor& get_output() const { return output; }
            const tensor& private_get_output() const { return get_output(); }
            const test_layer_subnet& subnet() const { init_sub(); return *subnetwork; }

            tensor& get_gradient_input() { return gradient_input; }
            tensor& private_get_gradient_input() { return get_gradient_input(); }
            test_layer_subnet& subnet() { init_sub(); return *subnetwork; }



            unsigned long count_outputs() const
            {
                if (subnetwork)
                    return subnetwork->count_outputs() + output.size();
2712
2713
2714
2715
2716
2717
2718
2719
2720
                else
                    return output.size();
            }

            float& get_output_element(unsigned long i)
            {
                if (i < output.size())
                    return output.host()[i];
                else
                    return subnet().get_output_element(i-output.size());
            }

            float get_gradient_input_element(unsigned long i) const
            {
                if (i < gradient_input.size())
                    return gradient_input.host()[i];
                else
                    return subnet().get_gradient_input_element(i-gradient_input.size());
            }


        private:
            // We lazily initialize sub-layers as needed when someone tries to call
            // subnet()
            void init_sub() const
            {
                if (!subnetwork)
                    subnetwork.reset(new test_layer_subnet(rnd));
            }

            dlib::rand& rnd;
            mutable std::unique_ptr<test_layer_subnet> subnetwork;
            resizable_tensor output;
            resizable_tensor gradient_input;
        };

    }

    struct layer_test_results
    {
        layer_test_results() : was_good(true) {}
        explicit layer_test_results(const std::string& l) : log(l),was_good(false) {}

        std::string log;
        bool was_good;

        operator bool() const { return was_good; }
    };
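    // Note: a layer_test_results object converts to true when the test passed.  On
    // failure, was_good is false and the log field holds a human readable description of
    // what went wrong (which the operator<< below prints).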

    inline std::ostream& operator<< (std::ostream& out, const layer_test_results& item)
    {
        out << item.log;
        return out;
    }

    template <
        typename layer_details_type
        >
    layer_test_results impl_test_layer (
        layer_details_type l,
        const float base_eps 
    )
    {
        using namespace timpl;
        // Do some setup
        running_stats<double> rs_data, rs_params;
        dlib::rand rnd;
        std::ostringstream sout;
        for (int iter = 0; iter < 10; ++iter)
        {
            test_layer_subnet subnetwork(rnd);
            resizable_tensor output, out2, out3;
            // Run setup() and forward() as well to make sure any calls to subnet() have
            // happened before we start assuming we know how many data elements there are
            // (since we do a lazy layer creation thing based on calls to subnet() inside
            // test_layer_subnet).
            l.setup(subnetwork);
            impl::call_layer_forward(l, subnetwork, output);

            resizable_tensor input_grad;
            input_grad.copy_size(output);
            fill_with_gassuan_random_numbers(input_grad, rnd);


            // The f() we are computing gradients of is this thing.  Its value at the current
            // parameter and data values is:
            //sout << "f(data,params): " << dot(output, input_grad) << std::endl;

            // We are going to save a copy of the subnetwork.get_gradient_input() data before we do
            // backpropagation since the backward() function is supposed to *add* to the
            // gradients rather than overwrite them.  We will use this saved data to check if
            // that is the case.
            const unsigned long num_data_inputs = subnetwork.count_outputs();
            std::vector<float> initial_gradient_input(num_data_inputs);
            for (unsigned long i = 0; i < num_data_inputs; ++i)
                initial_gradient_input[i] = subnetwork.get_gradient_input_element(i);

            // Now tell the layer to compute all the gradients.  In the rest of this function
            // we will just be checking that these gradients were computed correctly by
            // comparing them to a central differences approximation.
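            // (For reference, the central differences approximation used below is: for a
            //  single scalar value x and a small perturbation eps,
            //      df/dx  ~=  (f(x+eps) - f(x-eps)) / (2*eps)
            //  where f(data,params) = dot(output, input_grad).  Each check nudges one
            //  parameter or data element by +/- eps, reruns forward(), and compares the
            //  resulting estimate against the derivative produced by backward().)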
            resizable_tensor params_grad;
            params_grad.copy_size(l.get_layer_params());
            // But first, set the params grad to something crazy so that it's very obvious if
            // it doesn't get fully assigned.
            params_grad = std::numeric_limits<float>::infinity();
            impl::call_layer_backward(l, output, input_grad, subnetwork, params_grad);

            static_assert(impl::is_inplace_layer(l, subnetwork) == impl::has_inplace_backward(l, subnetwork),
                "Layer not defined correctly.  forward and backward methods must either both be in-place or both out-of-place. ");

            // Make sure the outputs of forward() and backward() are the same when they are run
            // in in-place mode.
            if (impl::is_inplace_layer(l, subnetwork))
            {
                test_layer_subnet subnetwork2(rnd);
                layer_details_type ll(l);
                ll.setup(subnetwork2);
                resizable_tensor ip_out;
                impl::call_layer_forward(ll, subnetwork2, ip_out);
                impl::call_layer_forward(ll, subnetwork2, subnetwork2.get_mutable_output());
                const auto forward_error = max(abs(mat(ip_out) - mat(subnetwork2.get_output())));
                if (forward_error > 0.00001)
                {
                    using namespace std;
                    sout << "This layer is supposed to support in-place computations but the output of forward_inplace()\n";
                    sout << "changes when invoked in-place vs. out-of-place. The error was: " << forward_error << endl;
                    return layer_test_results(sout.str()); 
                }

                resizable_tensor params_grad;
                params_grad.copy_size(ll.get_layer_params());
                params_grad = std::numeric_limits<float>::infinity();

                resizable_tensor input_grad;
                input_grad.copy_size(ip_out);
                fill_with_gassuan_random_numbers(input_grad, rnd);
                resizable_tensor params_grad1, params_grad2, data_grad1, data_grad2;
                params_grad1 = params_grad;
                params_grad2 = params_grad;
                // Now call backward() and make sure it works as well.
                subnetwork2.get_gradient_input() = 9999;
                impl::call_layer_backward(ll, ip_out, input_grad, subnetwork2, params_grad1);
                data_grad1 = subnetwork2.get_gradient_input();

                subnetwork2.get_gradient_input() = mat(input_grad);
                impl::call_layer_backward(ll, ip_out, subnetwork2.get_gradient_input(), subnetwork2, params_grad2);
                data_grad2 = subnetwork2.get_gradient_input();
                if (params_grad.size() != 0)
                {
                    const auto backward_param_error = max(abs(mat(params_grad1) - mat(params_grad2)));
                    if (backward_param_error > 0.00001)
                    {
                        using namespace std;
                        sout << "This layer is supposed to support in-place computations but the output of backward_inplace()\n";
                        sout << "changes when invoked in-place vs. out-of-place. The error was: " << backward_param_error << endl;
                        return layer_test_results(sout.str()); 
                    }
                }
                const auto backward_data_error = max(abs(mat(data_grad1) - mat(data_grad2)));
                if (backward_data_error > 0.00001)
                {
                    using namespace std;
                    sout << "This layer is supposed to support in-place computations but the output of backward_inplace()\n";
                    sout << "changes when invoked in-place vs. out-of-place. The error was: " << backward_data_error << endl;
                    return layer_test_results(sout.str()); 
                }
            }

            // ==================================================================
            // first validate the way the parameter gradients are computed
            for (unsigned long i = 0; i < params_grad.size(); ++i)
            {
                layer_details_type l1(l);

                float eps = l1.get_layer_params().host()[i]*base_eps;
                if (eps == 0)
                    eps = base_eps;
                const float oldval = l1.get_layer_params().host()[i];
                l1.get_layer_params().host()[i] = oldval+eps;
                impl::call_layer_forward(l1, subnetwork, out2);
                l1.get_layer_params().host()[i] = oldval-eps;
                impl::call_layer_forward(l1, subnetwork, out3);
                l1.get_layer_params().host()[i] = oldval;

                // Compute a reference derivative via a central differences approximation and
                // compare it to the one output by the layer and make sure they match.
                double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps);
                double output_derivative = params_grad.host()[i];
                double relative_error;
                if (reference_derivative != 0)
                    relative_error = (reference_derivative - output_derivative)/(reference_derivative);
                else
                    relative_error = (reference_derivative - output_derivative);
                double absolute_error = (reference_derivative - output_derivative);
                rs_params.add(std::abs(relative_error));
                if (std::abs(relative_error) > 0.05 && std::abs(absolute_error) > 0.006)
                {
                    using namespace std;
                    sout << "Gradient error in parameter #" << i <<".  Relative error: "<< relative_error << endl;
                    sout << "expected derivative: " << reference_derivative << endl;
                    sout << "output derivative:   " << output_derivative << endl;
                    sout << "iteration:           " << iter << endl;
                    return layer_test_results(sout.str()); 
                }
            }

            // ==================================================================
            // now validate the data gradients
            for (unsigned long i = 0; i < num_data_inputs; ++i)
            {
                const float oldval = subnetwork.get_output_element(i);
                float eps = oldval*base_eps;
                if (eps == 0)
                    eps = base_eps;
                subnetwork.get_output_element(i) = oldval+eps;
                impl::call_layer_forward(l, subnetwork, out2);
                subnetwork.get_output_element(i) = oldval-eps;
                impl::call_layer_forward(l, subnetwork, out3);
                subnetwork.get_output_element(i) = oldval;

                // Compute a reference derivative via a central differences approximation and
                // compare it to the one output by the layer and make sure they match.
                double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps);
                double output_derivative = subnetwork.get_gradient_input_element(i);
                if (!impl::is_inplace_layer(l,subnetwork))
                    output_derivative -= initial_gradient_input[i];
                double relative_error;
                if (reference_derivative != 0)
                    relative_error = (reference_derivative - output_derivative)/(reference_derivative);
                else
                    relative_error = (reference_derivative - output_derivative);
                double absolute_error = (reference_derivative - output_derivative);
                rs_data.add(std::abs(relative_error));
                if (std::abs(relative_error) > 0.05 && std::abs(absolute_error) > 0.006)
                {
                    using namespace std;
                    sout << "Gradient error in data variable #" << i <<".  Relative error: "<< relative_error << endl;
                    sout << "expected derivative: " << reference_derivative << endl;
                    sout << "output derivative:   " << output_derivative << endl;
                    sout << "iteration:           " << iter << endl;
                    return layer_test_results(sout.str()); 
                }
            }

        } // end for (int iter = 0; iter < 10; ++iter)

        if (rs_params.mean() > 0.003)
        {
            using namespace std;
            sout << "Average parameter gradient error is somewhat large at: "<< rs_params.mean() << endl;
            return layer_test_results(sout.str()); 
        }
        if (rs_data.mean() > 0.003)
        {
            using namespace std;
            sout << "Average data gradient error is somewhat large at: "<< rs_data.mean() << endl;
            return layer_test_results(sout.str()); 
        }

        return layer_test_results();
    }

    template <
        typename layer_details_type
        >
    layer_test_results test_layer (
        layer_details_type l
    )
    {
        // Try a few different derivative step sizes to see if any work. 
        for (float base_eps = 0.0001; base_eps < 0.1; base_eps *= 2)
        {
            auto result = impl_test_layer(l, base_eps);
            if (result)
                return result;
        }
        // However, if none of the step sizes worked then run the test one last time with
        // this step size; the result will most likely be an error report.
        return impl_test_layer(l, 0.01);
    }
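    // Example usage (a minimal sketch, not part of this header: it assumes a layer
    // object such as dlib::relu_ from <dlib/dnn/layers.h> is available and that
    // <iostream> has been included):
    //
    //     dlib::relu_ l;
    //     auto res = test_layer(l);
    //     if (!res)
    //         std::cout << res << std::endl;   // print the failure log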

// ----------------------------------------------------------------------------------------

    namespace impl
    {
        template <size_t i, size_t num>
        struct vlp_loop
        {
            template <typename T, typename U>
            static typename std::enable_if<!is_add_layer<U>::value>::type invoke_functor(T&& , size_t& , U&& )
            {
                // intentionally left empty
            }

            template <typename T, typename U>
            static typename std::enable_if<is_add_layer<U>::value>::type invoke_functor(T&& v , size_t& comp_i, U&& l )
            {
                v(comp_i, l.layer_details().get_layer_params());
                ++comp_i;
            }

            template <
                typename net_type,
                typename visitor
                >
            static void visit(
                size_t comp_i,
                net_type& net,
                visitor&& v
            )
            {
                invoke_functor(v, comp_i, layer<i>(net));
                vlp_loop<i+1, num>::visit(comp_i, net,v);
            }
        };

        template <size_t num>
        struct vlp_loop<num,num>
        {
            template <
                typename net_type,
                typename visitor
                >
            static void visit(
                size_t,
                net_type&,
                visitor&& 
            )
            {
                // Base case of recursion.  Don't do anything.
            }
        };

    }

    template <
        typename net_type,
        typename visitor
        >
    void visit_layer_parameters(
        net_type& net,
        visitor v
    )
    {
        size_t comp_i = 0;
        impl::vlp_loop<0, net_type::num_layers>::visit(comp_i, net, v);
    }
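    // Example usage (a minimal sketch; `net` stands for some already constructed dlib
    // network object, which is assumed rather than defined here):
    //
    //     visit_layer_parameters(net, [](size_t idx, tensor& params) {
    //         std::cout << "layer " << idx << " has " << params.size() << " parameters\n";
    //     });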

// ----------------------------------------------------------------------------------------

    namespace impl
    {
        template <size_t i, size_t num>
        struct vlpg_loop
        {
            template <typename T, typename U>
            static typename std::enable_if<!is_add_layer<U>::value>::type invoke_functor(T&& , size_t& , U&& )
            {
                // intentionally left empty
            }

            template <typename T, typename U>
            static typename std::enable_if<is_add_layer<U>::value>::type invoke_functor(T&& v , size_t& comp_i, U&& l )
            {
                v(comp_i, l.get_parameter_gradient());
                ++comp_i;
            }

            template <
                typename net_type,
                typename visitor
                >
            static void visit(
                size_t comp_i,
                net_type& net,
                visitor&& v
            )
            {
                invoke_functor(v, comp_i, layer<i>(net));
                vlpg_loop<i+1, num>::visit(comp_i, net,v);
            }
        };

        template <size_t num>
        struct vlpg_loop<num,num>
        {
            template <
                typename net_type,
                typename visitor
                >
            static void visit(
                size_t,
                net_type&,
                visitor&& 
            )
            {
                // Base case of recursion.  Don't do anything.
            }
        };

    }

    template <
        typename net_type,
        typename visitor
        >
    void visit_layer_parameter_gradients(
        net_type& net,
        visitor v
    )
    {
        size_t comp_i = 0;
        impl::vlpg_loop<0, net_type::num_layers>::visit(comp_i, net, v);
    }
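    // Example usage (a minimal sketch; as above, `net` is assumed to be a network on
    // which forward and backward passes have already been run):
    //
    //     visit_layer_parameter_gradients(net, [](size_t idx, tensor& grad) {
    //         std::cout << "layer " << idx << " gradient has " << grad.size() << " elements\n";
    //     });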

// ----------------------------------------------------------------------------------------

}

#endif // DLIB_DNn_CORE_H_