// Copyright (C) 2015  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CPU_cPP_
#define DLIB_DNN_CPU_cPP_

// This file contains CPU implementations of the GPU-based functions in cuda_dlib.h
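//
// Illustrative usage sketch (not part of this implementation; assumes the dlib tensor
// interface used throughout this file, and the values are made up for illustration):
//
//     dlib::resizable_tensor a, b;
//     a.set_size(2,3,4,4);
//     b.copy_size(a);
//     a = 1;
//     b = 2;
//     dlib::cpu::multiply(a, b);                      // a(i) *= b(i), elementwise
//     dlib::cpu::affine_transform(a, b, 2.0f, 1.0f);  // a(i) = 2*b(i) + 1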

#include "cpu_dlib.h"

namespace dlib
{
    namespace cpu 
    {

    // -----------------------------------------------------------------------------------

        void multiply (
            tensor& dest,
            const tensor& src
        )
        {
            DLIB_CASSERT(dest.size()==src.size(),"");
            const auto d = dest.host();
            const auto s = src.host();
            for (size_t i = 0; i < src.size(); ++i)
                d[i] *= s[i];
        }

    // -----------------------------------------------------------------------------------

        void multiply (
            tensor& dest,
            const tensor& src1,
            const tensor& src2
        )
        {
            DLIB_CASSERT(dest.size()==src1.size(),"");
            DLIB_CASSERT(dest.size()==src2.size(),"");
            const auto d = dest.host();
            const auto s1 = src1.host();
            const auto s2 = src2.host();
            for (size_t i = 0; i < src1.size(); ++i)
                d[i] = s1[i]*s2[i];
        }

    // -----------------------------------------------------------------------------------

        void affine_transform(
            tensor& dest,
            const tensor& src,
            const float A,
            const float B
        )
        {
            DLIB_CASSERT(dest.size()==src.size(),"");
            const auto d = dest.host();
            const auto s = src.host();
            for (size_t i = 0; i < src.size(); ++i)
                d[i] = A*s[i] + B;
        }

        void affine_transform(
            tensor& dest,
            const tensor& src1,
            const tensor& src2,
            const float A,
            const float B,
            const float C
        )
        {
            DLIB_CASSERT(dest.size()==src1.size(),"");
            DLIB_CASSERT(dest.size()==src2.size(),"");
            const auto d = dest.host();
            const auto s1 = src1.host();
            const auto s2 = src2.host();
            for (size_t i = 0; i < src1.size(); ++i)
                d[i] = A*s1[i] + B*s2[i] + C;
        }

        void affine_transform(
            tensor& dest,
            const tensor& src1,
            const tensor& src2,
            const tensor& src3,
            const float A,
            const float B,
            const float C,
            const float D
        )
        {
            DLIB_CASSERT(dest.size()==src1.size(),"");
            DLIB_CASSERT(dest.size()==src2.size(),"");
            DLIB_CASSERT(dest.size()==src3.size(),"");
            const auto d = dest.host();
            const auto s1 = src1.host();
            const auto s2 = src2.host();
            const auto s3 = src3.host();
            for (size_t i = 0; i < src1.size(); ++i)
                d[i] = A*s1[i] + B*s2[i] + C*s3[i] + D;
        }

    // -----------------------------------------------------------------------------------

        void affine_transform(
            tensor& dest,
            const tensor& src,
            const tensor& A,
            const tensor& B
        )
        {
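            // When A and B contain just one sample their values are broadcast across
            // every sample of src; otherwise the transform is applied strictly
            // elementwise.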
            DLIB_CASSERT(have_same_dimensions(dest,src),"");
            DLIB_CASSERT(
                  ((A.num_samples()==1 && B.num_samples()==1) ||
                  (A.num_samples()==src.num_samples() && B.num_samples()==src.num_samples())) &&
                  A.nr()==B.nr() && B.nr()==src.nr() &&
                  A.nc()==B.nc() && B.nc()==src.nc() &&
                  A.k() ==B.k()  && B.k()==src.k(),"");

            auto d = dest.host();
            auto s = src.host();
            const auto a = A.host();
            const auto b = B.host();
            if (A.num_samples() == 1)
            {
                const long num = src.size()/src.num_samples();
                for (long i = 0; i < src.num_samples(); ++i)
                {
                    for (long j = 0; j < num; ++j)
                    {
                        *d = a[j]*(*s) + b[j];
                        d++;
                        s++;
                    }
                }
            }
            else
            {
                for (size_t i = 0; i < src.size(); ++i)
                    d[i] = a[i]*s[i] + b[i];
            }
        }

    // -----------------------------------------------------------------------------------

        void batch_normalize (
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
            const tensor& src,
            const tensor& gamma, 
            const tensor& beta 
        )
        {
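            // Computes dest = gamma*(src - mean)/sqrt(var + eps) + beta, where the mean
            // and variance are estimated over the sample dimension separately for each
            // element position (k,r,c).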
            DLIB_CASSERT(
                src.num_samples() > 1 &&
                gamma.num_samples() == 1 && 
                beta.num_samples() == 1 && 
                gamma.nr() == beta.nr() && beta.nr() == src.nr() &&
                gamma.nc() == beta.nc() && beta.nc() == src.nc() &&
                gamma.k()  == beta.k()  && beta.k() == src.k(), 
                "\ngamma.num_samples(): " << gamma.num_samples() << 
                "\ngamma.k():  " << gamma.k() << 
                "\ngamma.nr(): " << gamma.nr() << 
                "\ngamma.nc(): " << gamma.nc() << 
                "\nbeta.num_samples(): " << beta.num_samples() << 
                "\nbeta.k():   " << beta.k() << 
                "\nbeta.nr():  " << beta.nr() << 
                "\nbeta.nc():  " << beta.nc() << 
                "\nsrc.k():   " << src.k() << 
                "\nsrc.nr():  " << src.nr() << 
                "\nsrc.nc():  " << src.nc() 
            );

            dest.copy_size(src);
            means.set_size(1, src.k(), src.nr(), src.nc());
            invstds.set_size(1, src.k(), src.nr(), src.nc());

            // first compute means and invstds
            means = 0;
            invstds = 0;
            const auto p_invstds = invstds.host();
            const auto p_means = means.host();
            auto p_src = src.host();
            const long num = src.k()*src.nr()*src.nc();
            // compute means, and sum of squares
            for (long i = 0; i < num; ++i)
            {
                for (long n = 0; n < src.num_samples(); ++n)
                {
                    float val = p_src[n*num+i];
                    p_means[i] += val;
                    p_invstds[i] += val*val;
                }
            }
            means /= src.num_samples();
            invstds /= src.num_samples();
            // copy data back to host
            invstds.host(); means.host();

            const float eps = 0.00001;
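            // At this point p_means[i] holds E[x] and p_invstds[i] holds E[x^2] for
            // element position i, so the variance is E[x^2] - E[x]^2 and we store
            // 1/sqrt(var + eps) in its place.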
            // compute variances 
            for (long i = 0; i < num; ++i)
            {
                auto actual_var = p_invstds[i] - p_means[i]*p_means[i];
                p_invstds[i] = 1.0f/std::sqrt(actual_var+eps);
            }

            p_src = src.host();
            auto p_dest = dest.host();
            const auto p_gamma = gamma.host();   
            const auto p_beta = beta.host();   
            for (long n = 0; n < src.num_samples(); ++n)
            {
                for (long i = 0; i < num; ++i)
                {
                    *p_dest = (*p_src - p_means[i])*p_invstds[i];
                    *p_dest = (*p_dest)*p_gamma[i] + p_beta[i];
                    ++p_src;
                    ++p_dest;
                }
            }
        }

        void batch_normalize_gradient::operator() (
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
            const tensor& src,
            const tensor& gamma,
            tensor& src_grad,
            tensor& gamma_grad, 
            tensor& beta_grad 
        )
        {
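            // The gradient is computed in three passes over the data: the first pass
            // accumulates dL/dvar along with the gamma and beta gradients, the second
            // accumulates dL/dmean, and the final pass combines them into dL/dsrc
            // (written to src_grad).  dvars and dmeans are scratch tensors held as
            // members of this object.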

            const long num = src.k()*src.nr()*src.nc();
            DLIB_CASSERT(num == means.size(),"");
            DLIB_CASSERT(num == invstds.size(),"");
            DLIB_CASSERT(num == gamma.size(),"");
            DLIB_CASSERT(num == gamma_grad.size(),"");
            DLIB_CASSERT(num == beta_grad.size(),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");

            beta_grad = 0;
            gamma_grad = 0;
            auto p_grad = gradient_input.host();
            auto p_src = src.host();
            const auto p_gamma = gamma.host();   
            const auto p_gamma_grad = gamma_grad.host();   
            const auto p_beta_grad = beta_grad.host();   
            const auto p_invstds = invstds.host();
            const auto p_means = means.host();

            dvars.copy_size(invstds);
            dmeans.copy_size(means);
            dvars = 0;
            dmeans = 0;
            const auto p_dvars = dvars.host();
            const auto p_dmeans = dmeans.host();

            for (long n = 0; n < src.num_samples(); ++n)
            {
                for (long i = 0; i < num; ++i)
                {
                    const float x_hat = (*p_src - p_means[i])*p_invstds[i];
                    p_beta_grad[i] += *p_grad;
                    p_gamma_grad[i] += (*p_grad)*x_hat;

                    const float dx = *p_grad * p_gamma[i];

                    p_dvars[i] += dx*(*p_src - p_means[i])*-0.5*std::pow(p_invstds[i], 3.0f);

                    ++p_grad;
                    ++p_src;
                }
            }

            const float invnum = 1.0f/src.num_samples();
            p_grad = gradient_input.host();
            p_src = src.host();
            for (long n = 0; n < src.num_samples(); ++n)
            {
                for (long i = 0; i < num; ++i)
                {
                    const float dx = *p_grad * p_gamma[i];

                    p_dmeans[i] += dx*-p_invstds[i] + p_dvars[i] * -2*(*p_src - p_means[i])*invnum;

                    ++p_grad;
                    ++p_src;
                }
            }
            p_grad = gradient_input.host();
            p_src = src.host();
            auto p_src_grad = src_grad.host();
            for (long n = 0; n < src.num_samples(); ++n)
            {
                for (long i = 0; i < num; ++i)
                {
                    const float dx = *p_grad * p_gamma[i];

                    *p_src_grad += dx*p_invstds[i] + 
                        p_dvars[i] *2*(*p_src - p_means[i])*invnum + 
                        p_dmeans[i]*invnum;


                    ++p_grad;
                    ++p_src;
                    ++p_src_grad;
                }
            }
        }

    // ----------------------------------------------------------------------------------------

        void batch_normalize_conv (
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
            const tensor& src,
            const tensor& gamma, 
            const tensor& beta 
        )
        {
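            // Same as batch_normalize() except that the mean and variance are estimated
            // per channel, pooling over both the sample and spatial dimensions, as is
            // usual for convolutional layers.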
            DLIB_CASSERT(
                src.num_samples() > 1 &&
                gamma.num_samples() == 1 && 
                beta.num_samples() == 1 && 
                gamma.nr() == 1 && 
                beta.nr() == 1 && 
                gamma.nc() == 1 && 
                beta.nc() == 1 && 
                gamma.k()  == beta.k()  && beta.k() == src.k(), 
                "\ngamma.num_samples(): " << gamma.num_samples() << 
                "\ngamma.k():  " << gamma.k() << 
                "\ngamma.nr(): " << gamma.nr() << 
                "\ngamma.nc(): " << gamma.nc() << 
                "\nbeta.num_samples(): " << beta.num_samples() << 
                "\nbeta.k():   " << beta.k() << 
                "\nbeta.nr():  " << beta.nr() << 
                "\nbeta.nc():  " << beta.nc() << 
                "\nsrc.k():   " << src.k() << 
                "\nsrc.nr():  " << src.nr() << 
                "\nsrc.nc():  " << src.nc() 
            );

            dest.copy_size(src);
            means.set_size(1, src.k());
            invstds.set_size(1, src.k());

            // first compute means and invstds
            means = 0;
            invstds = 0;
            const auto p_invstds = invstds.host();
            const auto p_means = means.host();
            const auto p_gamma = gamma.host();   
            const auto p_beta = beta.host();   
            auto p_src = src.host();
            const long num = src.nr()*src.nc();
            // compute means, and sum of squares
            for (long n = 0; n < src.num_samples(); ++n)
            {
                for (long k = 0; k < src.k(); ++k)
                {
                    for (long i = 0; i < num; ++i)
                    {
                        p_means[k] += *p_src;
                        p_invstds[k] += (*p_src)*(*p_src);
                        ++p_src;
                    }
                }
            }
            means /= src.num_samples()*num;
            invstds /= src.num_samples()*num;
            // copy data back to host
            invstds.host(); means.host();

            const float eps = 0.00001;
            p_src = src.host();
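            // As in batch_normalize(), convert the accumulated E[x^2] into
            // 1/sqrt(var + eps) using var = E[x^2] - E[x]^2, here with one statistic
            // per channel.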
            // compute variances 
            for (long k = 0; k < src.k(); ++k)
            {
                float actual_var = p_invstds[k] - p_means[k]*p_means[k];
                p_invstds[k] = 1.0f/std::sqrt(actual_var + eps);
            }

            p_src = src.host();
            auto p_dest = dest.host();
            for (long n = 0; n < src.num_samples(); ++n)
            {
                for (long k = 0; k < src.k(); ++k)
                {
                    for (long i = 0; i < num; ++i)
                    {
                        *p_dest = (*p_src - p_means[k])*p_invstds[k];
                        *p_dest = (*p_dest)*p_gamma[k] + p_beta[k];
                        ++p_src;
                        ++p_dest;
                    }
                }
            }
        }

        void batch_normalize_conv_gradient::operator() (
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
            const tensor& src,
            const tensor& gamma,
            tensor& src_grad,
            tensor& gamma_grad, 
            tensor& beta_grad 
        )
        {
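            // Same three-pass scheme as batch_normalize_gradient above, but with the
            // statistics (and therefore dvars/dmeans) indexed per channel rather than
            // per element position.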

            const long num = src.nr()*src.nc();
            DLIB_CASSERT(src.k() == means.size(),"");
            DLIB_CASSERT(src.k() == invstds.size(),"");
            DLIB_CASSERT(src.k() == gamma.size(),"");
            DLIB_CASSERT(src.k() == gamma_grad.size(),"");
            DLIB_CASSERT(src.k() == beta_grad.size(),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
            auto p_grad = gradient_input.host();
            auto p_src = src.host();
            const auto p_gamma = gamma.host();   
            const auto p_gamma_grad = gamma_grad.host();   
            const auto p_beta_grad = beta_grad.host();   
            const auto p_invstds = invstds.host();
            const auto p_means = means.host();

            dvars.copy_size(invstds);
            dmeans.copy_size(means);
            dvars = 0;
            dmeans = 0;
            const auto p_dvars = dvars.host();
            const auto p_dmeans = dmeans.host();

            for (long n = 0; n < src.num_samples(); ++n)
            {
                for (long k = 0; k < src.k(); ++k)
                {
                    const float invstd_pow = -0.5*std::pow(p_invstds[k], 3.0f);
                    for (long i = 0; i < num; ++i)
                    {
                        const float x_hat = (*p_src - p_means[k])*p_invstds[k];
                        p_beta_grad[k] += *p_grad;
                        p_gamma_grad[k] += (*p_grad)*x_hat;

                        const float dx = *p_grad * p_gamma[k];

                        p_dvars[k] += dx*(*p_src - p_means[k])*invstd_pow;

                        ++p_grad;
                        ++p_src;
                    }
                }
            }

            p_grad = gradient_input.host();
            p_src = src.host();
            const float invnum = 1.0f/(src.num_samples()*num);
            for (long n = 0; n < src.num_samples(); ++n)
            {
                for (long k = 0; k < src.k(); ++k)
                {
                    for (long i = 0; i < num; ++i)
                    {
                        const float dx = *p_grad * p_gamma[k];

                        p_dmeans[k] += -dx*p_invstds[k] + p_dvars[k] * -2*(*p_src - p_means[k])*invnum;

                        ++p_grad;
                        ++p_src;
                    }
                }
            }
            p_grad = gradient_input.host();
            p_src = src.host();
            auto p_src_grad = src_grad.host();
            for (long n = 0; n < src.num_samples(); ++n)
            {
                for (long k = 0; k < src.k(); ++k)
                {
                    for (long i = 0; i < num; ++i)
                    {
                        const float dx = *p_grad * p_gamma[k];

                        *p_src_grad += dx*p_invstds[k] + 
                            p_dvars[k]*2*(*p_src - p_means[k])*invnum + 
                            p_dmeans[k]*invnum;


                        ++p_grad;
                        ++p_src;
                        ++p_src_grad;
                    }
                }
            }
        }

    // -----------------------------------------------------------------------------------

        void threshold (
            tensor& data,
            float thresh
        )
        {
            const auto d = data.host();
            for (size_t i = 0; i < data.size(); ++i)
                d[i] = d[i]>thresh ? 1:0;
        }

    // -----------------------------------------------------------------------------------
    // -----------------------------------------------------------------------------------
    // -----------------------------------------------------------------------------------

        void softmax (
            tensor& dest,
            const tensor& src
        )
        {
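            // Computes a channel-wise softmax at every spatial location:
            //   dest(n,k,r,c) = exp(src(n,k,r,c)) / sum_j exp(src(n,j,r,c))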
            DLIB_CASSERT(have_same_dimensions(dest,src),"");
            const auto d = dest.host();
            const auto s = src.host();

            const long num = src.nr()*src.nc();
            // Note that we subtract out the max values in each channel before applying
            // exp() to avoid numeric overflow in the subsequent computations.  Doing this
            // doesn't change the resulting output, it just makes it more numerically
            // stable.
            for (long n = 0; n < src.num_samples(); ++n)
            {
                auto ss = s + num*src.k()*n;
                auto dd = d + num*src.k()*n;
                for (long i = 0; i < num; ++i)
                {
                    float max_val = -std::numeric_limits<float>::infinity();
                    for (long k = 0; k < src.k(); ++k)
                        max_val = std::max(max_val, ss[k*num]);

                    for (long k = 0; k < src.k(); ++k)
                        dd[k*num] = std::exp(ss[k*num]-max_val);

                    ++ss;
                    ++dd;
                }
            }

            // Now normalize each spatial location so the values across channels sum to 1.
            for (long n = 0; n < src.num_samples(); ++n)
            {
                const auto ss = s + num*src.k()*n;
                const auto dd = d + num*src.k()*n;
                for (long r = 0; r < src.nr(); ++r)
                {
                    for (long c = 0; c < src.nc(); ++c)
                    {
                        const auto sss = ss+r*src.nc()+c;
                        const auto ddd = dd+r*src.nc()+c;

                        float temp = 0;
                        for (long k = 0; k < src.k(); ++k)
                            temp += ddd[k*num];
                        for (long k = 0; k < src.k(); ++k)
                            ddd[k*num] /= temp;
                    }
                }
            }
        }

        void softmax_gradient (
            tensor& grad,
            const tensor& dest,
            const tensor& gradient_input
        )
        {
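            // Given dest = softmax(src) and gradient_input = dL/d(dest), this writes
            // dL/d(src) into grad using, at each spatial location,
            //   grad_k = dest_k*(gradient_input_k - sum_j dest_j*gradient_input_j)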
            DLIB_CASSERT(have_same_dimensions(grad,dest),"");
            DLIB_CASSERT(have_same_dimensions(grad,gradient_input),"");
            const auto d = dest.host();
            const auto g = grad.host();
            const auto in = gradient_input.host();

            const long num = grad.nr()*grad.nc();

            // Compute the softmax gradient at each spatial location.
            for (long n = 0; n < grad.num_samples(); ++n)
            {
                const auto d2 = d + num*grad.k()*n;
                const auto g2 = g + num*grad.k()*n;
                const auto in2 = in + num*grad.k()*n;
                for (long r = 0; r < grad.nr(); ++r)
                {
                    for (long c = 0; c < grad.nc(); ++c)
                    {
                        const auto d3 = d2+r*grad.nc()+c;
                        const auto g3 = g2+r*grad.nc()+c;
                        const auto in3 = in2+r*grad.nc()+c;

                        float temp = 0;
                        for (long k = 0; k < grad.k(); ++k)
                            temp += -d3[k*num]*in3[k*num];
                        for (long k = 0; k < grad.k(); ++k)
                            g3[k*num] = d3[k*num]*(temp+in3[k*num]);
                    }
                }
            }
        }

    // ------------------------------------------------------------------------------------

        void sigmoid (
            tensor& dest,
            const tensor& src
        )
        {
            const auto d = dest.host();
            const auto s = src.host();
            for (size_t i = 0; i < src.size(); ++i)
                d[i] = 1/(1+std::exp(-s[i]));
        }

        void sigmoid_gradient (
            tensor& grad,
            const tensor& dest,
            const tensor& gradient_input
        )
        {
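            // Uses d/dx sigmoid(x) = sigmoid(x)*(1 - sigmoid(x)), evaluated from the
            // saved outputs in dest.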
            const auto g = grad.host();
            const auto d = dest.host();
            const auto in = gradient_input.host();
            for (size_t i = 0; i < dest.size(); ++i)
                g[i] = in[i]*d[i]*(1-d[i]);
        }

    // ------------------------------------------------------------------------------------

        void relu (
            tensor& dest,
            const tensor& src
        )
        {
            dest = lowerbound(mat(src), 0);
        }

        void relu_gradient (
            tensor& grad,
            const tensor& dest,
            const tensor& gradient_input
        )
        {
            const float* gi = gradient_input.host();
            const float* in = dest.host();
            float* out = grad.host();
            for (size_t i = 0; i < dest.size(); ++i)
            {
                if (in[i] > 0)
                    out[i] = gi[i];
                else
                    out[i] = 0;
            }
        }

    // ------------------------------------------------------------------------------------

        void tanh (
            tensor& dest,
            const tensor& src
        )
        {
            const auto d = dest.host();
            const auto s = src.host();
            for (size_t i = 0; i < src.size(); ++i)
                d[i] = std::tanh(s[i]);
        }

        void tanh_gradient (
            tensor& grad,
            const tensor& dest,
            const tensor& gradient_input
        )
        {
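            // Uses d/dx tanh(x) = 1 - tanh(x)^2, evaluated from the saved outputs in dest.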
            const auto g = grad.host();
            const auto d = dest.host();
            const auto in = gradient_input.host();
            for (size_t i = 0; i < dest.size(); ++i)
                g[i] = in[i]*(1-d[i]*d[i]);
        }

    // ------------------------------------------------------------------------------------

    } 
}


#endif // DLIB_DNN_CPU_cPP_