// Copyright (C) 2015  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CPU_cPP_
#define DLIB_DNN_CPU_cPP_

// This file contains CPU implementations of the GPU based functions in cuda_dlib.h

#include "cpu_dlib.h"

namespace dlib
{
    namespace cpu 
    {

    // -----------------------------------------------------------------------------------

        void multiply (
            tensor& dest,
            const tensor& src
        )
        {
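            // Element-wise (Hadamard) product, computed in place on dest.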
            DLIB_CASSERT(dest.size()==src.size(),"");
            const auto d = dest.host();
            const auto s = src.host();
            for (size_t i = 0; i < src.size(); ++i)
                d[i] *= s[i];
        }

    // -----------------------------------------------------------------------------------

        void affine_transform(
            resizable_tensor& dest,
            const tensor& src,
            const float A,
            const float B
        )
        {
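            // dest = A*src + B, applied element-wise.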
            dest.copy_size(src);
            const auto d = dest.host();
            const auto s = src.host();
            for (size_t i = 0; i < src.size(); ++i)
                d[i] = A*s[i] + B;
        }

    // -----------------------------------------------------------------------------------

        void affine_transform(
            resizable_tensor& dest,
            const tensor& src,
            const tensor& A,
            const tensor& B
        )
        {
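            // dest = A*src + B with per-element coefficients.  A and B either
            // hold a single sample (which is then broadcast across all of
            // src's samples) or have the same number of samples as src.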
            DLIB_CASSERT(
                  ((A.num_samples()==1 && B.num_samples()==1) ||
                  (A.num_samples()==src.num_samples() && B.num_samples()==src.num_samples())) &&
                  A.nr()==B.nr() && B.nr()==src.nr() &&
                  A.nc()==B.nc() && B.nc()==src.nc() &&
                  A.k() ==B.k()  && B.k()==src.k(),"");

            dest.copy_size(src);
            auto d = dest.host();
            auto s = src.host();
            const auto a = A.host();
            const auto b = B.host();
            if (A.num_samples() == 1)
            {
                const long num = src.size()/src.num_samples();
                for (long i = 0; i < src.num_samples(); ++i)
                {
                    for (long j = 0; j < num; ++j)
                    {
                        *d = a[j]*(*s) + b[j];
                        d++;
                        s++;
                    }
                }
            }
            else
            {
                for (size_t i = 0; i < src.size(); ++i)
                    d[i] = a[i]*s[i] + b[i];
            }
        }

    // -----------------------------------------------------------------------------------

        void batch_normalize (
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
            const tensor& src,
            const tensor& gamma, 
            const tensor& beta 
        )
        {
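            // Normalize each (k,r,c) position across the sample dimension:
            //   dest = gamma*(src - mean)/sqrt(var + eps) + beta
            // The computed means and inverse standard deviations are also
            // returned so batch_normalize_gradient() can reuse them.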
            DLIB_CASSERT(
                src.num_samples() > 1 &&
                gamma.num_samples() == 1 && 
                beta.num_samples() == 1 && 
                gamma.nr() == beta.nr() && beta.nr() == src.nr() &&
                gamma.nc() == beta.nc() && beta.nc() == src.nc() &&
                gamma.k()  == beta.k()  && beta.k() == src.k(), 
                "\ngamma.num_samples(): " << gamma.num_samples() << 
                "\ngamma.k():  " << gamma.k() << 
                "\ngamma.nr(): " << gamma.nr() << 
                "\ngamma.nc(): " << gamma.nc() << 
                "\nbeta.num_samples(): " << beta.num_samples() << 
                "\nbeta.k():   " << beta.k() << 
                "\nbeta.nr():  " << beta.nr() << 
                "\nbeta.nc():  " << beta.nc() << 
                "\nsrc.k():   " << src.k() << 
                "\nsrc.nr():  " << src.nr() << 
                "\nsrc.nc():  " << src.nc() 
            );

            dest.copy_size(src);
            means.set_size(1, src.k(), src.nr(), src.nc());
            invstds.set_size(1, src.k(), src.nr(), src.nc());

            // first compute means and invstds
            means = 0;
            invstds = 0;
            const auto p_invstds = invstds.host();
            const auto p_means = means.host();
            auto p_src = src.host();
            const long num = src.k()*src.nr()*src.nc();
            // compute means, and sum of squares
            for (long i = 0; i < num; ++i)
            {
                for (long n = 0; n < src.num_samples(); ++n)
                {
                    float val = p_src[n*num+i];
                    p_means[i] += val;
                    p_invstds[i] += val*val;
                }
            }
            means /= src.num_samples();
            invstds /= src.num_samples();
            // copy data back to host
            invstds.host(); means.host();

            const float eps = 0.00001f;
            p_src = src.host();
            // convert the accumulated second moments into inverse standard
            // deviations: var = E[x^2] - E[x]^2, invstd = 1/sqrt(var + eps)
            for (long i = 0; i < num; ++i)
            {
                auto actual_var = p_invstds[i] - p_means[i]*p_means[i];
                p_invstds[i] = 1.0f/std::sqrt(actual_var+eps);
            }

            p_src = src.host();
            auto p_dest = dest.host();
            const auto p_gamma = gamma.host();   
            const auto p_beta = beta.host();   
            for (long n = 0; n < src.num_samples(); ++n)
            {
                for (long i = 0; i < num; ++i)
                {
                    *p_dest = (*p_src - p_means[i])*p_invstds[i];
                    *p_dest = (*p_dest)*p_gamma[i] + p_beta[i];
                    ++p_src;
                    ++p_dest;
                }
            }
        }

        void batch_normalize_gradient (
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
            const tensor& src,
            const tensor& gamma,
            tensor& src_grad,
            tensor& gamma_grad, 
            tensor& beta_grad 
        )
        {
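            // Backpropagate through batch_normalize() using the standard
            // batch-normalization gradient equations.  The first loop
            // accumulates (+=) beta_grad, gamma_grad, and d(loss)/d(variance);
            // the factor -0.5*invstd^3 is d/dv of (v+eps)^(-1/2).  The second
            // loop accumulates d(loss)/d(mean) and the third adds the final
            // result into src_grad.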

            const long num = src.k()*src.nr()*src.nc();
            DLIB_CASSERT(num == means.size(),"");
            DLIB_CASSERT(num == invstds.size(),"");
            DLIB_CASSERT(num == gamma.size(),"");
            DLIB_CASSERT(num == gamma_grad.size(),"");
            DLIB_CASSERT(num == beta_grad.size(),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
            auto p_grad = gradient_input.host();
            auto p_src = src.host();
            const auto p_gamma = gamma.host();   
            const auto p_gamma_grad = gamma_grad.host();   
            const auto p_beta_grad = beta_grad.host();   
            const auto p_invstds = invstds.host();
            const auto p_means = means.host();

            resizable_tensor dvars, dmeans;
            dvars.copy_size(invstds);
            dmeans.copy_size(means);
            dvars = 0;
            dmeans = 0;
            const auto p_dvars = dvars.host();
            const auto p_dmeans = dmeans.host();

            for (long n = 0; n < src.num_samples(); ++n)
            {
                for (long i = 0; i < num; ++i)
                {
                    const float x_hat = (*p_src - p_means[i])*p_invstds[i];
                    p_beta_grad[i] += *p_grad;
                    p_gamma_grad[i] += (*p_grad)*x_hat;

                    const float dx = *p_grad * p_gamma[i];

                    p_dvars[i] += dx*(*p_src - p_means[i])*-0.5f*std::pow(p_invstds[i], 3.0f);

                    ++p_grad;
                    ++p_src;
                }
            }

            const float invnum = 1.0f/src.num_samples();
            p_grad = gradient_input.host();
            p_src = src.host();
            for (long n = 0; n < src.num_samples(); ++n)
            {
                for (long i = 0; i < num; ++i)
                {
                    const float dx = *p_grad * p_gamma[i];

                    p_dmeans[i] += dx*-p_invstds[i] + p_dvars[i] * -2*(*p_src - p_means[i])*invnum;

                    ++p_grad;
                    ++p_src;
                }
            }
            p_grad = gradient_input.host();
            p_src = src.host();
            auto p_src_grad = src_grad.host();
            for (long n = 0; n < src.num_samples(); ++n)
            {
                for (long i = 0; i < num; ++i)
                {
                    const float dx = *p_grad * p_gamma[i];

                    *p_src_grad += dx*p_invstds[i] + 
                        p_dvars[i] *2*(*p_src - p_means[i])*invnum + 
                        p_dmeans[i]*invnum;


                    ++p_grad;
                    ++p_src;
                    ++p_src_grad;
                }
            }
        }

    // ----------------------------------------------------------------------------------------

        void batch_normalize_conv (
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
            const tensor& src,
            const tensor& gamma, 
            const tensor& beta 
        )
        {
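            // Convolutional batch normalization: statistics are pooled over
            // both the sample and spatial dimensions, giving one mean and one
            // inverse standard deviation per channel k.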
            DLIB_CASSERT(
                src.num_samples() > 1 &&
                gamma.num_samples() == 1 && 
                beta.num_samples() == 1 && 
                gamma.nr() == 1 && 
                beta.nr() == 1 && 
                gamma.nc() == 1 && 
                beta.nc() == 1 && 
                gamma.k()  == beta.k()  && beta.k() == src.k(), 
                "\ngamma.num_samples(): " << gamma.num_samples() << 
                "\ngamma.k():  " << gamma.k() << 
                "\ngamma.nr(): " << gamma.nr() << 
                "\ngamma.nc(): " << gamma.nc() << 
                "\nbeta.num_samples(): " << beta.num_samples() << 
                "\nbeta.k():   " << beta.k() << 
                "\nbeta.nr():  " << beta.nr() << 
                "\nbeta.nc():  " << beta.nc() << 
                "\nsrc.k():   " << src.k() << 
                "\nsrc.nr():  " << src.nr() << 
                "\nsrc.nc():  " << src.nc() 
            );

            dest.copy_size(src);
            means.set_size(1, src.k());
            invstds.set_size(1, src.k());

            // first compute means and invstds
            means = 0;
            invstds = 0;
            const auto p_invstds = invstds.host();
            const auto p_means = means.host();
            const auto p_gamma = gamma.host();   
            const auto p_beta = beta.host();   
            auto p_src = src.host();
            const long num = src.nr()*src.nc();
            // compute means, and sum of squares
            for (long n = 0; n < src.num_samples(); ++n)
            {
                for (long k = 0; k < src.k(); ++k)
                {
                    for (long i = 0; i < num; ++i)
                    {
                        p_means[k] += *p_src;
                        p_invstds[k] += (*p_src)*(*p_src);
                        ++p_src;
                    }
                }
            }
            means /= src.num_samples()*num;
            invstds /= src.num_samples()*num;
            // copy data back to host
            invstds.host(); means.host();

            const float eps = 0.00001f;
            p_src = src.host();
            // convert the accumulated second moments into inverse standard
            // deviations, one per channel
            for (long k = 0; k < src.k(); ++k)
            {
                auto actual_var = p_invstds[k] - p_means[k]*p_means[k];
                p_invstds[k] = 1.0f/std::sqrt(actual_var + eps);
            }

            p_src = src.host();
            auto p_dest = dest.host();
            for (long n = 0; n < src.num_samples(); ++n)
            {
                for (long k = 0; k < src.k(); ++k)
                {
                    for (long i = 0; i < num; ++i)
                    {
                        *p_dest = (*p_src - p_means[k])*p_invstds[k];
                        *p_dest = (*p_dest)*p_gamma[k] + p_beta[k];
                        ++p_src;
                        ++p_dest;
                    }
                }
            }
        }

        void batch_normalize_conv_gradient (
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
            const tensor& src,
            const tensor& gamma,
            tensor& src_grad,
            tensor& gamma_grad, 
            tensor& beta_grad 
        )
        {
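            // Same gradient computation as batch_normalize_gradient(), but
            // with one statistic per channel, so the accumulations run over
            // samples and spatial positions.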

            const long num = src.nr()*src.nc();
            DLIB_CASSERT(src.k() == means.size(),"");
            DLIB_CASSERT(src.k() == invstds.size(),"");
            DLIB_CASSERT(src.k() == gamma.size(),"");
            DLIB_CASSERT(src.k() == gamma_grad.size(),"");
            DLIB_CASSERT(src.k() == beta_grad.size(),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src),"");
            DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad),"");
            auto p_grad = gradient_input.host();
            auto p_src = src.host();
            const auto p_gamma = gamma.host();   
            const auto p_gamma_grad = gamma_grad.host();   
            const auto p_beta_grad = beta_grad.host();   
            const auto p_invstds = invstds.host();
            const auto p_means = means.host();

            resizable_tensor dvars, dmeans;
            dvars.copy_size(invstds);
            dmeans.copy_size(means);
            dvars = 0;
            dmeans = 0;
            const auto p_dvars = dvars.host();
            const auto p_dmeans = dmeans.host();

            for (long n = 0; n < src.num_samples(); ++n)
            {
                for (long k = 0; k < src.k(); ++k)
                {
                    const auto invstd_pow = -0.5f*std::pow(p_invstds[k], 3.0f);
                    for (long i = 0; i < num; ++i)
                    {
                        const float x_hat = (*p_src - p_means[k])*p_invstds[k];
                        p_beta_grad[k] += *p_grad;
                        p_gamma_grad[k] += (*p_grad)*x_hat;

                        const float dx = *p_grad * p_gamma[k];

                        p_dvars[k] += dx*(*p_src - p_means[k])*invstd_pow;

                        ++p_grad;
                        ++p_src;
                    }
                }
            }

            p_grad = gradient_input.host();
            p_src = src.host();
            const float invnum = 1.0f/(src.num_samples()*num);
            for (long n = 0; n < src.num_samples(); ++n)
            {
                for (long k = 0; k < src.k(); ++k)
                {
                    for (long i = 0; i < num; ++i)
                    {
                        const float dx = *p_grad * p_gamma[k];

                        p_dmeans[k] += -dx*p_invstds[k] + p_dvars[k] * -2*(*p_src - p_means[k])*invnum;

                        ++p_grad;
                        ++p_src;
                    }
                }
            }
            p_grad = gradient_input.host();
            p_src = src.host();
            auto p_src_grad = src_grad.host();
            for (long n = 0; n < src.num_samples(); ++n)
            {
                for (long k = 0; k < src.k(); ++k)
                {
                    for (long i = 0; i < num; ++i)
                    {
                        const float dx = *p_grad * p_gamma[k];

                        *p_src_grad += dx*p_invstds[k] + 
                            p_dvars[k]*2*(*p_src - p_means[k])*invnum + 
                            p_dmeans[k]*invnum;


                        ++p_grad;
                        ++p_src;
                        ++p_src_grad;
                    }
                }
            }
        }

    // -----------------------------------------------------------------------------------

        void threshold (
            tensor& data,
            float thresh
        )
        {
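            // Binarize in place: 1 if the value is above thresh, else 0.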
            const auto d = data.host();
            for (size_t i = 0; i < data.size(); ++i)
                d[i] = d[i]>thresh ? 1:0;
        }

    // -----------------------------------------------------------------------------------
    // -----------------------------------------------------------------------------------
    // -----------------------------------------------------------------------------------

        void softmax (
            tensor& dest,
            const tensor& src
        )
        {
            // TODO
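            // A sketch of one possible implementation (the axis convention is
            // not pinned down yet): for each softmax group, subtract the max
            // for numerical stability, exponentiate, then divide by the sum
            // of the exponentials.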
            DLIB_CASSERT(false,"");
        }

        void softmax_gradient (
            tensor& grad,
            const tensor& dest,
            const tensor& gradient_input
        )
        {
            // TODO
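            // Sketch: for y = softmax(x) and upstream gradient g, the
            // backward pass is grad_i = y_i*(g_i - sum_j g_j*y_j), computed
            // per softmax group.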
            DLIB_CASSERT(false,"");
        }

    // ------------------------------------------------------------------------------------

        void sigmoid (
            tensor& dest,
            const tensor& src
        )
        {
            // TODO
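            // A minimal sketch of what this could look like (assumes dest is
            // sized to match src):
            //   const auto d = dest.host();
            //   const auto s = src.host();
            //   for (size_t i = 0; i < src.size(); ++i)
            //       d[i] = 1/(1+std::exp(-s[i]));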
            DLIB_CASSERT(false,"");
        }

        void sigmoid_gradient (
            tensor& grad,
            const tensor& dest,
            const tensor& gradient_input
        )
        {
            // TODO
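            // Sketch: since dest holds sigmoid(src), the gradient would be
            //   grad[i] = gradient_input[i]*dest[i]*(1-dest[i]);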
            DLIB_CASSERT(false,"");
        }

    // ------------------------------------------------------------------------------------

        void relu (
            tensor& dest,
            const tensor& src
        )
        {
            dest = lowerbound(mat(src), 0);
        }

        void relu_gradient (
            tensor& grad,
            const tensor& dest,
            const tensor& gradient_input
        )
        {
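            // Pass the gradient through where the forward output was
            // positive; zero it elsewhere.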
            const float* gi = gradient_input.host();
            const float* in = dest.host();
            float* out = grad.host();
            for (size_t i = 0; i < dest.size(); ++i)
            {
                if (in[i] > 0)
                    out[i] = gi[i];
                else
                    out[i] = 0;
            }
        }

    // ------------------------------------------------------------------------------------

        void tanh (
            tensor& dest,
            const tensor& src
        )
        {
            // TODO
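            // Sketch (assumes dest is sized to match src):
            //   d[i] = std::tanh(s[i]) for every element.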
            DLIB_CASSERT(false,"");
        }

        void tanh_gradient (
            tensor& grad,
            const tensor& dest,
            const tensor& gradient_input
        )
        {
            // TODO
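            // Sketch: since dest holds tanh(src), the gradient would be
            //   grad[i] = gradient_input[i]*(1-dest[i]*dest[i]);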
            DLIB_CASSERT(false,"");
        }

    // ------------------------------------------------------------------------------------

    } 
}


#endif // DLIB_DNN_CPU_cPP_