Commit 0886042c authored by lishen

dlib from github, version=19.24

parent 5b127120
Pipeline #262 failed with stages in 0 seconds
// Copyright (C) 2005 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_CRC32_KERNEl_1_
#define DLIB_CRC32_KERNEl_1_
#include "../algs.h"
#include <string>
#include <vector>
#include "crc32_kernel_abstract.h"
namespace dlib
{
class crc32
{
/*!
INITIAL VALUE
checksum == 0xFFFFFFFF
CONVENTION
get_checksum() == checksum ^ 0xFFFFFFFF
!*/
public:
// this is here for backwards compatibility with older versions of dlib.
typedef crc32 kernel_1a;
inline crc32 (
);
inline crc32 (
const std::string& item
);
inline crc32 (
const std::vector<char>& item
);
inline virtual ~crc32 (
);
inline void clear(
);
inline void add (
unsigned char item
);
inline void add (
const std::string& item
);
inline void add (
const std::vector<char>& item
);
inline operator unsigned long (
) const { return get_checksum(); }
inline unsigned long get_checksum (
) const;
inline void swap (
crc32& item
);
private:
unsigned long checksum;
inline unsigned long table (
unsigned int idx
) const
{
/*
// This code generates the crc_table used below.
unsigned long crc_table[256];
for (unsigned long i = 0; i < 256; ++i)
{
unsigned long temp = i;
for (unsigned long j = 0; j < 8; ++j)
{
if (temp&1)
temp = (temp>>1)^0xedb88320;
else
temp >>= 1;
}
crc_table[i] = temp;
std::cout << std::hex << crc_table[i] << std::endl;
}
*/
const static unsigned long crc_table[256] = {
0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x76dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
0xedb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x9b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91,
0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5,
0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f,
0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,
0x76dc4190, 0x1db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x6b6b51f, 0x9fbfe4a5, 0xe8b8d433,
0x7807c9a2, 0xf00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x86d3d2d, 0x91646c97, 0xe6635c01,
0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457,
0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb,
0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9,
0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad,
0xedb88320, 0x9abfb3b6, 0x3b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x4db2615, 0x73dc1683,
0xe3630b12, 0x94643b84, 0xd6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0xa00ae27, 0x7d079eb1,
0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7,
0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79,
0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f,
0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x26d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x5005713,
0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0xcb61b38, 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0xbdbdf21,
0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45,
0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db,
0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf,
0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
};
return crc_table[idx];
}
};
inline void swap (
crc32& a,
crc32& b
) { a.swap(b); }
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// member function definitions
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
crc32::
crc32 (
)
{
checksum = 0xFFFFFFFF;
}
// ----------------------------------------------------------------------------------------
crc32::
crc32 (
const std::string& item
)
{
checksum = 0xFFFFFFFF;
add(item);
}
// ----------------------------------------------------------------------------------------
crc32::
crc32 (
const std::vector<char>& item
)
{
checksum = 0xFFFFFFFF;
add(item);
}
// ----------------------------------------------------------------------------------------
crc32::
~crc32 (
)
{
}
// ----------------------------------------------------------------------------------------
void crc32::
clear(
)
{
checksum = 0xFFFFFFFF;
}
// ----------------------------------------------------------------------------------------
void crc32::
add (
unsigned char item
)
{
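// Standard table-driven byte update for the reflected CRC-32 polynomial 0xEDB88320:
// fold the new byte into the low 8 bits of the running remainder, then consume it
// via the precomputed table.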
checksum = (checksum>>8) ^ table((checksum^item) & 0xFF);
}
// ----------------------------------------------------------------------------------------
void crc32::
add (
const std::string& item
)
{
for (std::string::size_type i = 0; i < item.size(); ++i)
checksum = (checksum>>8) ^ table((checksum^item[i]) & 0xFF);
}
// ----------------------------------------------------------------------------------------
void crc32::
add (
const std::vector<char>& item
)
{
for (unsigned long i = 0; i < item.size(); ++i)
checksum = (checksum>>8) ^ table((checksum^item[i]) & 0xFF);
}
// ----------------------------------------------------------------------------------------
unsigned long crc32::
get_checksum (
) const
{
return checksum ^ 0xFFFFFFFF;
}
// ----------------------------------------------------------------------------------------
void crc32::
swap (
crc32& item
)
{
exchange(checksum,item.checksum);
}
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_CRC32_KERNEl_1_
// Copyright (C) 2005 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#undef DLIB_CRC32_KERNEl_ABSTRACT_
#ifdef DLIB_CRC32_KERNEl_ABSTRACT_
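// Note: the #undef immediately followed by #ifdef above guarantees this block is never
// compiled.  The abstract header exists only to document the crc32 interface.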
#include "../algs.h"
#include <string>
#include <vector>
namespace dlib
{
class crc32
{
/*!
INITIAL VALUE
The current checksum covers zero bytes.
get_checksum() == 0x00000000
WHAT THIS OBJECT REPRESENTS
This object represents the CRC32 algorithm for calculating
checksums.
!*/
public:
crc32 (
);
/*!
ensures
- #*this is properly initialized
!*/
crc32 (
const std::string& item
);
/*!
ensures
- #*this is properly initialized
- calls this->add(item).
(i.e. Using this constructor is the same as using the default
constructor and then calling add() on item)
!*/
crc32 (
const std::vector<char>& item
);
/*!
ensures
- #*this is properly initialized
- calls this->add(item).
(i.e. Using this constructor is the same as using the default
constructor and then calling add() on item)
!*/
virtual ~crc32 (
);
/*!
ensures
- any resources associated with *this have been released
!*/
void clear(
);
/*!
ensures
- #*this has its initial value
!*/
void add (
unsigned char item
);
/*!
ensures
- #get_checksum() == The checksum of all items added to *this previously
concatenated with item.
!*/
void add (
const std::string& item
);
/*!
ensures
- #get_checksum() == The checksum of all items added to *this previously
concatenated with item.
!*/
void add (
const std::vector<char>& item
);
/*!
ensures
- #get_checksum() == The checksum of all items added to *this previously
concatenated with item.
!*/
unsigned long get_checksum (
) const;
/*!
ensures
- returns the current checksum
!*/
operator unsigned long (
) const;
/*!
ensures
- returns get_checksum()
!*/
void swap (
crc32& item
);
/*!
ensures
- swaps *this and item
!*/
};
void swap (
crc32& a,
crc32& b
) { a.swap(b); }
/*!
provides a global swap function
!*/
}
#endif // DLIB_CRC32_KERNEl_ABSTRACT_
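// ----------------------------------------------------------------------------------------
// A minimal usage sketch of the crc32 interface documented above (illustrative only, not
// part of the dlib sources; it assumes dlib's parent directory is on the include path):
//
//     #include <iostream>
//     #include <dlib/crc32.h>
//
//     int main()
//     {
//         dlib::crc32 crc("hello");      // same as default-constructing and calling add("hello")
//         std::cout << crc.get_checksum() << "\n";
//         crc.add('!');                  // the checksum can be extended incrementally
//         std::cout << crc.get_checksum() << "\n";
//         return 0;
//     }
// ----------------------------------------------------------------------------------------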
#include "dlib_include_path_tutorial.txt"
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CPU_cPP_
#define DLIB_DNN_CPU_cPP_
// This file contains CPU implementations of the GPU based functions in cuda_dlib.h
#include "cpu_dlib.h"
#include "tensor_tools.h"
#include "../image_transforms/interpolation.h"
#include "../threads.h"
namespace dlib
{
namespace cpu
{
// -----------------------------------------------------------------------------------
void multiply (
bool add_to,
tensor& dest,
const tensor& src1,
const tensor& src2
)
{
DLIB_CASSERT(dest.k() == src1.k() && src1.k() == src2.k() &&
dest.nr() == src1.nr() && src1.nr() == src2.nr() &&
dest.nc() == src1.nc() && src1.nc() == src2.nc() );
const long MD = std::max(std::max(dest.num_samples(),src1.num_samples()),src2.num_samples());
DLIB_CASSERT((dest.num_samples()==1 || dest.num_samples()==MD) &&
(src1.num_samples()==1 || src1.num_samples()==MD) &&
(src2.num_samples()==1 || src2.num_samples()==MD) );
if (dest.size() == 0)
return;
const size_t max_size = std::max(std::max(dest.size(),src1.size()),src2.size());
const auto d = dest.host();
const auto s1 = src1.host();
const auto s2 = src2.host();
if (dest.size() == src1.size() && src1.size() == src2.size())
{
if (add_to)
{
for (size_t i = 0; i < src1.size(); ++i)
d[i] += s1[i]*s2[i];
}
else
{
for (size_t i = 0; i < src1.size(); ++i)
d[i] = s1[i]*s2[i];
}
}
else if (dest.num_samples() == 1)
{
if (!add_to)
{
for (size_t i = 0; i < dest.size(); ++i)
d[i] = 0;
}
for (size_t i = 0; i < max_size; ++i)
d[i%dest.size()] += s1[i%src1.size()]*s2[i%src2.size()];
}
else
{
if (add_to)
{
for (size_t i = 0; i < max_size; ++i)
d[i] += s1[i%src1.size()]*s2[i%src2.size()];
}
else
{
for (size_t i = 0; i < max_size; ++i)
d[i] = s1[i%src1.size()]*s2[i%src2.size()];
}
}
}
// ------------------------------------------------------------------------------------
void multiply_conv (
bool add_to,
tensor& dest,
const tensor& src1,
const tensor& src2
)
{
auto d = dest.host();
auto s1 = src1.host();
auto s2 = src2.host();
if (have_same_dimensions(dest,src1))
{
DLIB_CASSERT(src2.num_samples() == 1 && src2.nr() == 1 && src2.nc() == 1 && src2.k() == src1.k());
if (add_to)
{
for (long n = 0; n < dest.num_samples(); ++n)
{
for (long k = 0; k < dest.k(); ++k)
{
for (long r = 0; r < dest.nr(); ++r)
{
for (long c = 0; c < dest.nc(); ++c)
{
*d++ += (*s1++)*s2[k];
}
}
}
}
}
else
{
for (long n = 0; n < dest.num_samples(); ++n)
{
for (long k = 0; k < dest.k(); ++k)
{
for (long r = 0; r < dest.nr(); ++r)
{
for (long c = 0; c < dest.nc(); ++c)
{
*d++ = (*s1++)*s2[k];
}
}
}
}
}
}
else
{
DLIB_CASSERT(have_same_dimensions(src1,src2));
DLIB_CASSERT(dest.num_samples() == 1 && dest.nr() == 1 && dest.nc() == 1 && dest.k() == src1.k());
if (!add_to)
{
for (long k = 0; k < src1.k(); ++k)
d[k] = 0;
}
for (long n = 0; n < src1.num_samples(); ++n)
{
for (long k = 0; k < src1.k(); ++k)
{
for (long r = 0; r < src1.nr(); ++r)
{
for (long c = 0; c < src1.nc(); ++c)
{
d[k] += (*s1++)*(*s2++);
}
}
}
}
}
}
// ------------------------------------------------------------------------------------
void scale_channels (
bool add_to,
tensor& dest,
const tensor& src,
const tensor& scales
)
{
DLIB_CASSERT(have_same_dimensions(dest,src) &&
scales.num_samples() == src.num_samples() &&
scales.k() == src.k() &&
scales.nr() == 1 &&
scales.nc() == 1 );
if (dest.size() == 0)
return;
if (add_to)
{
auto d = dest.host();
auto s = src.host();
auto scal = scales.host();
for (long n = 0; n < src.num_samples(); ++n)
{
for (long k = 0; k < src.k(); ++k)
{
const auto scale = scal[n*scales.k() + k];
for (long r = 0; r < src.nr(); ++r)
{
for (long c = 0; c < src.nc(); ++c)
{
*d++ += (*s++) * scale;
}
}
}
}
}
else
{
auto d = dest.host_write_only();
auto s = src.host();
auto scal = scales.host();
for (long n = 0; n < src.num_samples(); ++n)
{
for (long k = 0; k < src.k(); ++k)
{
const auto scale = scal[n*scales.k() + k];
for (long r = 0; r < src.nr(); ++r)
{
for (long c = 0; c < src.nc(); ++c)
{
*d++ = (*s++) * scale;
}
}
}
}
}
}
// ------------------------------------------------------------------------------------
void add(
float beta,
tensor& dest,
float alpha,
const tensor& src
)
{
DLIB_CASSERT(
(have_same_dimensions(src, dest) ||
(src.num_samples()==1 && src.k()==dest.k() && src.nr()==1 && src.nc()==1) ||
(src.num_samples()==1 && src.k()==dest.k() && src.nr()==dest.nr() && src.nc()==dest.nc()) ||
(src.num_samples()==1 && src.k()==1 && src.nr()==dest.nr() && src.nc()==dest.nc()) ||
(src.num_samples()==dest.num_samples() && src.k()==1 && src.nr()==1 && src.nc()==1)) &&
is_same_object(src,dest) == false ,
"\n\t dest.num_samples(): " << dest.num_samples()
<<"\n\t dest.k(): " << dest.k()
<<"\n\t dest.nr(): " << dest.nr()
<<"\n\t dest.nc(): " << dest.nc()
<<"\n\t src.num_samples(): " << src.num_samples()
<<"\n\t src.k(): " << src.k()
<<"\n\t src.nr(): " << src.nr()
<<"\n\t src.nc(): " << src.nc()
);
if (beta == 0 && alpha == 0)
{
dest = 0;
return;
}
auto d = dest.host();
auto s = src.host();
for (long n = 0; n < dest.num_samples(); ++n)
{
const auto sn = src.num_samples()==1 ? 0:n;
for (long k = 0; k < dest.k(); ++k)
{
const auto sk = src.k()==1 ? 0:k;
for (long r = 0; r < dest.nr(); ++r)
{
const auto sr = src.nr()==1 ? 0:r;
for (long c = 0; c < dest.nc(); ++c)
{
const auto sc = src.nc()==1 ? 0:c;
const auto s_idx = ((sn*src.k() + sk)*src.nr() + sr)*src.nc() + sc;
*d = beta*(*d) + alpha*s[s_idx];
++d;
}
}
}
}
}
// ----------------------------------------------------------------------------------------
void add (
tensor& dest,
const tensor& src1,
const tensor& src2
)
{
auto d = dest.host();
auto s1 = src1.host();
auto s2 = src2.host();
// Do the simple and fast version if everything has the same dimensions
if (have_same_dimensions(dest, src1) &&
have_same_dimensions(dest, src2))
{
for (size_t i = 0; i < dest.size(); ++i)
d[i] = s1[i] + s2[i];
return;
}
// Otherwise, do the more complex version with bounds checking.
for (long n = 0; n < dest.num_samples(); ++n)
{
for (long k = 0; k < dest.k(); ++k)
{
for (long r = 0; r < dest.nr(); ++r)
{
for (long c = 0; c < dest.nc(); ++c)
{
float v1 = 0;
float v2 = 0;
// if this index is inside src1
if (n < src1.num_samples() &&
k < src1.k() &&
r < src1.nr() &&
c < src1.nc() )
{
const auto s_idx = ((n*src1.k() + k)*src1.nr() + r)*src1.nc() + c;
v1 = s1[s_idx];
}
// if this index is inside src2
if (n < src2.num_samples() &&
k < src2.k() &&
r < src2.nr() &&
c < src2.nc() )
{
const auto s_idx = ((n*src2.k() + k)*src2.nr() + r)*src2.nc() + c;
v2 = s2[s_idx];
}
*d = v1 + v2;
++d;
}
}
}
}
}
// ----------------------------------------------------------------------------------------
void multiply_zero_padded (
bool add_to,
tensor& dest,
const tensor& src1,
const tensor& src2
)
{
auto d = dest.host();
auto s1 = src1.host();
auto s2 = src2.host();
// Do the simple and fast version if everything has the same dimensions
if (have_same_dimensions(dest, src1) &&
have_same_dimensions(dest, src2))
{
if (add_to)
{
for (size_t i = 0; i < dest.size(); ++i)
d[i] += s1[i] * s2[i];
}
else
{
for (size_t i = 0; i < dest.size(); ++i)
d[i] = s1[i] * s2[i];
}
return;
}
// Otherwise, do the more complex version with bounds checking.
for (long n = 0; n < dest.num_samples(); ++n)
{
for (long k = 0; k < dest.k(); ++k)
{
for (long r = 0; r < dest.nr(); ++r)
{
for (long c = 0; c < dest.nc(); ++c)
{
float v1 = 0;
float v2 = 0;
// if this index is inside src1
if (n < src1.num_samples() &&
k < src1.k() &&
r < src1.nr() &&
c < src1.nc() )
{
const auto s_idx = ((n*src1.k() + k)*src1.nr() + r)*src1.nc() + c;
v1 = s1[s_idx];
}
// if this index is inside src2
if (n < src2.num_samples() &&
k < src2.k() &&
r < src2.nr() &&
c < src2.nc() )
{
const auto s_idx = ((n*src2.k() + k)*src2.nr() + r)*src2.nc() + c;
v2 = s2[s_idx];
}
if (add_to)
*d += v1 * v2;
else
*d = v1 * v2;
++d;
}
}
}
}
}
// ----------------------------------------------------------------------------------------
void assign_bias_gradient (
tensor& grad,
const tensor& gradient_input
)
{
DLIB_CASSERT(
grad.num_samples() == 1 &&
gradient_input.k() == grad.k() &&
gradient_input.nr() == grad.nr() &&
gradient_input.nc() == grad.nc() &&
gradient_input.size() > 0);
auto out = grad.host();
auto in = gradient_input.host();
for (size_t i = 0; i < grad.size(); ++i)
out[i] = *in++;
for (long j = 1; j < gradient_input.num_samples(); ++j)
{
for (size_t i = 0; i < grad.size(); ++i)
out[i] += *in++;
}
}
// ------------------------------------------------------------------------------------
void assign_conv_bias_gradient (
tensor& grad,
const tensor& gradient_input
)
{
DLIB_CASSERT(
grad.num_samples() == 1 &&
grad.k() >= 1 &&
grad.nr() == 1 &&
grad.nc() == 1 &&
gradient_input.k() == grad.k() &&
gradient_input.size() > 0 &&
is_same_object(grad,gradient_input) == false
);
auto g = grad.host();
auto gi = gradient_input.host();
for (long k = 0; k < gradient_input.k(); ++k)
g[k] = 0;
for (long n = 0; n < gradient_input.num_samples(); ++n)
{
for (long k = 0; k < gradient_input.k(); ++k)
{
for (long r = 0; r < gradient_input.nr(); ++r)
{
for (long c = 0; c < gradient_input.nc(); ++c)
{
g[k] += (*gi++);
}
}
}
}
}
// -----------------------------------------------------------------------------------
void affine_transform(
tensor& dest,
const tensor& src,
const float A,
const float B
)
{
DLIB_CASSERT(dest.size()==src.size());
const auto d = dest.host();
const auto s = src.host();
for (size_t i = 0; i < src.size(); ++i)
d[i] = A*s[i] + B;
}
void affine_transform(
tensor& dest,
const tensor& src1,
const tensor& src2,
const float A,
const float B,
const float C
)
{
DLIB_CASSERT(dest.size()==src1.size());
DLIB_CASSERT(dest.size()==src2.size());
const auto d = dest.host();
const auto s1 = src1.host();
const auto s2 = src2.host();
for (size_t i = 0; i < src1.size(); ++i)
d[i] = A*s1[i] + B*s2[i] + C;
}
void affine_transform(
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
const float A,
const float B,
const float C,
const float D
)
{
DLIB_CASSERT(dest.size()==src1.size());
DLIB_CASSERT(dest.size()==src2.size());
DLIB_CASSERT(dest.size()==src3.size());
const auto d = dest.host();
const auto s1 = src1.host();
const auto s2 = src2.host();
const auto s3 = src3.host();
for (size_t i = 0; i < src1.size(); ++i)
d[i] = A*s1[i] + B*s2[i] + C*s3[i] + D;
}
void affine_transform_range(
size_t begin,
size_t end,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
const float A,
const float B,
const float C
)
{
DLIB_CASSERT(dest.size()==src1.size());
DLIB_CASSERT(dest.size()==src2.size());
DLIB_CASSERT(dest.size()==src3.size());
DLIB_CASSERT(begin <= end && end <= dest.size());
const auto d = dest.host();
const auto s1 = src1.host();
const auto s2 = src2.host();
const auto s3 = src3.host();
for (size_t i = begin; i < end; ++i)
d[i] = A*s1[i] + B*s2[i] + C*s3[i];
}
// -----------------------------------------------------------------------------------
void affine_transform(
tensor& dest,
const tensor& src,
const tensor& A,
const tensor& B
)
{
DLIB_CASSERT(have_same_dimensions(dest,src));
DLIB_CASSERT(
((A.num_samples()==1 && B.num_samples()==1) ||
(A.num_samples()==src.num_samples() && B.num_samples()==src.num_samples())) &&
A.nr()==B.nr() && B.nr()==src.nr() &&
A.nc()==B.nc() && B.nc()==src.nc() &&
A.k() ==B.k() && B.k()==src.k());
auto d = dest.host();
auto s = src.host();
const auto a = A.host();
const auto b = B.host();
if (A.num_samples() == 1)
{
const long num = src.size()/src.num_samples();
for (long i = 0; i < src.num_samples(); ++i)
{
for (long j = 0; j < num; ++j)
{
*d = a[j]*(*s) + b[j];
d++;
s++;
}
}
}
else
{
for (size_t i = 0; i < src.size(); ++i)
d[i] = a[i]*s[i] + b[i];
}
}
// -----------------------------------------------------------------------------------
void affine_transform_conv(
tensor& dest,
const tensor& src,
const tensor& A,
const tensor& B
)
{
DLIB_CASSERT(have_same_dimensions(dest,src));
DLIB_CASSERT(have_same_dimensions(A,B));
DLIB_CASSERT(A.num_samples() == 1 &&
A.nr() == 1 &&
A.nc() == 1 &&
A.k() == src.k());
auto d = dest.host();
auto s = src.host();
const auto a = A.host();
const auto b = B.host();
for (long n = 0; n < dest.num_samples(); ++n)
{
for (long k = 0; k < dest.k(); ++k)
{
for (long r = 0; r < dest.nr(); ++r)
{
for (long c = 0; c < dest.nc(); ++c)
{
*d++ = a[k]*(*s++) + b[k];
}
}
}
}
}
// ----------------------------------------------------------------------------------------
void affine_transform(
const rectangle& rect,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
float A,
float B,
float C
)
{
DLIB_CASSERT(dest.size() == src1.size());
DLIB_CASSERT(dest.size() == src2.size());
DLIB_CASSERT(dest.size() == src3.size());
DLIB_CASSERT(dest.num_samples() == src1.num_samples());
DLIB_CASSERT(dest.num_samples() == src2.num_samples());
DLIB_CASSERT(dest.num_samples() == src3.num_samples());
DLIB_CASSERT(rectangle(0,0, dest.size()/dest.num_samples()-1, dest.num_samples()-1).contains(rect));
auto d = dest.host();
auto s1 = src1.host();
auto s2 = src2.host();
auto s3 = src3.host();
const auto nc = dest.size()/dest.num_samples();
for (long r = rect.top(); r <= rect.bottom(); ++r)
{
for (long c = rect.left(); c <= rect.right(); ++c)
{
auto idx = r*nc + c;
d[idx] = s1[idx]*A + s2[idx]*B + s3[idx]*C;
}
}
}
// -----------------------------------------------------------------------------------
void compute_adam_update (
size_t begin,
size_t end,
tensor& s,
tensor& m,
tensor& v,
const float t,
const float learning_rate,
const float weight_decay,
const float momentum1,
const float momentum2,
const tensor& params,
const tensor& params_grad
)
{
DLIB_CASSERT(s.size() == m.size() &&
s.size() == v.size() &&
s.size() == params.size() &&
s.size() == params_grad.size());
DLIB_CASSERT(begin <= end && end <= params.size());
const float eps = 1e-8;
const float alpha = learning_rate*std::sqrt(1-std::pow(momentum2,t))/(1-std::pow(momentum1, t));
// The loop is equivalent to doing this:
// m = momentum1*m + (1-momentum1) * (weight_decay*params + params_grad);
// v = momentum2*v + (1-momentum2)*squared(weight_decay*params + params_grad);
// s = -alpha*m/(sqrt(v) + eps);
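// where alpha above already folds in Adam's bias-correction terms, i.e.
// alpha = learning_rate*sqrt(1-momentum2^t)/(1-momentum1^t).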
auto pm = m.host();
auto pv = v.host();
auto ps = s.host_write_only();
auto pparams = params.host();
auto ppgrad = params_grad.host();
for (size_t i = begin; i < end; ++i)
{
float g = weight_decay*pparams[i] + ppgrad[i];
pm[i] = momentum1*pm[i] + (1-momentum1)*g;
pv[i] = momentum2*pv[i] + (1-momentum2)*g*g;
ps[i] = -alpha*pm[i]/(std::sqrt(pv[i]) + eps);
}
}
// -----------------------------------------------------------------------------------
void batch_normalize_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
const tensor& beta,
const tensor& running_means,
const tensor& running_variances
)
{
DLIB_CASSERT(
gamma.num_samples() == 1 &&
gamma.nr() == src.nr() &&
gamma.nc() == src.nc() &&
gamma.k() == src.k() &&
have_same_dimensions(gamma, beta) &&
have_same_dimensions(gamma, running_means) &&
have_same_dimensions(gamma, running_variances) &&
eps > 0,
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
"\ngamma.nc(): " << gamma.nc() <<
"\nbeta.num_samples(): " << beta.num_samples() <<
"\nbeta.k(): " << beta.k() <<
"\nbeta.nr(): " << beta.nr() <<
"\nbeta.nc(): " << beta.nc() <<
"\nrunning_means.num_samples(): " << running_means.num_samples() <<
"\nrunning_means.k(): " << running_means.k() <<
"\nrunning_means.nr(): " << running_means.nr() <<
"\nrunning_means.nc(): " << running_means.nc() <<
"\nrunning_variances.num_samples(): " << running_variances.num_samples() <<
"\nrunning_variances.k(): " << running_variances.k() <<
"\nrunning_variances.nr(): " << running_variances.nr() <<
"\nrunning_variances.nc(): " << running_variances.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
);
dest.copy_size(src);
auto d = dest.host();
auto s = src.host();
auto g = gamma.host();
auto b = beta.host();
auto m = running_means.host();
auto v = running_variances.host();
const long num = src.k()*src.nr()*src.nc();
for (long n = 0; n < src.num_samples(); ++n)
{
for (long k = 0; k < num; ++k)
{
*d = g[k]*(*s - m[k])/std::sqrt(v[k]+eps) + b[k];
++d;
++s;
}
}
}
void batch_normalize (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
const double averaging_factor,
resizable_tensor& running_means,
resizable_tensor& running_variances,
const tensor& src,
const tensor& gamma,
const tensor& beta
)
{
DLIB_CASSERT(0 <= averaging_factor && averaging_factor <= 1, "averaging_factor: " << averaging_factor);
DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_means,means));
DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_variances,invstds));
DLIB_CASSERT(
src.num_samples() > 1 &&
gamma.num_samples() == 1 &&
beta.num_samples() == 1 &&
gamma.nr() == beta.nr() && beta.nr() == src.nr() &&
gamma.nc() == beta.nc() && beta.nc() == src.nc() &&
gamma.k() == beta.k() && beta.k() == src.k() &&
eps > 0,
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
"\ngamma.nc(): " << gamma.nc() <<
"\nbeta.num_samples(): " << beta.num_samples() <<
"\nbeta.k(): " << beta.k() <<
"\nbeta.nr(): " << beta.nr() <<
"\nbeta.nc(): " << beta.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
);
dest.copy_size(src);
means.set_size(1, src.k(), src.nr(), src.nc());
invstds.set_size(1, src.k(), src.nr(), src.nc());
// first compute means and invstds
means = 0;
invstds = 0;
const auto p_invstds = invstds.host();
const auto p_means = means.host();
auto p_src = src.host();
const long num = src.k()*src.nr()*src.nc();
// compute means, and sum of squares
for (long i = 0; i < num; ++i)
{
for (long n = 0; n < src.num_samples(); ++n)
{
float val = p_src[n*num+i];
p_means[i] += val;
p_invstds[i] += val*val;
}
}
means /= src.num_samples();
invstds /= src.num_samples();
// copy data back to host
invstds.host(); means.host();
// compute variances
running_variances.copy_size(invstds);
auto rvar = running_variances.host();
// This scale makes the running variances unbiased.
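// (Bessel's correction: the per-batch variance below divides by N, so multiplying by
// N/(N-1) yields an unbiased estimate, with N = src.num_samples().)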
const double scale = (src.num_samples())/(src.num_samples()-1.0);
for (long i = 0; i < num; ++i)
{
auto actual_var = p_invstds[i] - p_means[i]*p_means[i];
if (averaging_factor == 1)
rvar[i] = scale*actual_var;
else
rvar[i] = (1-averaging_factor)*rvar[i] + scale*averaging_factor*actual_var;
p_invstds[i] = 1.0f/std::sqrt(actual_var + eps);
}
p_src = src.host();
auto p_dest = dest.host();
const auto p_gamma = gamma.host();
const auto p_beta = beta.host();
for (long n = 0; n < src.num_samples(); ++n)
{
for (long i = 0; i < num; ++i)
{
*p_dest = (*p_src - p_means[i])*p_invstds[i];
*p_dest = (*p_dest)*p_gamma[i] + p_beta[i];
++p_src;
++p_dest;
}
}
// now keep track of the running means
running_means.copy_size(means);
if (averaging_factor != 1)
running_means = (1-averaging_factor)*mat(running_means) + averaging_factor*mat(means);
else
running_means = means;
}
void batch_normalize_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
const tensor& src,
const tensor& gamma,
tensor& src_grad,
tensor& gamma_grad,
tensor& beta_grad
)
{
const long num = src.k()*src.nr()*src.nc();
DLIB_CASSERT(src.num_samples() > 1);
DLIB_CASSERT(num == (long)means.size());
DLIB_CASSERT(num == (long)invstds.size());
DLIB_CASSERT(num == (long)gamma.size());
DLIB_CASSERT(num == (long)gamma_grad.size());
DLIB_CASSERT(num == (long)beta_grad.size());
DLIB_CASSERT(have_same_dimensions(gradient_input, src));
DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad));
DLIB_CASSERT(eps > 0);
beta_grad = 0;
gamma_grad = 0;
auto p_grad = gradient_input.host();
auto p_src = src.host();
const auto p_gamma = gamma.host();
const auto p_gamma_grad = gamma_grad.host();
const auto p_beta_grad = beta_grad.host();
const auto p_invstds = invstds.host();
const auto p_means = means.host();
resizable_tensor dvars, dmeans;
dvars.copy_size(invstds);
dmeans.copy_size(means);
dvars = 0;
dmeans = 0;
const auto p_dvars = dvars.host();
const auto p_dmeans = dmeans.host();
for (long n = 0; n < src.num_samples(); ++n)
{
for (long i = 0; i < num; ++i)
{
const float x_hat = (*p_src - p_means[i])*p_invstds[i];
p_beta_grad[i] += *p_grad;
p_gamma_grad[i] += (*p_grad)*x_hat;
const float dx = *p_grad * p_gamma[i];
p_dvars[i] += dx*(*p_src - p_means[i])*-0.5*std::pow(p_invstds[i], 3.0f);
++p_grad;
++p_src;
}
}
const float invnum = 1.0f/src.num_samples();
p_grad = gradient_input.host();
p_src = src.host();
for (long n = 0; n < src.num_samples(); ++n)
{
for (long i = 0; i < num; ++i)
{
const float dx = *p_grad * p_gamma[i];
p_dmeans[i] += dx*-p_invstds[i] + p_dvars[i] * -2*(*p_src - p_means[i])*invnum;
++p_grad;
++p_src;
}
}
p_grad = gradient_input.host();
p_src = src.host();
auto p_src_grad = src_grad.host();
for (long n = 0; n < src.num_samples(); ++n)
{
for (long i = 0; i < num; ++i)
{
const float dx = *p_grad * p_gamma[i];
*p_src_grad += dx*p_invstds[i] +
p_dvars[i] *2*(*p_src - p_means[i])*invnum +
p_dmeans[i]*invnum;
++p_grad;
++p_src;
++p_src_grad;
}
}
}
// ----------------------------------------------------------------------------------------
void batch_normalize_conv_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
const tensor& beta,
const tensor& running_means,
const tensor& running_variances
)
{
DLIB_CASSERT(
gamma.num_samples() == 1 &&
gamma.nr() == 1 &&
gamma.nc() == 1 &&
gamma.k() == src.k() &&
have_same_dimensions(gamma, beta) &&
have_same_dimensions(gamma, running_means) &&
have_same_dimensions(gamma, running_variances) &&
eps > 0,
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
"\ngamma.nc(): " << gamma.nc() <<
"\nbeta.num_samples(): " << beta.num_samples() <<
"\nbeta.k(): " << beta.k() <<
"\nbeta.nr(): " << beta.nr() <<
"\nbeta.nc(): " << beta.nc() <<
"\nrunning_means.num_samples(): " << running_means.num_samples() <<
"\nrunning_means.k(): " << running_means.k() <<
"\nrunning_means.nr(): " << running_means.nr() <<
"\nrunning_means.nc(): " << running_means.nc() <<
"\nrunning_variances.num_samples(): " << running_variances.num_samples() <<
"\nrunning_variances.k(): " << running_variances.k() <<
"\nrunning_variances.nr(): " << running_variances.nr() <<
"\nrunning_variances.nc(): " << running_variances.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
);
dest.copy_size(src);
auto d = dest.host();
auto s = src.host();
auto g = gamma.host();
auto b = beta.host();
auto m = running_means.host();
auto v = running_variances.host();
const long num = src.nr()*src.nc();
for (long n = 0; n < src.num_samples(); ++n)
{
for (long k = 0; k < src.k(); ++k)
{
const float invstd = 1.0f/std::sqrt(v[k] + eps);
for (long j = 0; j < num; ++j)
{
*d = g[k]*(*s - m[k])*invstd + b[k];
++d;
++s;
}
}
}
}
void batch_normalize_conv (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
const double averaging_factor,
resizable_tensor& running_means,
resizable_tensor& running_variances,
const tensor& src,
const tensor& gamma,
const tensor& beta
)
{
DLIB_CASSERT(0 <= averaging_factor && averaging_factor <= 1, "averaging_factor: " << averaging_factor);
DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_means,means));
DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_variances,invstds));
DLIB_CASSERT(
src.num_samples() > 1 &&
gamma.num_samples() == 1 &&
beta.num_samples() == 1 &&
gamma.nr() == 1 &&
beta.nr() == 1 &&
gamma.nc() == 1 &&
beta.nc() == 1 &&
gamma.k() == beta.k() && beta.k() == src.k() &&
eps > 0,
"\ngamma.num_samples(): " << gamma.num_samples() <<
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
"\ngamma.nc(): " << gamma.nc() <<
"\nbeta.num_samples(): " << beta.num_samples() <<
"\nbeta.k(): " << beta.k() <<
"\nbeta.nr(): " << beta.nr() <<
"\nbeta.nc(): " << beta.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
);
dest.copy_size(src);
means.set_size(1, src.k());
invstds.set_size(1, src.k());
// first compute means and invstds
means = 0;
invstds = 0;
const auto p_invstds = invstds.host();
const auto p_means = means.host();
const auto p_gamma = gamma.host();
const auto p_beta = beta.host();
auto p_src = src.host();
const long num = src.nr()*src.nc();
// compute means, and sum of squares
for (long n = 0; n < src.num_samples(); ++n)
{
for (long k = 0; k < src.k(); ++k)
{
for (long i = 0; i < num; ++i)
{
p_means[k] += *p_src;
p_invstds[k] += (*p_src)*(*p_src);
++p_src;
}
}
}
means /= src.num_samples()*num;
invstds /= src.num_samples()*num;
// copy data back to host
invstds.host(); means.host();
p_src = src.host();
// compute variances
running_variances.copy_size(invstds);
auto rvar = running_variances.host();
// This scale makes the running variances unbiased.
const double scale = (src.num_samples()*num)/(src.num_samples()*num-1.0);
for (long k = 0; k < src.k(); ++k)
{
float actual_var = p_invstds[k] - p_means[k]*p_means[k];
if (averaging_factor == 1)
rvar[k] = scale*actual_var;
else
rvar[k] = (1-averaging_factor)*rvar[k] + scale*averaging_factor*actual_var;
p_invstds[k] = 1.0f/std::sqrt(actual_var + eps);
}
p_src = src.host();
auto p_dest = dest.host();
for (long n = 0; n < src.num_samples(); ++n)
{
for (long k = 0; k < src.k(); ++k)
{
for (long i = 0; i < num; ++i)
{
*p_dest = (*p_src - p_means[k])*p_invstds[k];
*p_dest = (*p_dest)*p_gamma[k] + p_beta[k];
++p_src;
++p_dest;
}
}
}
// now keep track of the running means
running_means.copy_size(means);
if (averaging_factor != 1)
running_means = (1-averaging_factor)*mat(running_means) + averaging_factor*mat(means);
else
running_means = means;
}
void batch_normalize_conv_gradient(
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
const tensor& src,
const tensor& gamma,
tensor& src_grad,
tensor& gamma_grad,
tensor& beta_grad
)
{
const long num = src.nr()*src.nc();
DLIB_CASSERT(src.num_samples() > 1);
DLIB_CASSERT(src.k() == (long)means.size());
DLIB_CASSERT(src.k() == (long)invstds.size());
DLIB_CASSERT(src.k() == (long)gamma.size());
DLIB_CASSERT(src.k() == (long)gamma_grad.size());
DLIB_CASSERT(src.k() == (long)beta_grad.size());
DLIB_CASSERT(have_same_dimensions(gradient_input, src));
DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad));
DLIB_CASSERT(eps > 0);
beta_grad = 0;
gamma_grad = 0;
auto p_grad = gradient_input.host();
auto p_src = src.host();
const auto p_gamma = gamma.host();
const auto p_gamma_grad = gamma_grad.host();
const auto p_beta_grad = beta_grad.host();
const auto p_invstds = invstds.host();
const auto p_means = means.host();
resizable_tensor dvars, dmeans;
dvars.copy_size(invstds);
dmeans.copy_size(means);
dvars = 0;
dmeans = 0;
const auto p_dvars = dvars.host();
const auto p_dmeans = dmeans.host();
for (long n = 0; n < src.num_samples(); ++n)
{
for (long k = 0; k < src.k(); ++k)
{
const float invstd_pow = -0.5*std::pow(p_invstds[k], 3.0f);
for (long i = 0; i < num; ++i)
{
const float x_hat = (*p_src - p_means[k])*p_invstds[k];
p_beta_grad[k] += *p_grad;
p_gamma_grad[k] += (*p_grad)*x_hat;
const float dx = *p_grad * p_gamma[k];
p_dvars[k] += dx*(*p_src - p_means[k])*invstd_pow;
++p_grad;
++p_src;
}
}
}
p_grad = gradient_input.host();
p_src = src.host();
const float invnum = 1.0f/(src.num_samples()*num);
for (long n = 0; n < src.num_samples(); ++n)
{
for (long k = 0; k < src.k(); ++k)
{
for (long i = 0; i < num; ++i)
{
const float dx = *p_grad * p_gamma[k];
p_dmeans[k] += -dx*p_invstds[k] + p_dvars[k] * -2*(*p_src - p_means[k])*invnum;
++p_grad;
++p_src;
}
}
}
p_grad = gradient_input.host();
p_src = src.host();
auto p_src_grad = src_grad.host();
for (long n = 0; n < src.num_samples(); ++n)
{
for (long k = 0; k < src.k(); ++k)
{
for (long i = 0; i < num; ++i)
{
const float dx = *p_grad * p_gamma[k];
*p_src_grad += dx*p_invstds[k] +
p_dvars[k]*2*(*p_src - p_means[k])*invnum +
p_dmeans[k]*invnum;
++p_grad;
++p_src;
++p_src_grad;
}
}
}
}
// -----------------------------------------------------------------------------------
void layer_normalize (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
const tensor& src,
const tensor& gamma,
const tensor& beta
)
{
const long num = src.k() * src.nr() * src.nc();
DLIB_CASSERT(
have_same_dimensions(gamma, beta) &&
src.k() == gamma.k() &&
src.nr() == gamma.nr() &&
src.nc() == gamma.nc() &&
eps > 0,
"\ngamma.k(): " << gamma.k() <<
"\ngamma.nr(): " << gamma.nr() <<
"\ngamma.nc(): " << gamma.nc() <<
"\nbeta.k(): " << beta.k() <<
"\nbeta.nr(): " << beta.nr() <<
"\nbeta.nc(): " << beta.nc() <<
"\nsrc.k(): " << src.k() <<
"\nsrc.nr(): " << src.nr() <<
"\nsrc.nc(): " << src.nc() <<
"\neps: " << eps
);
dest.copy_size(src);
means.set_size(src.num_samples());
invstds.set_size(src.num_samples());
// first compute means and invstds
means = 0;
invstds = 0;
const auto p_invstds = invstds.host();
const auto p_means = means.host();
auto p_src = src.host();
// compute means, and sum of squares
for (long n = 0; n < src.num_samples(); ++n)
{
for (long i = 0; i < num; ++i)
{
float val = p_src[n*num+i];
p_means[n] += val;
p_invstds[n] += val*val;
}
}
means /= num;
invstds /= num;
// copy data back to host
invstds.host(); means.host();
// compute variances
for (long n = 0; n < src.num_samples(); ++n)
{
auto var = p_invstds[n] - p_means[n] * p_means[n];
p_invstds[n] = 1.0f / std::sqrt(var + eps);
}
p_src = src.host();
auto p_dest = dest.host();
auto p_gamma = gamma.host();
auto p_beta = beta.host();
for (long n = 0; n < src.num_samples(); ++n)
{
for (long i = 0; i < num; ++i)
{
*p_dest = (*p_src - p_means[n])*p_invstds[n];
*p_dest = (*p_dest)*p_gamma[i] + p_beta[i];
++p_src;
++p_dest;
}
}
}
void layer_normalize_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
const tensor& src,
const tensor& gamma,
tensor& src_grad,
tensor& gamma_grad,
tensor& beta_grad
)
{
const long num = src.k() * src.nr() * src.nc();
DLIB_CASSERT(src.num_samples() == means.size());
DLIB_CASSERT(src.num_samples() == invstds.size());
DLIB_CASSERT(src.k() == gamma.k());
DLIB_CASSERT(src.nr() == gamma_grad.nr());
DLIB_CASSERT(src.nc() == beta_grad.nc());
DLIB_CASSERT(have_same_dimensions(gradient_input, src));
DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad));
DLIB_CASSERT(have_same_dimensions(gamma_grad, beta_grad));
DLIB_CASSERT(eps > 0);
beta_grad = 0;
gamma_grad = 0;
auto p_grad = gradient_input.host();
auto p_src = src.host();
const auto p_gamma = gamma.host();
const auto p_gamma_grad = gamma_grad.host();
const auto p_beta_grad = beta_grad.host();
const auto p_invstds = invstds.host();
const auto p_means = means.host();
resizable_tensor dvars, dmeans;
dvars.copy_size(invstds);
dmeans.copy_size(means);
dvars = 0;
dmeans = 0;
const auto p_dvars = dvars.host();
const auto p_dmeans = dmeans.host();
for (long n = 0; n < src.num_samples(); ++n)
{
for (long i = 0; i < num; ++i)
{
const float x_hat = (*p_src - p_means[n])*p_invstds[n];
p_beta_grad[i] += *p_grad;
p_gamma_grad[i] += (*p_grad)*x_hat;
const float dx = *p_grad * p_gamma[i];
p_dvars[n] += dx*(*p_src - p_means[n])*-0.5*p_invstds[n]*p_invstds[n]*p_invstds[n];
++p_grad;
++p_src;
}
}
const float invnum = 1.0f/num;
p_grad = gradient_input.host();
p_src = src.host();
for (long n = 0; n < src.num_samples(); ++n)
{
for (long i = 0; i < num; ++i)
{
const float dx = *p_grad * p_gamma[i];
p_dmeans[n] += dx*-p_invstds[n] + p_dvars[n] * -2*(*p_src - p_means[n])*invnum;
++p_grad;
++p_src;
}
}
p_grad = gradient_input.host();
p_src = src.host();
auto p_src_grad = src_grad.host();
for (long n = 0; n < src.num_samples(); ++n)
{
for (long i = 0; i < num; ++i)
{
const float dx = *p_grad * p_gamma[i];
*p_src_grad += dx*p_invstds[n] +
p_dvars[n] *2*(*p_src - p_means[n])*invnum +
p_dmeans[n]*invnum;
++p_grad;
++p_src;
++p_src_grad;
}
}
}
// -----------------------------------------------------------------------------------
void threshold (
tensor& data,
float thresh
)
{
const auto d = data.host();
for (size_t i = 0; i < data.size(); ++i)
d[i] = d[i]>thresh ? 1:0;
}
void dot (
const tensor& a,
const tensor& b,
tensor& result,
size_t idx
)
{
DLIB_CASSERT(a.size() == b.size());
DLIB_CASSERT(idx < result.size());
const auto aa = a.host();
const auto bb = b.host();
auto r = result.host();
for (size_t i = 0; i < a.size(); ++i)
r[idx] += aa[i]*bb[i];
}
// -----------------------------------------------------------------------------------
// -----------------------------------------------------------------------------------
// -----------------------------------------------------------------------------------
namespace ttimpl
{
void softmax (
const long num_locations,
const long num_channels,
tensor& dest,
const tensor& src
)
{
DLIB_ASSERT(num_channels*num_locations == src.nr()*src.nc()*src.k());
DLIB_CASSERT(have_same_dimensions(dest,src));
const auto d = dest.host();
const auto s = src.host();
// Note that we subtract out the max value over the channels at each location before
// applying exp() to avoid numeric overflow in the subsequent computations. Doing this
// doesn't change the resulting output, it just makes the computation more numerically
// stable.
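// (For any constant m, exp(x_k - m)/sum_j exp(x_j - m) == exp(x_k)/sum_j exp(x_j).)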
for (long n = 0; n < src.num_samples(); ++n)
{
auto ss = s + num_locations*num_channels*n;
auto dd = d + num_locations*num_channels*n;
for (long i = 0; i < num_locations; ++i)
{
float max_val = -std::numeric_limits<float>::infinity();
for (long k = 0; k < num_channels; ++k)
max_val = std::max(max_val, ss[k*num_locations]);
for (long k = 0; k < num_channels; ++k)
dd[k*num_locations] = std::exp(ss[k*num_locations]-max_val);
++ss;
++dd;
}
}
// Now normalize each channel so they sum to 1.
for (long n = 0; n < src.num_samples(); ++n)
{
const auto dd = d + num_locations*num_channels*n;
for (long i = 0; i < num_locations; ++i)
{
const auto ddd = dd+i;
float temp = 0;
for (long k = 0; k < num_channels; ++k)
temp += ddd[k*num_locations];
for (long k = 0; k < num_channels; ++k)
ddd[k*num_locations] /= temp;
}
}
}
void softmax_gradient (
const long num_locations,
const long num_channels,
tensor& grad,
const tensor& dest,
const tensor& gradient_input
)
{
DLIB_ASSERT(num_channels*num_locations == grad.nr()*grad.nc()*grad.k());
DLIB_CASSERT(have_same_dimensions(grad,dest));
DLIB_CASSERT(have_same_dimensions(grad,gradient_input));
const auto d = dest.host();
const auto g = grad.host();
const auto in = gradient_input.host();
for (long n = 0; n < grad.num_samples(); ++n)
{
const auto d2 = d + num_locations*num_channels*n;
const auto g2 = g + num_locations*num_channels*n;
const auto in2 = in + num_locations*num_channels*n;
for (long i = 0; i < num_locations; ++i)
{
const auto d3 = d2+i;
const auto g3 = g2+i;
const auto in3 = in2+i;
float temp = 0;
for (long k = 0; k < num_channels; ++k)
temp += -d3[k*num_locations]*in3[k*num_locations];
if (is_same_object(gradient_input, grad))
{
for (long k = 0; k < num_channels; ++k)
g3[k*num_locations] = d3[k*num_locations]*(temp+in3[k*num_locations]);
}
else
{
for (long k = 0; k < num_channels; ++k)
g3[k*num_locations] += d3[k*num_locations]*(temp+in3[k*num_locations]);
}
}
}
}
}
// ----------------------------------------------------------------------------------------
void softmax (
tensor& dest,
const tensor& src
)
{
DLIB_CASSERT(have_same_dimensions(dest,src));
ttimpl::softmax(src.nr()*src.nc(), src.k(), dest, src);
}
void softmax_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
)
{
DLIB_CASSERT(have_same_dimensions(grad,dest));
DLIB_CASSERT(have_same_dimensions(grad,gradient_input));
ttimpl::softmax_gradient(grad.nr()*grad.nc(), grad.k(), grad, dest, gradient_input);
}
// ------------------------------------------------------------------------------------
void softmax_all (
tensor& dest,
const tensor& src
)
{
DLIB_CASSERT(have_same_dimensions(dest,src));
ttimpl::softmax(1, src.nr()*src.nc()*src.k(), dest, src);
}
void softmax_all_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
)
{
DLIB_CASSERT(have_same_dimensions(grad,dest));
DLIB_CASSERT(have_same_dimensions(grad,gradient_input));
ttimpl::softmax_gradient(1, grad.nr()*grad.nc()*grad.k(), grad, dest, gradient_input);
}
// ------------------------------------------------------------------------------------
void sigmoid (
tensor& dest,
const tensor& src
)
{
const auto d = dest.host();
const auto s = src.host();
for (size_t i = 0; i < src.size(); ++i)
d[i] = 1/(1+std::exp(-s[i]));
}
void sigmoid_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
)
{
const auto g = grad.host();
const auto d = dest.host();
const auto in = gradient_input.host();
if (is_same_object(gradient_input, grad))
{
for (size_t i = 0; i < dest.size(); ++i)
g[i] = in[i]*d[i]*(1-d[i]);
}
else
{
for (size_t i = 0; i < dest.size(); ++i)
g[i] += in[i]*d[i]*(1-d[i]);
}
}
// ------------------------------------------------------------------------------------
void mish (
tensor& dest,
const tensor& src
)
{
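// mish(x) = x*tanh(softplus(x)) = x*tanh(log(1+e^x)).  Writing delta = e^(2x) + 2e^x + 2
// gives tanh(log(1+e^x)) = (delta-2)/delta, so mish(x) = x - 2x/delta, which is the
// closed form used below.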
const auto d = dest.host_write_only();
const auto s = src.host();
for (size_t i = 0; i < src.size(); ++i)
{
const auto e = std::exp(s[i]);
const auto delta = 2*e + e*e + 2;
d[i] = s[i] - 2*s[i]/delta;
}
}
void mish_gradient(
tensor& grad,
const tensor& src,
const tensor& gradient_input
)
{
const auto g = grad.host();
const auto s = src.host();
const auto in = gradient_input.host();
const auto calculate_gradient = [](float x)
{
if (x >= 8)
return 1.f;
if (x <= -8)
return 0.f;
const auto e = std::exp(x);
const auto delta = 2*e + e*e + 2;
const auto omega = 4*(x + 1) + 4*e*e + e*e*e + e*(4*x + 6);
return e*omega/(delta*delta);
};
if (is_same_object(gradient_input, grad))
{
for (size_t i = 0; i < src.size(); ++i)
g[i] = in[i]*calculate_gradient(s[i]);
}
else
{
for (size_t i = 0; i < src.size(); ++i)
g[i] += in[i]*calculate_gradient(s[i]);
}
}
// ------------------------------------------------------------------------------------
void relu (
tensor& dest,
const tensor& src
)
{
dest = lowerbound(mat(src), 0);
}
void relu_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
)
{
const float* gi = gradient_input.host();
const float* in = dest.host();
float* out = grad.host();
if (is_same_object(grad, gradient_input))
{
for (size_t i = 0; i < dest.size(); ++i)
{
if (in[i] > 0)
out[i] = gi[i];
else
out[i] = 0;
}
}
else
{
for (size_t i = 0; i < dest.size(); ++i)
{
if (in[i] > 0)
out[i] += gi[i];
}
}
}
// ----------------------------------------------------------------------------------------
void prelu (
tensor& dest,
const tensor& src,
const tensor& param
)
{
const float p = param.host()[0];
const float* s = src.host();
float* d = dest.host();
for (size_t i = 0; i < dest.size(); ++i)
{
if (s[i] > 0)
d[i] = s[i];
else
d[i] = p*s[i];
}
}
void prelu_gradient (
tensor& grad,
const tensor& src,
const tensor& gradient_input,
const tensor& param,
tensor& params_grad
)
{
DLIB_CASSERT(is_same_object(grad, gradient_input) == false);
const float p = param.host()[0];
const float* gi = gradient_input.host();
const float* s = src.host();
float* out = grad.host();
float pgrad = 0;
for (size_t i = 0; i < src.size(); ++i)
{
if (s[i] > 0)
{
out[i] += gi[i];
}
else
{
out[i] += p*gi[i];
pgrad += gi[i]*s[i];
}
}
params_grad.host()[0] = pgrad;
}
// ------------------------------------------------------------------------------------
void leaky_relu (
tensor& dest,
const tensor& src,
const float alpha
)
{
const float* s = src.host();
float* d = dest.host();
for (size_t i = 0; i < dest.size(); ++i)
{
if (s[i] > 0)
d[i] = s[i];
else
d[i] = alpha * s[i];
}
}
void leaky_relu_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input,
const float alpha
)
{
const float* gi = gradient_input.host();
const float* in = dest.host();
float* out = grad.host();
if (is_same_object(grad, gradient_input))
{
for (size_t i = 0; i < dest.size(); ++i)
{
if (in[i] > 0)
out[i] = gi[i];
else
out[i] = alpha * gi[i];
}
}
else
{
for (size_t i = 0; i < dest.size(); ++i)
{
if (in[i] > 0)
out[i] += gi[i];
else
out[i] += alpha * gi[i];
}
}
}
// ------------------------------------------------------------------------------------
void tanh (
tensor& dest,
const tensor& src
)
{
const auto d = dest.host();
const auto s = src.host();
for (size_t i = 0; i < src.size(); ++i)
d[i] = std::tanh(s[i]);
}
void tanh_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
)
{
const auto g = grad.host();
const auto d = dest.host();
const auto in = gradient_input.host();
if (is_same_object(grad, gradient_input))
{
for (size_t i = 0; i < dest.size(); ++i)
g[i] = in[i]*(1-d[i]*d[i]);
}
else
{
for (size_t i = 0; i < dest.size(); ++i)
g[i] += in[i]*(1-d[i]*d[i]);
}
}
// ----------------------------------------------------------------------------------------
void clipped_relu (
tensor& dest,
const tensor& src,
const float ceiling
)
{
dest = upperbound(lowerbound(mat(src), 0), ceiling);
}
void clipped_relu_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input,
const float ceiling
)
{
const auto out = grad.host();
const auto in = dest.host();
const auto gi = gradient_input.host();
if (is_same_object(grad, gradient_input))
{
for (size_t i = 0; i < dest.size(); ++i)
{
if (in[i] > 0 && in[i] < ceiling)
out[i] = gi[i];
else
out[i] = 0;
}
}
else
{
for (size_t i = 0; i < dest.size(); ++i)
{
if (in[i] > 0 && in[i] < ceiling)
out[i] += gi[i];
}
}
}
// ----------------------------------------------------------------------------------------
void elu (
tensor& dest,
const tensor& src,
const float alpha
)
{
const auto d = dest.host();
const auto s = src.host();
for (size_t i = 0; i < src.size(); ++i)
{
if (s[i] > 0)
d[i] = s[i];
else
d[i] = alpha * (std::exp(s[i]) - 1.0f);
}
}
void elu_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input,
const float alpha
)
{
const auto out = grad.host();
const auto in = dest.host();
const auto gi = gradient_input.host();
if (is_same_object(grad, gradient_input))
{
for (size_t i = 0; i < dest.size(); ++i)
{
if (in[i] > 0)
out[i] = gi[i];
else
out[i] = (alpha + in[i]) * gi[i];
}
}
else
{
for (size_t i = 0; i < dest.size(); ++i)
{
if (in[i] > 0)
out[i] += gi[i];
else
out[i] += (alpha + in[i]) * gi[i];
}
}
}
// ----------------------------------------------------------------------------------------
void gelu (
tensor& dest,
const tensor& src
)
{
const auto d = dest.host();
const auto s = src.host();
for (size_t i = 0; i < src.size(); ++i)
d[i] = 0.5f*s[i]*(1.0f + std::erf(s[i]/sqrt_2));
}
void gelu_gradient (
tensor& grad,
const tensor& src,
const tensor& gradient_input
)
{
const float beta = 1.0f / std::sqrt(2.0f * pi);
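// d/dx [x*Phi(x)] = Phi(x) + x*phi(x), where Phi is the standard normal CDF and
// phi(x) = exp(-x*x/2)/sqrt(2*pi) is its PDF; beta is that 1/sqrt(2*pi) factor.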
const auto compute_gradient = [beta](float x)
{
const float cdf = 0.5f*(1.0f + std::erf(x/sqrt_2));
const float pdf = beta*std::exp(-0.5f*x*x);
return cdf + x * pdf;
};
const auto g = grad.host();
const auto s = src.host();
const auto in = gradient_input.host();
if (is_same_object(grad, gradient_input))
{
for (size_t i = 0; i < src.size(); ++i)
g[i] = in[i]*compute_gradient(s[i]);
}
else
{
for (size_t i = 0; i < src.size(); ++i)
g[i] += in[i]*compute_gradient(s[i]);
}
}
// ----------------------------------------------------------------------------------------
void smelu (
tensor& dest,
const tensor& src,
const float beta
)
{
const float* s = src.host();
float* d = dest.host();
for (size_t i = 0; i < dest.size(); ++i)
{
if (s[i] >= beta)
d[i] = s[i];
else if (s[i] <= -beta)
d[i] = 0;
else
d[i] = (s[i] + beta) * (s[i] + beta) / (4 * beta);
}
}
void smelu_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input,
const float beta
)
{
const float* gi = gradient_input.host();
const float* in = dest.host();
float* out = grad.host();
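// In the quadratic region the forward pass stored y = (x+beta)^2/(4*beta), so the local
// slope can be recovered from the output as dy/dx = (x+beta)/(2*beta) = sqrt(beta*y)/beta.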
if (is_same_object(grad, gradient_input))
{
for (size_t i = 0; i < dest.size(); ++i)
{
if (in[i] >= beta)
out[i] = gi[i];
else if (in[i] == 0)
out[i] = 0;
else
out[i] = std::sqrt(beta * in[i]) / beta * gi[i];
}
}
else
{
for (size_t i = 0; i < dest.size(); ++i)
{
if (in[i] >= beta)
out[i] += gi[i];
else if (in[i] == 0)
continue;
else
out[i] += std::sqrt(beta * in[i]) / beta * gi[i];
}
}
}
// ----------------------------------------------------------------------------------------
void silu (
tensor& dest,
const tensor& src
)
{
const auto d = dest.host();
const auto s = src.host();
for (size_t i = 0; i < src.size(); ++i)
d[i] = s[i] * impl::sigmoid(s[i]);
}
void silu_gradient (
tensor& grad,
const tensor& src,
const tensor& gradient_input
)
{
const auto g = grad.host();
const auto s = src.host();
const auto in = gradient_input.host();
if (is_same_object(grad, gradient_input))
{
for (size_t i = 0; i < src.size(); ++i)
{
const auto sig_s = impl::sigmoid(s[i]);
g[i] = in[i] * (sig_s * (1.0f + s[i] * (1.0f - sig_s)));
}
}
else
{
for (size_t i = 0; i < src.size(); ++i)
{
const auto sig_s = impl::sigmoid(s[i]);
g[i] += in[i] * (sig_s * (1.0f + s[i] * (1.0f - sig_s)));
}
}
}
// ----------------------------------------------------------------------------------------
void resize_bilinear (
tensor& dest,
long long dest_row_stride,
long long dest_channel_stride,
const tensor& src,
long long src_row_stride,
long long src_channel_stride
)
{
DLIB_CASSERT(is_same_object(dest, src)==false);
DLIB_CASSERT(dest.num_samples() == src.num_samples());
DLIB_CASSERT(dest.k() == src.k());
if (dest.size() == 0 || src.size() == 0)
return;
const float* s = src.host();
float* d = dest.host();
parallel_for(0, dest.k()*dest.num_samples(), [&](long i)
{
auto simg = sub_image(s+i*src_channel_stride, src.nr(), src.nc(), src_row_stride);
auto dimg = sub_image(d+i*dest_channel_stride, dest.nr(), dest.nc(), dest_row_stride);
resize_image(simg, dimg);
});
}
void resize_bilinear_gradient (
tensor& grad,
long long grad_row_stride,
long long grad_channel_stride,
const tensor& gradient_input,
long long gradient_input_row_stride,
long long gradient_input_channel_stride
)
{
DLIB_CASSERT(is_same_object(grad, gradient_input)==false);
DLIB_CASSERT(gradient_input.num_samples() == grad.num_samples());
DLIB_CASSERT(gradient_input.k() == grad.k());
if (gradient_input.size() == 0 || grad.size() == 0)
return;
const float* gi = gradient_input.host();
float* g = grad.host();
const float x_scale = (grad.nc()-1)/(float)std::max<long>((gradient_input.nc()-1),1);
const float y_scale = (grad.nr()-1)/(float)std::max<long>((gradient_input.nr()-1),1);
for (long long samp = 0; samp < gradient_input.num_samples(); ++samp)
{
for (long long k = 0; k < gradient_input.k(); ++k)
{
for (long long r = 0; r < gradient_input.nr(); ++r)
{
const float y = r*y_scale;
const long long top = static_cast<long long>(std::floor(y));
const long long bottom = std::min(top+1, grad.nr()-1);
const float tb_frac = y - top;
for (long long c = 0; c < gradient_input.nc(); ++c)
{
const float x = c*x_scale;
const long long left = static_cast<long long>(std::floor(x));
const long long right = std::min(left+1, grad.nc()-1);
const float lr_frac = x - left;
const float tmp = gi[r*gradient_input_row_stride+c];
g[top*grad_row_stride+left] += tmp*(1-tb_frac)*(1-lr_frac);
g[top*grad_row_stride+right] += tmp*(1-tb_frac)*(lr_frac);
g[bottom*grad_row_stride+left] += tmp*(tb_frac)*(1-lr_frac);
g[bottom*grad_row_stride+right] += tmp*(tb_frac)*(lr_frac);
}
}
g += grad_channel_stride;
gi += gradient_input_channel_stride;
}
}
}
// ----------------------------------------------------------------------------------------
void reorg (
tensor& dest,
const int row_stride,
const int col_stride,
const tensor& src
)
{
DLIB_CASSERT(is_same_object(dest, src)==false);
DLIB_CASSERT(src.nr() % row_stride == 0);
DLIB_CASSERT(src.nc() % col_stride == 0);
DLIB_CASSERT(dest.num_samples() == src.num_samples());
DLIB_CASSERT(dest.k() == src.k() * row_stride * col_stride);
DLIB_CASSERT(dest.nr() == src.nr() / row_stride);
DLIB_CASSERT(dest.nc() == src.nc() / col_stride);
const float* s = src.host();
float* d = dest.host();
parallel_for(0, dest.num_samples(), [&](long n)
{
for (long k = 0; k < dest.k(); ++k)
{
for (long r = 0; r < dest.nr(); ++r)
{
for (long c = 0; c < dest.nc(); ++c)
{
const auto out_idx = tensor_index(dest, n, k, r, c);
const auto in_idx = tensor_index(src,
n,
k % src.k(),
r * row_stride + (k / src.k()) / row_stride,
c * col_stride + (k / src.k()) % col_stride);
d[out_idx] = s[in_idx];
}
}
}
});
}
void reorg_gradient (
tensor& grad,
const int row_stride,
const int col_stride,
const tensor& gradient_input
)
{
DLIB_CASSERT(is_same_object(grad, gradient_input)==false);
DLIB_CASSERT(grad.nr() % row_stride == 0);
DLIB_CASSERT(grad.nc() % col_stride == 0);
DLIB_CASSERT(grad.num_samples() == gradient_input.num_samples());
DLIB_CASSERT(grad.k() == gradient_input.k() / row_stride / col_stride);
DLIB_CASSERT(grad.nr() == gradient_input.nr() * row_stride);
            DLIB_CASSERT(grad.nc() == gradient_input.nc() * col_stride);
const float* gi = gradient_input.host();
float* g = grad.host();
parallel_for(0, gradient_input.num_samples(), [&](long n)
{
for (long k = 0; k < gradient_input.k(); ++k)
{
for (long r = 0; r < gradient_input.nr(); ++r)
{
for (long c = 0; c < gradient_input.nc(); ++c)
{
const auto in_idx = tensor_index(gradient_input, n, k, r, c);
const auto out_idx = tensor_index(grad,
n,
k % grad.k(),
r * row_stride + (k / grad.k()) / row_stride,
c * col_stride + (k / grad.k()) % col_stride);
g[out_idx] += gi[in_idx];
}
}
}
});
}
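        // To make the index mapping above concrete: with row_stride == col_stride == 2, reorg()
        // rearranges a src tensor of shape (n, k, 4, 4) into a dest of shape (n, 4*k, 2, 2).
        // Each 2x2 spatial block of a source channel becomes a single output pixel spread across
        // 4 destination channels (a space-to-depth reordering), and reorg_gradient() applies the
        // inverse mapping, accumulating the gradients back onto the larger spatial grid.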
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
pooling::pooling (
) : window_height(0),window_width(0),stride_y(0),stride_x(0),padding_y(0),padding_x(0),do_max_pooling(true)
{
}
void pooling::
clear(
)
{
window_height = 0;
window_width = 0;
stride_y = 0;
stride_x = 0;
padding_y = 0;
padding_x = 0;
}
void pooling::
setup_max_pooling(
int window_height_,
int window_width_,
int stride_y_,
int stride_x_,
int padding_y_,
int padding_x_
)
{
DLIB_CASSERT(window_width_ > 0);
DLIB_CASSERT(window_height_ > 0);
DLIB_CASSERT(stride_y_ > 0);
DLIB_CASSERT(stride_x_ > 0);
DLIB_CASSERT(0 <= padding_y_ && padding_y_ < window_height_);
DLIB_CASSERT(0 <= padding_x_ && padding_x_ < window_width_);
window_height = window_height_;
window_width = window_width_;
stride_y = stride_y_;
stride_x = stride_x_;
padding_y = padding_y_;
padding_x = padding_x_;
do_max_pooling = true;
}
void pooling::
setup_avg_pooling(
int window_height_,
int window_width_,
int stride_y_,
int stride_x_,
int padding_y_,
int padding_x_
)
{
DLIB_CASSERT(window_width_ > 0);
DLIB_CASSERT(window_height_ > 0);
DLIB_CASSERT(stride_y_ > 0);
DLIB_CASSERT(stride_x_ > 0);
DLIB_CASSERT(0 <= padding_y_ && padding_y_ < window_height_);
DLIB_CASSERT(0 <= padding_x_ && padding_x_ < window_width_);
window_height = window_height_;
window_width = window_width_;
stride_y = stride_y_;
stride_x = stride_x_;
padding_y = padding_y_;
padding_x = padding_x_;
do_max_pooling = false;
}
void pooling::
operator() (
resizable_tensor& dest,
const tensor& src
)
{
DLIB_CASSERT(window_width > 0);
DLIB_CASSERT(window_height > 0);
DLIB_CASSERT(stride_y > 0);
DLIB_CASSERT(stride_x > 0);
DLIB_CASSERT(0 <= padding_y && padding_y < window_height);
DLIB_CASSERT(0 <= padding_x && padding_x < window_width);
DLIB_CASSERT(window_width <= src.nc() + 2*padding_x,
"Pooling windows must be small enough to fit into the padded image.");
DLIB_CASSERT(window_height <= src.nr() + 2*padding_y,
"Pooling windows must be small enough to fit into the padded image.");
dest.set_size(
src.num_samples(),
src.k(),
1+(src.nr()+2*padding_y-window_height)/stride_y,
1+(src.nc()+2*padding_x-window_width)/stride_x
);
if (src.size() == 0)
{
dest = 0;
return;
}
auto d = dest.host();
const long x_offset = window_width/2 - padding_x;
const long y_offset = window_height/2 - padding_y;
if (does_max_pooling())
{
for (long n = 0; n < dest.num_samples(); ++n)
{
for (long k = 0; k < dest.k(); ++k)
{
auto simg = image_plane(src,n,k);
auto dimg = d + (n*dest.k() + k)*dest.nr()*dest.nc();
for (long r = 0; r < dest.nr(); ++r)
{
for (long c = 0; c < dest.nc(); ++c)
{
auto win = centered_rect(c*stride_x+x_offset,
r*stride_y+y_offset,
window_width,
window_height);
dimg[r*dest.nc() + c] = max(subm_clipped(simg,win));
}
}
}
}
}
else
{
for (long n = 0; n < dest.num_samples(); ++n)
{
for (long k = 0; k < dest.k(); ++k)
{
auto simg = image_plane(src,n,k);
auto dimg = d + (n*dest.k() + k)*dest.nr()*dest.nc();
for (long r = 0; r < dest.nr(); ++r)
{
for (long c = 0; c < dest.nc(); ++c)
{
auto win = centered_rect(c*stride_x+x_offset,
r*stride_y+y_offset,
window_width,
window_height);
dimg[r*dest.nc() + c] = mean(subm_clipped(simg,win));
}
}
}
}
}
}
void pooling::get_gradient(
const tensor& gradient_input,
const tensor& dest,
const tensor& src,
tensor& grad
)
{
DLIB_CASSERT(have_same_dimensions(gradient_input,dest));
DLIB_CASSERT(have_same_dimensions(src,grad));
if (src.size() == 0)
{
return;
}
auto gi = gradient_input.host();
auto g = grad.host();
const long x_offset = window_width/2 - padding_x;
const long y_offset = window_height/2 - padding_y;
if (does_max_pooling())
{
for (long n = 0; n < dest.num_samples(); ++n)
{
for (long k = 0; k < dest.k(); ++k)
{
auto simg = image_plane(src,n,k);
auto gimg = g + (n*grad.k() + k)*grad.nr()*grad.nc();
auto giimg = gi + (n*dest.k() + k)*dest.nr()*dest.nc();
auto imgbox = get_rect(simg);
for (long r = 0; r < dest.nr(); ++r)
{
for (long c = 0; c < dest.nc(); ++c)
{
auto win = centered_rect(c*stride_x+x_offset,
r*stride_y+y_offset,
window_width,
window_height).intersect(imgbox);
auto p = max_point(subm(simg,win))+win.tl_corner();
gimg[p.y()*grad.nc()+p.x()] += giimg[r*dest.nc()+c];
}
}
}
}
}
else
{
for (long n = 0; n < dest.num_samples(); ++n)
{
for (long k = 0; k < dest.k(); ++k)
{
auto simg = image_plane(src,n,k);
auto gimg = g + (n*grad.k() + k)*grad.nr()*grad.nc();
auto giimg = gi + (n*dest.k() + k)*dest.nr()*dest.nc();
auto imgbox = get_rect(simg);
for (long r = 0; r < dest.nr(); ++r)
{
for (long c = 0; c < dest.nc(); ++c)
{
auto win = centered_rect(c*stride_x+x_offset,
r*stride_y+y_offset,
window_width,
window_height).intersect(imgbox);
const float delta = giimg[r*dest.nc()+c]/win.area();
for (long y = win.top(); y <= win.bottom(); ++y)
{
for (long x = win.left(); x <= win.right(); ++x)
{
gimg[y*grad.nc()+x] += delta;
}
}
}
}
}
}
}
}
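        // Worked example of the geometry used above: a 3x3 window with stride 2 and padding 1
        // applied to a 10x10 input plane produces an output of 1+(10+2*1-3)/2 = 5 rows by 5
        // columns.  In get_gradient(), the max pooling branch routes each output gradient value
        // entirely to the input cell that attained the window maximum, while the average pooling
        // branch spreads it uniformly over the win.area() cells covered by the window.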
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
void img2col(
matrix<float>& output,
const tensor& data,
long n,
long filter_nr,
long filter_nc,
long stride_y,
long stride_x,
long padding_y,
long padding_x
)
{
const auto d = data.host() + data.k()*data.nr()*data.nc()*n;
const rectangle boundary = get_rect(data);
const long out_nr = 1+(data.nr()+2*padding_y-filter_nr)/stride_y;
const long out_nc = 1+(data.nc()+2*padding_x-filter_nc)/stride_x;
output.set_size(out_nr*out_nc,
data.k()*filter_nr*filter_nc);
DLIB_CASSERT(output.size() != 0);
float* t = &output(0,0);
// now fill in the Toeplitz output matrix for the n-th sample in data.
long cnt = 0;
const long max_r = data.nr() + padding_y-(filter_nr-1);
const long max_c = data.nc() + padding_x-(filter_nc-1);
for (long r = -padding_y; r < max_r; r+=stride_y)
{
for (long c = -padding_x; c < max_c; c+=stride_x)
{
for (long k = 0; k < data.k(); ++k)
{
for (long y = 0; y < filter_nr; ++y)
{
for (long x = 0; x < filter_nc; ++x)
{
DLIB_ASSERT(cnt < output.size());
long xx = c+x;
long yy = r+y;
if (boundary.contains(xx,yy))
*t = d[(k*data.nr() + yy)*data.nc() + xx];
else
*t = 0;
++t;
++cnt;
}
}
}
}
}
}
void col2img(
const matrix<float>& output,
tensor& data,
long n,
long filter_nr,
long filter_nc,
long stride_y,
long stride_x,
long padding_y,
long padding_x
)
{
const auto d = data.host() + data.k()*data.nr()*data.nc()*n;
const rectangle boundary = get_rect(data);
DLIB_CASSERT(output.size() != 0);
const float* t = &output(0,0);
            // now scatter the values from the Toeplitz matrix back into the n-th sample in data.
const long max_r = data.nr() + padding_y-(filter_nr-1);
const long max_c = data.nc() + padding_x-(filter_nc-1);
for (long r = -padding_y; r < max_r; r+=stride_y)
{
for (long c = -padding_x; c < max_c; c+=stride_x)
{
for (long k = 0; k < data.k(); ++k)
{
for (long y = 0; y < filter_nr; ++y)
{
for (long x = 0; x < filter_nc; ++x)
{
long xx = c+x;
long yy = r+y;
if (boundary.contains(xx,yy))
d[(k*data.nr() + yy)*data.nc() + xx] += *t;
++t;
}
}
}
}
}
}
void tensor_conv::operator() (
const bool add_to_output,
resizable_tensor& output,
const tensor& data,
const tensor& filters
)
{
DLIB_CASSERT(last_stride_y > 0 && last_stride_x > 0, "You must call setup() before calling this function.");
output.set_size(data.num_samples(),
filters.num_samples(),
1+(data.nr()+2*last_padding_y-filters.nr())/last_stride_y,
1+(data.nc()+2*last_padding_x-filters.nc())/last_stride_x);
(*this)(add_to_output, static_cast<tensor&>(output),data,filters);
}
void tensor_conv::operator() (
const bool add_to_output,
tensor& output,
const tensor& data,
const tensor& filters
)
{
DLIB_CASSERT(is_same_object(output,data) == false);
DLIB_CASSERT(is_same_object(output,filters) == false);
DLIB_CASSERT(filters.k() == data.k());
DLIB_CASSERT(last_stride_y > 0 && last_stride_x > 0, "You must call setup() before calling this function.");
DLIB_CASSERT(filters.nr() <= data.nr() + 2*last_padding_y,
"Filter windows must be small enough to fit into the padded image.");
DLIB_CASSERT(filters.nc() <= data.nc() + 2*last_padding_x,
"Filter windows must be small enough to fit into the padded image.");
DLIB_CASSERT(output.num_samples() == data.num_samples());
DLIB_CASSERT(output.k() == filters.num_samples());
DLIB_CASSERT(output.nr() == 1+(data.nr()+2*last_padding_y-filters.nr())/last_stride_y);
DLIB_CASSERT(output.nc() == 1+(data.nc()+2*last_padding_x-filters.nc())/last_stride_x);
matrix<float> temp;
for (long n = 0; n < data.num_samples(); ++n)
{
img2col(temp, data, n, filters.nr(), filters.nc(), last_stride_y, last_stride_x, last_padding_y, last_padding_x);
if (add_to_output)
output.add_to_sample(n, mat(filters)*trans(temp));
else
output.set_sample(n, mat(filters)*trans(temp));
}
}
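        // The convolution above is evaluated as one matrix product per sample: img2col() builds
        // a Toeplitz matrix temp of size (output.nr()*output.nc()) x (data.k()*filters.nr()*filters.nc()),
        // mat(filters) views the filter bank as filters.num_samples() rows of that same width, so
        // mat(filters)*trans(temp) is a filters.num_samples() x (output.nr()*output.nc()) matrix,
        // i.e. one output channel per filter laid out row-major over the output plane.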
void tensor_conv::operator() (
const bool add_to_output,
resizable_tensor& output,
const tensor& data,
const tensor& filters,
const tensor& biases
)
{
DLIB_CASSERT(filters.num_samples() == biases.k());
(*this)(add_to_output, output,data,filters);
tt::add(1, output, 1, biases);
}
void tensor_conv::operator() (
const bool add_to_output,
tensor& output,
const tensor& data,
const tensor& filters,
const tensor& biases
)
{
DLIB_CASSERT(filters.num_samples() == biases.k());
(*this)(add_to_output, output, data, filters);
tt::add(1, output, 1, biases);
}
// ------------------------------------------------------------------------------------
void tensor_conv::
get_gradient_for_data (
const bool add_to_output,
const tensor& gradient_input,
const tensor& filters,
tensor& data_gradient
)
{
matrix<float> temp;
if (!add_to_output)
data_gradient = 0;
for (long n = 0; n < gradient_input.num_samples(); ++n)
{
auto gi = mat(gradient_input.host()+gradient_input.k()*gradient_input.nr()*gradient_input.nc()*n,
gradient_input.k(),
gradient_input.nr()*gradient_input.nc());
temp = trans(gi)*mat(filters);
col2img(temp, data_gradient, n, filters.nr(), filters.nc(), last_stride_y, last_stride_x, last_padding_y, last_padding_x);
}
}
// ------------------------------------------------------------------------------------
void tensor_conv::
get_gradient_for_filters (
const bool add_to_output,
const tensor& gradient_input,
const tensor& data,
tensor& filters_gradient
)
{
matrix<float> temp;
for (long n = 0; n < gradient_input.num_samples(); ++n)
{
auto gi = mat(gradient_input.host()+gradient_input.k()*gradient_input.nr()*gradient_input.nc()*n,
gradient_input.k(),
gradient_input.nr()*gradient_input.nc());
img2col(temp, data, n, filters_gradient.nr(), filters_gradient.nc(), last_stride_y, last_stride_x, last_padding_y, last_padding_x);
if (n == 0)
{
if (add_to_output)
filters_gradient += gi*temp;
else
filters_gradient = gi*temp;
}
else
{
filters_gradient += gi*temp;
}
}
}
// ------------------------------------------------------------------------------------
void copy_tensor(
bool add_to,
tensor& dest,
size_t dest_k_offset,
const tensor& src,
size_t src_k_offset,
size_t count_k
)
{
const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k());
const size_t src_sample_size = static_cast<size_t>(src.nc() * src.nr() * src.k());
const size_t block_size = count_k * dest.nc() * dest.nr();
DLIB_CASSERT(dest.num_samples() == src.num_samples() &&
dest.nc() == src.nc() && dest.nr() == src.nr(), "All sources should fit into dest tensor size");
DLIB_CASSERT(dest.k() - dest_k_offset >= count_k, "Not enough space in dest tensor");
DLIB_CASSERT(src.k() - src_k_offset >= count_k, "Not enough space in src tensor");
float* dest_p = dest.host() + dest_k_offset * dest.nc() * dest.nr();
const float* src_p = src.host() + src_k_offset * src.nc() * src.nr();
for (long i = 0; i < src.num_samples(); ++i)
{
if (add_to)
{
for (size_t j = 0; j < block_size; ++j)
dest_p[j] += src_p[j];
}
else
{
::memcpy(dest_p, src_p, block_size * sizeof(float));
}
dest_p += dest_sample_size;
src_p += src_sample_size;
}
}
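        // A minimal usage sketch for copy_tensor(): concatenating the channels of two tensors
        // into one destination.  The tensor names and shapes below are illustrative only.
        /*
            resizable_tensor a(1, 3, 8, 8), b(1, 5, 8, 8), dst(1, 8, 8, 8);
            copy_tensor(false, dst, 0,     a, 0, a.k());   // dst channels [0,3) <- a
            copy_tensor(false, dst, a.k(), b, 0, b.k());   // dst channels [3,8) <- b
        */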
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
}
}
#endif // DLIB_DNN_CPU_cPP_
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CPU_H_
#define DLIB_DNN_CPU_H_
// This file contains CPU implementations of the GPU based functions in cuda_dlib.h
// and cudnn_dlibapi.h
#include "tensor.h"
#include "../geometry/rectangle.h"
#include "../dnn/utilities.h"
namespace dlib
{
namespace cpu
{
// -----------------------------------------------------------------------------------
void multiply (
bool add_to,
tensor& dest,
const tensor& src1,
const tensor& src2
);
void multiply_conv (
bool add_to,
tensor& dest,
const tensor& src1,
const tensor& src2
);
void multiply_zero_padded (
bool add_to,
tensor& dest,
const tensor& src1,
const tensor& src2
);
void scale_channels (
bool add_to,
tensor& dest,
const tensor& src,
const tensor& scales
);
void add(
float beta,
tensor& dest,
float alpha,
const tensor& src
);
void assign_bias_gradient (
tensor& grad,
const tensor& gradient_input
);
void add (
tensor& dest,
const tensor& src1,
const tensor& src2
);
void assign_conv_bias_gradient (
tensor& grad,
const tensor& gradient_input
);
// -----------------------------------------------------------------------------------
void affine_transform(
tensor& dest,
const tensor& src,
const float A,
const float B
);
void affine_transform(
tensor& dest,
const tensor& src1,
const tensor& src2,
const float A,
const float B,
const float C
);
void affine_transform(
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
const float A,
const float B,
const float C,
const float D
);
void affine_transform_range(
size_t begin,
size_t end,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
const float A,
const float B,
const float C
);
// -----------------------------------------------------------------------------------
void affine_transform(
tensor& dest,
const tensor& src,
const tensor& A,
const tensor& B
);
// -----------------------------------------------------------------------------------
void affine_transform_conv(
tensor& dest,
const tensor& src,
const tensor& A,
const tensor& B
);
// -----------------------------------------------------------------------------------
void affine_transform(
const rectangle& rect,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
float A,
float B,
float C
);
// -----------------------------------------------------------------------------------
void compute_adam_update (
size_t begin,
size_t end,
tensor& s,
tensor& m,
tensor& v,
const float t,
const float learning_rate,
const float weight_decay,
const float momentum1,
const float momentum2,
const tensor& params,
const tensor& params_grad
);
// -----------------------------------------------------------------------------------
void batch_normalize_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
const tensor& beta,
const tensor& running_means,
const tensor& running_variances
);
void batch_normalize (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
const double averaging_factor,
resizable_tensor& running_means,
resizable_tensor& running_variances,
const tensor& src,
const tensor& gamma,
const tensor& beta
);
void batch_normalize_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
const tensor& src,
const tensor& gamma,
tensor& src_grad,
tensor& gamma_grad,
tensor& beta_grad
);
void batch_normalize_conv_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
const tensor& beta,
const tensor& running_means,
const tensor& running_variances
);
void batch_normalize_conv (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
const double averaging_factor,
resizable_tensor& running_means,
resizable_tensor& running_variances,
const tensor& src,
const tensor& gamma,
const tensor& beta
);
void batch_normalize_conv_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
const tensor& src,
const tensor& gamma,
tensor& src_grad,
tensor& gamma_grad,
tensor& beta_grad
);
// -----------------------------------------------------------------------------------
void layer_normalize (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
const tensor& src,
const tensor& gamma,
const tensor& beta
);
void layer_normalize_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
const tensor& src,
const tensor& gamma,
tensor& src_grad,
tensor& gamma_grad,
tensor& beta_grad
);
// -----------------------------------------------------------------------------------
void threshold (
tensor& data,
float thresh
);
void dot (
const tensor& a,
const tensor& b,
tensor& result,
size_t idx
);
// -----------------------------------------------------------------------------------
void softmax (
tensor& dest,
const tensor& src
);
void softmax_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
);
// ------------------------------------------------------------------------------------
void softmax_all (
tensor& dest,
const tensor& src
);
void softmax_all_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
);
// ------------------------------------------------------------------------------------
void sigmoid (
tensor& dest,
const tensor& src
);
void sigmoid_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
);
// ------------------------------------------------------------------------------------
void mish (
tensor& dest,
const tensor& src
);
void mish_gradient (
tensor& grad,
const tensor& src,
const tensor& gradient_input
);
// ------------------------------------------------------------------------------------
void relu (
tensor& dest,
const tensor& src
);
void relu_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
);
// ----------------------------------------------------------------------------------------
void prelu (
tensor& dest,
const tensor& src,
const tensor& param
);
void prelu_gradient (
tensor& grad,
const tensor& src,
const tensor& gradient_input,
const tensor& param,
tensor& params_grad
);
// ------------------------------------------------------------------------------------
void leaky_relu (
tensor& dest,
const tensor& src,
const float alpha
);
void leaky_relu_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input,
const float alpha
);
// ------------------------------------------------------------------------------------
void tanh (
tensor& dest,
const tensor& src
);
void tanh_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
);
// ------------------------------------------------------------------------------------
void clipped_relu (
tensor& dest,
const tensor& src,
const float ceiling
);
void clipped_relu_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input,
const float ceiling
);
// ------------------------------------------------------------------------------------
void elu (
tensor& dest,
const tensor& src,
const float alpha
);
void elu_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input,
const float alpha
);
// ----------------------------------------------------------------------------------------
void gelu (
tensor& dest,
const tensor& src
);
void gelu_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
);
// ----------------------------------------------------------------------------------------
void smelu (
tensor& dest,
const tensor& src,
const float beta
);
void smelu_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input,
const float beta
);
// ----------------------------------------------------------------------------------------
void silu (
tensor& dest,
const tensor& src
);
void silu_gradient (
tensor& grad,
            const tensor& src,
const tensor& gradient_input
);
// ------------------------------------------------------------------------------------
void resize_bilinear (
tensor& dest,
long long dest_row_stride,
long long dest_channel_stride,
const tensor& src,
long long src_row_stride,
long long src_channel_stride
);
void resize_bilinear_gradient (
tensor& grad,
long long grad_row_stride,
long long grad_channel_stride,
const tensor& gradient_input,
long long gradient_input_row_stride,
long long gradient_input_channel_stride
);
inline void resize_bilinear (
tensor& dest,
const tensor& src
) { resize_bilinear(dest, dest.nc(), dest.nr()*dest.nc(), src, src.nc(), src.nr()*src.nc()); }
inline void resize_bilinear_gradient (
tensor& grad,
const tensor& gradient_input
) { resize_bilinear_gradient(grad, grad.nc(), grad.nr()*grad.nc(), gradient_input, gradient_input.nc(), gradient_input.nr()*gradient_input.nc()); }
// -----------------------------------------------------------------------------------
void reorg (
tensor& dest,
const int row_stride,
const int col_stride,
const tensor& src
);
void reorg_gradient (
tensor& grad,
const int row_stride,
const int col_stride,
const tensor& gradient_input
);
// -----------------------------------------------------------------------------------
class pooling
{
public:
pooling(const pooling&) = delete;
pooling& operator=(const pooling&) = delete;
pooling (
);
void clear(
);
void setup_max_pooling(
int window_height,
int window_width,
int stride_y,
int stride_x,
int padding_y,
int padding_x
);
void setup_avg_pooling(
int window_height,
int window_width,
int stride_y,
int stride_x,
int padding_y,
int padding_x
);
bool does_max_pooling(
) const { return do_max_pooling; }
void operator() (
resizable_tensor& dest,
const tensor& src
);
void get_gradient(
const tensor& gradient_input,
const tensor& dest,
const tensor& src,
tensor& grad
);
private:
int window_height;
int window_width;
int stride_y;
int stride_x;
int padding_y;
int padding_x;
bool do_max_pooling;
};
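        // A minimal sketch of how the pooling object above is typically driven (the tensor
        // names input, gradient_in and input_grad are illustrative): configure it once, then
        // reuse it for forward and backward passes.
        /*
            pooling p;
            p.setup_max_pooling(3, 3, 2, 2, 1, 1);   // 3x3 window, stride 2, padding 1
            resizable_tensor out;
            p(out, input);                           // forward pass: fills out with pooled values
            p.get_gradient(gradient_in, out, input, input_grad);  // backward: accumulates into
                                                                  // input_grad, which must already
                                                                  // have input's dimensions
        */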
// -----------------------------------------------------------------------------------
class tensor_conv
{
public:
tensor_conv(const tensor_conv&) = delete;
tensor_conv& operator=(const tensor_conv&) = delete;
tensor_conv() {}
void clear(
) {}
void setup(
const tensor& data, /* not used but required for interface */
const tensor& filters, /* not used but required for interface */
int stride_y,
int stride_x,
int padding_y,
int padding_x
)
{
(void)data; /* silence compiler */
DLIB_CASSERT(stride_y > 0 && stride_x > 0);
DLIB_CASSERT(0 <= padding_y && padding_y < filters.nr());
DLIB_CASSERT(0 <= padding_x && padding_x < filters.nc());
last_stride_y = stride_y;
last_stride_x = stride_x;
last_padding_y = padding_y;
last_padding_x = padding_x;
}
void operator() (
const bool add_to_output,
resizable_tensor& output,
const tensor& data,
const tensor& filters
);
void operator() (
const bool add_to_output,
tensor& output,
const tensor& data,
const tensor& filters
);
void operator() (
const bool add_to_output,
resizable_tensor& output,
const tensor& data,
const tensor& filters,
const tensor& biases
);
void operator() (
const bool add_to_output,
tensor& output,
const tensor& data,
const tensor& filters,
const tensor& biases
);
void get_gradient_for_data (
const bool add_to_output,
const tensor& gradient_input,
const tensor& filters,
tensor& data_gradient
);
void get_gradient_for_filters (
const bool add_to_output,
const tensor& gradient_input,
const tensor& data,
tensor& filters_gradient
);
private:
long last_stride_y = 0;
long last_stride_x = 0;
long last_padding_y = 0;
long last_padding_x = 0;
};
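        // A minimal sketch of the tensor_conv calling convention (the tensor names data, filters,
        // out_grad, data_grad and filters_grad are illustrative).  Note that setup() only records
        // the stride and padding; the data argument is unused and filters is only consulted to
        // sanity check the padding.
        /*
            tensor_conv conv;
            conv.setup(data, filters, 1, 1, 0, 0);    // stride 1, no padding
            resizable_tensor out;
            conv(false, out, data, filters);          // forward pass, overwriting out
            conv.get_gradient_for_data(false, out_grad, filters, data_grad);
            conv.get_gradient_for_filters(false, out_grad, data, filters_grad);
        */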
// -----------------------------------------------------------------------------------
void copy_tensor(
bool add_to,
tensor& dest,
size_t dest_k_offset,
const tensor& src,
size_t src_k_offset,
size_t count_k
);
// -----------------------------------------------------------------------------------
class compute_loss_binary_log_per_pixel
{
            /*! This class computes the loss for loss_binary_log_per_pixel_ on the CPU,
                providing an implementation analogous to the CUDA version.
            !*/
public:
compute_loss_binary_log_per_pixel(
)
{
}
template <
typename const_label_iterator
>
void operator()(
const_label_iterator truth,
const tensor& output_tensor,
tensor& grad,
double& loss
) const
{
sigmoid(grad, output_tensor);
// The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
const double scale = 1.0/(output_tensor.num_samples()*output_tensor.nr()*output_tensor.nc());
loss = 0;
float* const g = grad.host();
const float* const out_data = output_tensor.host();
for (long i = 0; i < output_tensor.num_samples(); ++i, ++truth)
{
for (long r = 0; r < output_tensor.nr(); ++r)
{
for (long c = 0; c < output_tensor.nc(); ++c)
{
const float y = truth->operator()(r, c);
const size_t idx = tensor_index(output_tensor, i, 0, r, c);
if (y > 0.f)
{
const float temp = log1pexp(-out_data[idx]);
loss += y*scale*temp;
g[idx] = y*scale*(g[idx]-1);
}
else if (y < 0.f)
{
const float temp = -(-out_data[idx]-log1pexp(-out_data[idx]));
loss += -y*scale*temp;
g[idx] = -y*scale*g[idx];
}
else
{
g[idx] = 0.f;
}
}
}
}
}
};
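        // In the loss above the truth values act as signed weights: y > 0 contributes
        // y*log(1+exp(-out)) to the loss, y < 0 contributes -y*log(1+exp(out)) (the temp
        // expression simplifies to log1pexp(out)), and y == 0 marks a pixel that is ignored.
        // This is binary cross-entropy with +/-1 targets where |y| acts as a per-pixel weight.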
// -----------------------------------------------------------------------------------
class compute_loss_multiclass_log_per_pixel
{
            /*! This class computes the loss for loss_multiclass_log_per_pixel_ on the CPU,
                providing an implementation analogous to the CUDA version.
            !*/
public:
compute_loss_multiclass_log_per_pixel(
)
{
}
template <
typename const_label_iterator
>
void operator()(
const_label_iterator truth,
const tensor& output_tensor,
tensor& grad,
double& loss
) const
{
softmax(grad, output_tensor);
// The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
const double scale = 1.0 / (output_tensor.num_samples() * output_tensor.nr() * output_tensor.nc());
loss = 0;
float* const g = grad.host();
for (long i = 0; i < output_tensor.num_samples(); ++i, ++truth)
{
for (long r = 0; r < output_tensor.nr(); ++r)
{
for (long c = 0; c < output_tensor.nc(); ++c)
{
const uint16_t y = truth->operator()(r, c);
// The network must produce a number of outputs that is equal to the number
// of labels when using this type of loss.
DLIB_CASSERT(static_cast<long>(y) < output_tensor.k() || y == label_to_ignore,
"y: " << y << ", output_tensor.k(): " << output_tensor.k());
for (long k = 0; k < output_tensor.k(); ++k)
{
const size_t idx = tensor_index(output_tensor, i, k, r, c);
if (k == y)
{
loss += scale*-safe_log(g[idx]);
g[idx] = scale*(g[idx] - 1);
}
else if (y == label_to_ignore)
{
g[idx] = 0.f;
}
else
{
g[idx] = scale*g[idx];
}
}
}
}
}
}
private:
static const uint16_t label_to_ignore = std::numeric_limits<uint16_t>::max();
};
// -----------------------------------------------------------------------------------
class compute_loss_multiclass_log_per_pixel_weighted
{
            /*! This class computes the loss for loss_multiclass_log_per_pixel_weighted_ on the CPU,
                providing an implementation analogous to the CUDA version.
            !*/
public:
compute_loss_multiclass_log_per_pixel_weighted(
)
{
}
template <
typename const_label_iterator
>
void operator()(
const_label_iterator truth,
const tensor& output_tensor,
tensor& grad,
double& loss
) const
{
softmax(grad, output_tensor);
// The loss we output is the weighted average loss over the mini-batch, and also over each element of the matrix output.
const double scale = 1.0 / (output_tensor.num_samples() * output_tensor.nr() * output_tensor.nc());
loss = 0;
float* const g = grad.host();
for (long i = 0; i < output_tensor.num_samples(); ++i, ++truth)
{
for (long r = 0; r < output_tensor.nr(); ++r)
{
for (long c = 0; c < output_tensor.nc(); ++c)
{
const weighted_label<uint16_t>& weighted_label = truth->operator()(r, c);
const uint16_t y = weighted_label.label;
const float weight = weighted_label.weight;
// The network must produce a number of outputs that is equal to the number
// of labels when using this type of loss.
DLIB_CASSERT(static_cast<long>(y) < output_tensor.k() || weight == 0.f,
"y: " << y << ", output_tensor.k(): " << output_tensor.k());
for (long k = 0; k < output_tensor.k(); ++k)
{
const size_t idx = tensor_index(output_tensor, i, k, r, c);
if (k == y)
{
loss += weight*scale*-safe_log(g[idx]);
g[idx] = weight*scale*(g[idx] - 1);
}
else
{
g[idx] = weight*scale*g[idx];
}
}
}
}
}
}
};
// -----------------------------------------------------------------------------------
class compute_loss_mean_squared_per_channel_and_pixel
{
            /*! This class computes the loss for loss_mean_squared_per_channel_and_pixel_ on the CPU,
                providing an implementation analogous to the CUDA version.
            !*/
public:
compute_loss_mean_squared_per_channel_and_pixel(
)
{
}
template <
typename const_label_iterator
>
void operator()(
const_label_iterator truth,
const tensor& output_tensor,
tensor& grad,
double& loss
) const
{
// The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
const double scale = 1.0 / (output_tensor.num_samples() * output_tensor.k() * output_tensor.nr() * output_tensor.nc());
loss = 0;
float* const g = grad.host();
const float* out_data = output_tensor.host();
for (long i = 0; i < output_tensor.num_samples(); ++i, ++truth)
{
for (long k = 0; k < output_tensor.k(); ++k)
{
for (long r = 0; r < output_tensor.nr(); ++r)
{
for (long c = 0; c < output_tensor.nc(); ++c)
{
const float y = (*truth)[k].operator()(r, c);
const size_t idx = tensor_index(output_tensor, i, k, r, c);
const float temp1 = y - out_data[idx];
const float temp2 = scale*temp1;
loss += temp2*temp1;
g[idx] = -temp2;
}
}
}
}
}
};
// -----------------------------------------------------------------------------------
}
}
#ifdef NO_MAKEFILE
#include "cpu_dlib.cpp"
#endif
#endif // DLIB_DNN_CPU_H_
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuBLAS_CPP_
#define DLIB_DNN_CuBLAS_CPP_
#ifdef DLIB_USE_CUDA
#include "cublas_dlibapi.h"
#include "cuda_utils.h"
#include <cublas_v2.h>
#include <vector>
static const char* cublas_get_error_string(cublasStatus_t s)
{
switch(s)
{
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUDA Runtime API initialization failed.";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUDA Resources could not be allocated.";
default:
return "A call to cuBLAS failed";
}
}
// Check the return value of a call to the cuBLAS runtime for an error condition.
#define CHECK_CUBLAS(call) \
do{ \
const cublasStatus_t error = call; \
if (error != CUBLAS_STATUS_SUCCESS) \
{ \
std::ostringstream sout; \
sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\
sout << "code: " << error << ", reason: " << cublas_get_error_string(error);\
throw dlib::cublas_error(sout.str()); \
} \
}while(false)
namespace dlib
{
namespace cuda
{
// -----------------------------------------------------------------------------------
class cublas_context
{
public:
// not copyable
cublas_context(const cublas_context&) = delete;
cublas_context& operator=(const cublas_context&) = delete;
cublas_context()
{
handles.resize(16);
}
~cublas_context()
{
for (auto h : handles)
{
if (h)
cublasDestroy(h);
}
}
cublasHandle_t get_handle (
)
{
int new_device_id;
CHECK_CUDA(cudaGetDevice(&new_device_id));
// make room for more devices if needed
if (new_device_id >= (long)handles.size())
handles.resize(new_device_id+16);
// If we don't have a handle already for this device then make one
if (!handles[new_device_id])
CHECK_CUBLAS(cublasCreate(&handles[new_device_id]));
// Finally, return the handle for the current device
return handles[new_device_id];
}
private:
std::vector<cublasHandle_t> handles;
};
static cublasHandle_t context()
{
thread_local cublas_context c;
return c.get_handle();
}
// -----------------------------------------------------------------------------------
void gemm (
float beta,
tensor& dest,
float alpha,
const tensor& lhs,
bool trans_lhs,
const tensor& rhs,
bool trans_rhs
)
{
// Recall that BLAS uses column major order so to deal with that we flip the
// order of the lhs and rhs arguments.
const auto transa = trans_lhs ? CUBLAS_OP_T : CUBLAS_OP_N;
const auto transb = trans_rhs ? CUBLAS_OP_T : CUBLAS_OP_N;
const int dest_nr = dest.num_samples();
const int dest_nc = dest.size()/dest_nr;
const int lhs_nr = lhs.num_samples();
const int lhs_nc = lhs.size()/lhs_nr;
const int rhs_nr = rhs.num_samples();
const int rhs_nc = rhs.size()/rhs_nr;
if (trans_lhs && trans_rhs)
{
DLIB_ASSERT( dest_nr == lhs_nc &&
dest_nc == rhs_nr &&
lhs_nr == rhs_nc)
}
else if (!trans_lhs && trans_rhs)
{
DLIB_ASSERT( dest_nr == lhs_nr &&
dest_nc == rhs_nr &&
lhs_nc == rhs_nc)
}
else if (trans_lhs && !trans_rhs)
{
DLIB_ASSERT( dest_nr == lhs_nc &&
dest_nc == rhs_nc &&
lhs_nr == rhs_nr)
}
else
{
DLIB_ASSERT( dest_nr == lhs_nr &&
dest_nc == rhs_nc &&
lhs_nc == rhs_nr)
}
const int k = trans_rhs ? rhs_nc : rhs_nr;
CHECK_CUBLAS(cublasSgemm(context(),
transb,
transa,
dest_nc, dest_nr, k,
&alpha,
rhs.device(), rhs_nc,
lhs.device(), lhs_nc,
&beta,
dest.device(),dest_nc));
}
// ------------------------------------------------------------------------------------
}
}
#endif // DLIB_USE_CUDA
#endif // DLIB_DNN_CuBLAS_CPP_
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuBLAS_H_
#define DLIB_DNN_CuBLAS_H_
#ifdef DLIB_USE_CUDA
#include "tensor.h"
#include "cuda_errors.h"
namespace dlib
{
namespace cuda
{
// -----------------------------------------------------------------------------------
void gemm (
float beta,
tensor& dest,
float alpha,
const tensor& lhs,
bool trans_lhs,
const tensor& rhs,
bool trans_rhs
);
/*!
requires
- The dimensions of lhs and rhs must be compatible for matrix
multiplication. In particular:
- Let L == trans_lhs ? trans(mat(lhs)) : mat(lhs)
- Let R == trans_rhs ? trans(mat(rhs)) : mat(rhs)
- Let D == mat(dest)
- D.nr() == L.nr() && D.nc() == R.nc()
(i.e. dest must be preallocated and have the correct output dimensions)
- L.nc() == R.nr()
ensures
- performs: dest = alpha*L*R + beta*mat(dest)
!*/
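        // For example, if mat(lhs) is 4x3, mat(rhs) is 3x5 and neither operand is transposed,
        // then mat(dest) must be 4x5 and gemm(0, dest, 1, lhs, false, rhs, false) computes the
        // plain product dest = lhs*rhs (beta == 0 discards whatever dest previously held).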
// ------------------------------------------------------------------------------------
}
}
#endif // DLIB_USE_CUDA
#endif // DLIB_DNN_CuBLAS_H_
// Copyright (C) 2017 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuDA_DATA_PTR_CPP_
#define DLIB_DNN_CuDA_DATA_PTR_CPP_
#ifdef DLIB_USE_CUDA
#include "cuda_data_ptr.h"
#include "cuda_utils.h"
namespace dlib
{
namespace cuda
{
// ----------------------------------------------------------------------------------------
weak_cuda_data_void_ptr::
weak_cuda_data_void_ptr(
const cuda_data_void_ptr& ptr
) : num(ptr.num), pdata(ptr.pdata)
{
}
// ----------------------------------------------------------------------------------------
cuda_data_void_ptr weak_cuda_data_void_ptr::
lock() const
{
auto ptr = pdata.lock();
if (ptr)
{
cuda_data_void_ptr temp;
temp.pdata = ptr;
temp.num = num;
return temp;
}
else
{
return cuda_data_void_ptr();
}
}
// -----------------------------------------------------------------------------------
// -----------------------------------------------------------------------------------
cuda_data_void_ptr::
cuda_data_void_ptr(
size_t n
) : num(n)
{
if (n == 0)
return;
void* data = nullptr;
CHECK_CUDA(cudaMalloc(&data, n));
pdata.reset(data, [](void* ptr){
auto err = cudaFree(ptr);
if(err!=cudaSuccess)
std::cerr << "cudaFree() failed. Reason: " << cudaGetErrorString(err) << std::endl;
});
}
// ------------------------------------------------------------------------------------
void memcpy(
void* dest,
const cuda_data_void_ptr& src,
const size_t num
)
{
DLIB_ASSERT(num <= src.size());
if (src.size() != 0)
{
CHECK_CUDA(cudaMemcpy(dest, src.data(), num, cudaMemcpyDefault));
}
}
// ------------------------------------------------------------------------------------
void memcpy(
void* dest,
const cuda_data_void_ptr& src
)
{
memcpy(dest, src, src.size());
}
// ------------------------------------------------------------------------------------
void memcpy(
cuda_data_void_ptr dest,
const void* src,
const size_t num
)
{
DLIB_ASSERT(num <= dest.size());
if (dest.size() != 0)
{
CHECK_CUDA(cudaMemcpy(dest.data(), src, num, cudaMemcpyDefault));
}
}
// ------------------------------------------------------------------------------------
void memcpy(
cuda_data_void_ptr dest,
const void* src
)
{
memcpy(dest,src,dest.size());
}
// ------------------------------------------------------------------------------------
class cudnn_device_buffer
{
public:
// not copyable
cudnn_device_buffer(const cudnn_device_buffer&) = delete;
cudnn_device_buffer& operator=(const cudnn_device_buffer&) = delete;
cudnn_device_buffer()
{
buffers.resize(16);
}
~cudnn_device_buffer()
{
}
cuda_data_void_ptr get (
size_t size
)
{
int new_device_id;
CHECK_CUDA(cudaGetDevice(&new_device_id));
// make room for more devices if needed
if (new_device_id >= (long)buffers.size())
buffers.resize(new_device_id+16);
// If we don't have a buffer already for this device then make one, or if it's too
// small, make a bigger one.
cuda_data_void_ptr buff = buffers[new_device_id].lock();
if (!buff || buff.size() < size)
{
buff = cuda_data_void_ptr(size);
buffers[new_device_id] = buff;
}
// Finally, return the buffer for the current device
return buff;
}
private:
std::vector<weak_cuda_data_void_ptr> buffers;
};
// ----------------------------------------------------------------------------------------
cuda_data_void_ptr device_global_buffer(size_t size)
{
thread_local cudnn_device_buffer buffer;
return buffer.get(size);
}
// ------------------------------------------------------------------------------------
}
}
#endif // DLIB_USE_CUDA
#endif // DLIB_DNN_CuDA_DATA_PTR_CPP_
// Copyright (C) 2017 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuDA_DATA_PTR_H_
#define DLIB_DNN_CuDA_DATA_PTR_H_
#include "../assert.h"
#ifdef DLIB_USE_CUDA
#include <memory>
#include <vector>
#include <type_traits>
namespace dlib
{
namespace cuda
{
// ------------------------------------------------------------------------------------
class cuda_data_void_ptr;
class weak_cuda_data_void_ptr
{
/*!
WHAT THIS OBJECT REPRESENTS
This is just like a std::weak_ptr version of cuda_data_void_ptr. It allows you
to hold a non-owning reference to a cuda_data_void_ptr.
!*/
public:
weak_cuda_data_void_ptr() = default;
weak_cuda_data_void_ptr(const cuda_data_void_ptr& ptr);
void reset() { pdata.reset(); num = 0; }
cuda_data_void_ptr lock() const;
/*!
ensures
- if (the memory block referenced by this object hasn't been deleted) then
- returns a cuda_data_void_ptr referencing that memory block
- else
- returns a default initialized cuda_data_void_ptr (i.e. an empty one).
!*/
private:
size_t num = 0;
std::weak_ptr<void> pdata;
};
// ----------------------------------------------------------------------------------------
class cuda_data_void_ptr
{
/*!
WHAT THIS OBJECT REPRESENTS
This is a block of memory on a CUDA device.
!*/
public:
cuda_data_void_ptr() = default;
cuda_data_void_ptr(size_t n);
/*!
ensures
- This object will allocate a device memory buffer of n bytes.
- #size() == n
!*/
void* data() { return pdata.get(); }
const void* data() const { return pdata.get(); }
operator void*() { return pdata.get(); }
operator const void*() const { return pdata.get(); }
void reset() { pdata.reset(); }
size_t size() const { return num; }
/*!
ensures
- returns the length of this buffer, in bytes.
!*/
cuda_data_void_ptr operator+ (size_t offset) const
/*!
requires
- offset < size()
ensures
- returns a pointer that is offset by the given amount.
!*/
{
DLIB_CASSERT(offset < num);
cuda_data_void_ptr temp;
temp.num = num-offset;
temp.pdata = std::shared_ptr<void>(pdata, ((char*)pdata.get())+offset);
return temp;
}
void shrink(size_t new_size)
/*!
requires
- new_size <= num
ensures
- #size() == new_size
- Doesn't actually deallocate anything, just changes the size() metadata to a
smaller number and only for this instance of the pointer.
!*/
{
DLIB_CASSERT(new_size <= num);
num = new_size;
}
private:
friend class weak_cuda_data_void_ptr;
size_t num = 0;
std::shared_ptr<void> pdata;
};
inline cuda_data_void_ptr operator+(size_t offset, const cuda_data_void_ptr& rhs) { return rhs+offset; }
// ------------------------------------------------------------------------------------
void memcpy(
void* dest,
const cuda_data_void_ptr& src
);
/*!
requires
- dest == a pointer to at least src.size() bytes on the host machine.
ensures
- copies the GPU data from src into dest.
- This routine is equivalent to performing: memcpy(dest,src,src.size())
!*/
void memcpy(
void* dest,
const cuda_data_void_ptr& src,
const size_t num
);
/*!
requires
- dest == a pointer to at least num bytes on the host machine.
- num <= src.size()
ensures
- copies the GPU data from src into dest. Copies only the first num bytes
of src to dest.
!*/
// ------------------------------------------------------------------------------------
void memcpy(
cuda_data_void_ptr dest,
const void* src
);
/*!
requires
                - src == a pointer to at least dest.size() bytes on the host machine.
ensures
- copies the host data from src to the GPU memory buffer dest.
- This routine is equivalent to performing: memcpy(dest,src,dest.size())
!*/
void memcpy(
cuda_data_void_ptr dest,
const void* src,
const size_t num
);
/*!
requires
                - src == a pointer to at least num bytes on the host machine.
- num <= dest.size()
ensures
- copies the host data from src to the GPU memory buffer dest. Copies only
the first num bytes of src to dest.
!*/
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
template <typename T>
class cuda_data_ptr
{
/*!
WHAT THIS OBJECT REPRESENTS
This is a block of memory on a CUDA device. It is just a type safe
version of cuda_data_void_ptr.
!*/
public:
static_assert(std::is_standard_layout<T>::value, "You can only create basic standard layout types on the GPU");
cuda_data_ptr() = default;
cuda_data_ptr(size_t n) : num(n)
/*!
ensures
- This object will allocate a device memory buffer of n T objects.
- #size() == n
!*/
{
if (n == 0)
return;
pdata = cuda_data_void_ptr(n*sizeof(T));
}
cuda_data_ptr(
const cuda_data_ptr<typename std::remove_const<T>::type> &other
) : num(other.num), pdata(other.pdata) {}
/*!
ensures
- *this is a copy of other. This version of the copy constructor allows
assigning non-const pointers to const ones. For instance, converting from
cuda_data_ptr<float> to cuda_data_ptr<const float>.
!*/
T* data() { return (T*)pdata.data(); }
const T* data() const { return (T*)pdata.data(); }
operator T*() { return (T*)pdata.data(); }
operator const T*() const { return (T*)pdata.data(); }
void reset() { pdata.reset(); }
size_t size() const { return num; }
/*!
ensures
- returns the number of T instances pointed to by *this.
!*/
operator cuda_data_void_ptr() const
/*!
ensures
- returns *this as a cuda_data_void_ptr. Importantly, the returned size() will
reflect the number of bytes referenced by *this. To be clear, let P be the
returned pointer. Then:
                    - P.data() == data()
- P.size() == size() * sizeof(T)
!*/
{
cuda_data_void_ptr temp = pdata;
temp.shrink(size() * sizeof(T));
return temp;
}
private:
template <typename U>
friend cuda_data_ptr<U> static_pointer_cast(const cuda_data_void_ptr &ptr);
template <typename U>
friend cuda_data_ptr<U> static_pointer_cast(const cuda_data_void_ptr &ptr, size_t num);
template <typename U>
friend class cuda_data_ptr;
size_t num = 0;
cuda_data_void_ptr pdata;
};
template <typename T>
cuda_data_ptr<T> static_pointer_cast(const cuda_data_void_ptr &ptr)
{
DLIB_CASSERT(ptr.size() % sizeof(T) == 0,
"Size of memory buffer in ptr doesn't match sizeof(T). "
<< "\nptr.size(): "<< ptr.size()
<< "\nsizeof(T): "<< sizeof(T));
cuda_data_ptr<T> result;
result.pdata = ptr;
result.num = ptr.size() / sizeof(T);
return result;
}
template <typename T>
cuda_data_ptr<T> static_pointer_cast(const cuda_data_void_ptr &ptr, size_t num)
{
DLIB_CASSERT(num*sizeof(T) <= ptr.size(),
"Size of memory buffer in ptr isn't big enough to represent this many T objects. "
<< "\nnum: "<< num
<< "\nnum*sizeof(T): "<< num*sizeof(T)
<< "\nsizeof(T): "<< sizeof(T)
<< "\nptr.size(): "<< ptr.size());
cuda_data_ptr<T> result;
result.pdata = ptr;
result.num = num;
return result;
}
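        // A minimal sketch of the two casts above (the buffer size is illustrative): a raw device
        // allocation is reinterpreted as a typed pointer, over either the whole buffer or a prefix.
        /*
            cuda_data_void_ptr raw(64*sizeof(float));                          // 64 floats of storage
            cuda_data_ptr<float> all  = static_pointer_cast<float>(raw);       // all.size()  == 64
            cuda_data_ptr<float> head = static_pointer_cast<float>(raw, 16);   // head.size() == 16
        */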
template <typename T>
void memcpy(std::vector<T>& dest, const cuda_data_ptr<T>& src)
{
dest.resize(src.size());
if (src.size() != 0)
memcpy(dest.data(), static_cast<cuda_data_void_ptr>(src));
}
template <typename T>
void memcpy(cuda_data_ptr<T>& dest, const std::vector<T>& src)
{
if (src.size() != dest.size())
dest = cuda_data_ptr<T>(src.size());
if (dest.size() != 0)
memcpy(static_cast<cuda_data_void_ptr>(dest), src.data());
}
template <typename T>
void memcpy(cuda_data_ptr<T>& dest, const T* src)
{
memcpy(static_cast<cuda_data_void_ptr>(dest), src);
}
template <typename T>
void memcpy(cuda_data_ptr<T>& dest, const T* src, size_t num)
{
DLIB_CASSERT(num <= dest.size());
memcpy(static_cast<cuda_data_void_ptr>(dest), src, num*sizeof(T));
}
template <typename T>
void memcpy(T* dest, const cuda_data_ptr<T>& src)
{
memcpy(dest, static_cast<cuda_data_void_ptr>(src));
}
template <typename T>
void memcpy(T* dest, const cuda_data_ptr<T>& src, size_t num)
{
DLIB_CASSERT(num <= src.size());
memcpy(dest, static_cast<cuda_data_void_ptr>(src), num*sizeof(T));
}
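        // A minimal sketch of a host/device round trip using the typed overloads above (the
        // vector contents are illustrative):
        /*
            std::vector<float> host = {1.f, 2.f, 3.f};
            cuda_data_ptr<float> dev;
            memcpy(dev, host);      // allocates dev to host.size() elements and uploads them
            std::vector<float> back;
            memcpy(back, dev);      // resizes back and downloads the same 3 floats
        */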
// ------------------------------------------------------------------------------------
cuda_data_void_ptr device_global_buffer(size_t size);
/*!
ensures
- Returns a pointer to a globally shared CUDA memory buffer on the
currently selected CUDA device. The buffer is also thread local. So
each host thread will get its own buffer. You can use this global buffer
as scratch space for CUDA computations that all take place on the default
stream. Using it in this way ensures that there aren't any race conditions
involving the use of the buffer.
- The returned pointer will point to at least size bytes. It may point to more.
- The global buffer is deallocated once all references to it are destructed.
However, if device_global_buffer() is called before then with a size <= the last
size requested, then the previously returned global buffer pointer is returned.
This avoids triggering expensive CUDA reallocations. So if you want to avoid
these reallocations then hold a copy of the pointer returned by this function.
However, as a general rule, client code should not hold the returned
cuda_data_void_ptr for long durations, but instead should call
device_global_buffer() whenever the buffer is needed, and overwrite the previously
returned pointer with the new pointer. Doing so ensures multiple buffers are not
kept around in the event that multiple sized buffers are requested. To explain
this, consider this code, assumed to execute at program startup:
auto ptr1 = device_global_buffer(1);
auto ptr2 = device_global_buffer(2);
auto ptr3 = device_global_buffer(3);
since the sizes increased at each call 3 separate buffers were allocated. First
one of size 1, then of size 2, then of size 3. If we then executed:
ptr1 = device_global_buffer(1);
ptr2 = device_global_buffer(2);
ptr3 = device_global_buffer(3);
all three of these pointers would now point to the same buffer, since the smaller
requests can be satisfied by returning the size 3 buffer in each case.
!*/
// ----------------------------------------------------------------------------------------
}
}
#endif // DLIB_USE_CUDA
#endif // DLIB_DNN_CuDA_DATA_PTR_H_