Unverified Commit c45d166a authored by Adrià Arrufat's avatar Adrià Arrufat Committed by GitHub
Browse files

Test cuda losses (#2199)

* add cuda test for loss_binary_log_per_pixel and some needed refactoring

* add cuda test for loss_multiclass_log_per_pixel

* forgot to add cpu version in loss

* remove a line I added by mistake

* fix typos

* declare label_to_ignore as static

* use tensor_index function instead of index method

* test cuda and cpu gradients values

* use DLIB_TEST instead of DLIB_CASSERT
parent d78d273a
...@@ -522,6 +522,132 @@ namespace dlib ...@@ -522,6 +522,132 @@ namespace dlib
// ----------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------
class compute_loss_binary_log_per_pixel
{
    /*! The point of this class is to compute the loss for loss_binary_log_per_pixel_
        on the cpu to provide an analogous implementation of the cuda version
    !*/
public:
    compute_loss_binary_log_per_pixel(
    )
    {
    }

    template <
        typename const_label_iterator
    >
    void operator()(
        const_label_iterator truth,
        const tensor& output_tensor,
        tensor& grad,
        double& loss
    ) const
    {
        sigmoid(grad, output_tensor);
        // The reported loss is averaged over the mini-batch and over every element
        // of the output matrix.
        const double scale = 1.0/(output_tensor.num_samples()*output_tensor.nr()*output_tensor.nc());
        const long rows = output_tensor.nr();
        const long cols = output_tensor.nc();
        loss = 0;
        float* const g = grad.host();
        const float* const out_data = output_tensor.host();
        for (long s = 0; s < output_tensor.num_samples(); ++s, ++truth)
        {
            for (long row = 0; row < rows; ++row)
            {
                for (long col = 0; col < cols; ++col)
                {
                    const float y = (*truth)(row, col);
                    const size_t idx = tensor_index(output_tensor, s, 0, row, col);
                    if (y > 0.f)
                    {
                        // Positive label: cross-entropy term is log(1+exp(-out)),
                        // weighted by y and the averaging scale.
                        const float cross_entropy = log1pexp(-out_data[idx]);
                        loss += y*scale*cross_entropy;
                        g[idx] = y*scale*(g[idx]-1);
                    }
                    else if (y < 0.f)
                    {
                        // Negative label: cross-entropy term is out + log(1+exp(-out)),
                        // weighted by |y| and the averaging scale.
                        const float cross_entropy = out_data[idx]+log1pexp(-out_data[idx]);
                        loss += -y*scale*cross_entropy;
                        g[idx] = -y*scale*g[idx];
                    }
                    else
                    {
                        // A label of exactly zero contributes nothing to the loss
                        // or the gradient.
                        g[idx] = 0.f;
                    }
                }
            }
        }
    }
};
// -----------------------------------------------------------------------------------
class compute_loss_multiclass_log_per_pixel
{
    /*! The point of this class is to compute the loss for loss_multiclass_log_per_pixel_
        on the cpu to provide an analogous implementation of the cuda version
    !*/
public:
    compute_loss_multiclass_log_per_pixel(
    )
    {
    }

    template <
        typename const_label_iterator
    >
    void operator()(
        const_label_iterator truth,
        const tensor& output_tensor,
        tensor& grad,
        double& loss
    ) const
    {
        softmax(grad, output_tensor);
        // The reported loss is averaged over the mini-batch and over every element
        // of the output matrix.
        const double scale = 1.0 / (output_tensor.num_samples() * output_tensor.nr() * output_tensor.nc());
        loss = 0;
        float* const g = grad.host();
        for (long s = 0; s < output_tensor.num_samples(); ++s, ++truth)
        {
            for (long row = 0; row < output_tensor.nr(); ++row)
            {
                for (long col = 0; col < output_tensor.nc(); ++col)
                {
                    const uint16_t y = (*truth)(row, col);
                    // The network must produce a number of outputs that is equal to the number
                    // of labels when using this type of loss.
                    DLIB_CASSERT(static_cast<long>(y) < output_tensor.k() || y == label_to_ignore,
                        "y: " << y << ", output_tensor.k(): " << output_tensor.k());
                    for (long k = 0; k < output_tensor.k(); ++k)
                    {
                        const size_t idx = tensor_index(output_tensor, s, k, row, col);
                        if (k == y)
                        {
                            // Cross-entropy contribution of the true label's channel.
                            loss += scale*-safe_log(g[idx]);
                            g[idx] = scale*(g[idx] - 1);
                        }
                        else if (y == label_to_ignore)
                        {
                            // Pixels carrying the ignore label get a zero gradient
                            // and add nothing to the loss.
                            g[idx] = 0.f;
                        }
                        else
                        {
                            g[idx] = scale*g[idx];
                        }
                    }
                }
            }
        }
    }

private:
    // Sentinel label value meaning "do not train on this pixel".
    static const uint16_t label_to_ignore = std::numeric_limits<uint16_t>::max();
};
// -----------------------------------------------------------------------------------
class compute_loss_multiclass_log_per_pixel_weighted class compute_loss_multiclass_log_per_pixel_weighted
{ {
...@@ -580,20 +706,6 @@ namespace dlib ...@@ -580,20 +706,6 @@ namespace dlib
} }
} }
private:
template <typename T>
T safe_log(T input, T epsilon = 1e-10) const
{
// Prevent trying to calculate the logarithm of a very small number (let alone zero)
return std::log(std::max(input, epsilon));
}
static size_t tensor_index(const tensor& t, long sample, long k, long row, long column)
{
// See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column;
}
}; };
// ----------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------
...@@ -634,7 +746,7 @@ namespace dlib ...@@ -634,7 +746,7 @@ namespace dlib
for (long c = 0; c < output_tensor.nc(); ++c) for (long c = 0; c < output_tensor.nc(); ++c)
{ {
const float y = (*truth)[k].operator()(r, c); const float y = (*truth)[k].operator()(r, c);
const size_t idx = ((i * output_tensor.k() + k) * output_tensor.nr() + r) * output_tensor.nc() + c; const size_t idx = tensor_index(output_tensor, i, k, r, c);
const float temp1 = y - out_data[idx]; const float temp1 = y - out_data[idx];
const float temp2 = scale*temp1; const float temp2 = scale*temp1;
loss += temp2*temp1; loss += temp2*temp1;
......
...@@ -242,13 +242,6 @@ namespace dlib ...@@ -242,13 +242,6 @@ namespace dlib
}; };
template <typename T>
T safe_log(T input, T epsilon = 1e-10)
{
// Prevent trying to calculate the logarithm of a very small number (let alone zero)
return std::log(std::max(input, epsilon));
}
template <typename SUBNET> template <typename SUBNET>
using loss_binary_log = add_loss_layer<loss_binary_log_, SUBNET>; using loss_binary_log = add_loss_layer<loss_binary_log_, SUBNET>;
...@@ -2759,7 +2752,7 @@ namespace dlib ...@@ -2759,7 +2752,7 @@ namespace dlib
{ {
for (long c = 0; c < output_tensor.nc(); ++c) for (long c = 0; c < output_tensor.nc(); ++c)
{ {
iter->operator()(r, c) = out_data[tensor_index(output_tensor, i, r, c)]; iter->operator()(r, c) = out_data[tensor_index(output_tensor, i, 0, r, c)];
} }
} }
} }
...@@ -2796,49 +2789,13 @@ namespace dlib ...@@ -2796,49 +2789,13 @@ namespace dlib
"output size = " << output_tensor.nr() << " x " << output_tensor.nc()); "output size = " << output_tensor.nr() << " x " << output_tensor.nc());
} }
#ifdef DLIB_USE_CUDA
double loss; double loss;
#ifdef DLIB_USE_CUDA
cuda_compute(truth, output_tensor, grad, loss); cuda_compute(truth, output_tensor, grad, loss);
return loss;
#else #else
cpu_compute(truth, output_tensor, grad, loss);
tt::sigmoid(grad, output_tensor);
// The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
const double scale = 1.0/(output_tensor.num_samples()*output_tensor.nr()*output_tensor.nc());
double loss = 0;
float* const g = grad.host();
const float* const out_data = output_tensor.host();
for (long i = 0; i < output_tensor.num_samples(); ++i, ++truth)
{
for (long r = 0; r < output_tensor.nr(); ++r)
{
for (long c = 0; c < output_tensor.nc(); ++c)
{
const float y = truth->operator()(r, c);
const size_t idx = tensor_index(output_tensor, i, r, c);
if (y > 0.f)
{
const float temp = log1pexp(-out_data[idx]);
loss += y*scale*temp;
g[idx] = y*scale*(g[idx]-1);
}
else if (y < 0.f)
{
const float temp = -(-out_data[idx]-log1pexp(-out_data[idx]));
loss += -y*scale*temp;
g[idx] = -y*scale*g[idx];
}
else
{
g[idx] = 0.f;
}
}
}
}
return loss;
#endif #endif
return loss;
} }
friend void serialize(const loss_binary_log_per_pixel_& , std::ostream& out) friend void serialize(const loss_binary_log_per_pixel_& , std::ostream& out)
...@@ -2866,16 +2823,11 @@ namespace dlib ...@@ -2866,16 +2823,11 @@ namespace dlib
} }
private: private:
static size_t tensor_index(const tensor& t, long sample, long row, long column)
{
DLIB_ASSERT(t.k() == 1);
// See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
return (sample * t.nr() + row) * t.nc() + column;
}
#ifdef DLIB_USE_CUDA #ifdef DLIB_USE_CUDA
cuda::compute_loss_binary_log_per_pixel cuda_compute; cuda::compute_loss_binary_log_per_pixel cuda_compute;
#else
cpu::compute_loss_binary_log_per_pixel cpu_compute;
#endif #endif
}; };
...@@ -2982,51 +2934,13 @@ namespace dlib ...@@ -2982,51 +2934,13 @@ namespace dlib
} }
#ifdef DLIB_USE_CUDA
double loss; double loss;
#ifdef DLIB_USE_CUDA
cuda_compute(truth, output_tensor, grad, loss); cuda_compute(truth, output_tensor, grad, loss);
return loss;
#else #else
cpu_compute(truth, output_tensor, grad, loss);
tt::softmax(grad, output_tensor);
// The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
const double scale = 1.0 / (output_tensor.num_samples() * output_tensor.nr() * output_tensor.nc());
double loss = 0;
float* const g = grad.host();
for (long i = 0; i < output_tensor.num_samples(); ++i, ++truth)
{
for (long r = 0; r < output_tensor.nr(); ++r)
{
for (long c = 0; c < output_tensor.nc(); ++c)
{
const uint16_t y = truth->operator()(r, c);
// The network must produce a number of outputs that is equal to the number
// of labels when using this type of loss.
DLIB_CASSERT(static_cast<long>(y) < output_tensor.k() || y == label_to_ignore,
"y: " << y << ", output_tensor.k(): " << output_tensor.k());
for (long k = 0; k < output_tensor.k(); ++k)
{
const size_t idx = tensor_index(output_tensor, i, k, r, c);
if (k == y)
{
loss += scale*-safe_log(g[idx]);
g[idx] = scale*(g[idx] - 1);
}
else if (y == label_to_ignore)
{
g[idx] = 0.f;
}
else
{
g[idx] = scale*g[idx];
}
}
}
}
}
return loss;
#endif #endif
return loss;
} }
friend void serialize(const loss_multiclass_log_per_pixel_& , std::ostream& out) friend void serialize(const loss_multiclass_log_per_pixel_& , std::ostream& out)
...@@ -3054,15 +2968,11 @@ namespace dlib ...@@ -3054,15 +2968,11 @@ namespace dlib
} }
private: private:
static size_t tensor_index(const tensor& t, long sample, long k, long row, long column)
{
// See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column;
}
#ifdef DLIB_USE_CUDA #ifdef DLIB_USE_CUDA
cuda::compute_loss_multiclass_log_per_pixel cuda_compute; cuda::compute_loss_multiclass_log_per_pixel cuda_compute;
#else
cpu::compute_loss_multiclass_log_per_pixel cpu_compute;
#endif #endif
}; };
...@@ -3158,11 +3068,7 @@ namespace dlib ...@@ -3158,11 +3068,7 @@ namespace dlib
} }
private: private:
static size_t tensor_index(const tensor& t, long sample, long k, long row, long column)
{
// See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column;
}
#ifdef DLIB_USE_CUDA #ifdef DLIB_USE_CUDA
cuda::compute_loss_multiclass_log_per_pixel_weighted cuda_compute; cuda::compute_loss_multiclass_log_per_pixel_weighted cuda_compute;
#else #else
...@@ -3294,12 +3200,6 @@ namespace dlib ...@@ -3294,12 +3200,6 @@ namespace dlib
out << "<loss_mean_squared_per_pixel/>"; out << "<loss_mean_squared_per_pixel/>";
} }
private:
static size_t tensor_index(const tensor& t, long sample, long k, long row, long column)
{
// See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column;
}
}; };
template <typename SUBNET> template <typename SUBNET>
...@@ -3419,11 +3319,7 @@ namespace dlib ...@@ -3419,11 +3319,7 @@ namespace dlib
} }
private: private:
static size_t tensor_index(const tensor& t, long sample, long k, long row, long column)
{
// See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column;
}
#ifdef DLIB_USE_CUDA #ifdef DLIB_USE_CUDA
cuda::compute_loss_mean_squared_per_channel_and_pixel cuda_compute; cuda::compute_loss_mean_squared_per_channel_and_pixel cuda_compute;
#else #else
......
...@@ -3,6 +3,8 @@ ...@@ -3,6 +3,8 @@
#ifndef DLIB_DNn_MISC_h #ifndef DLIB_DNn_MISC_h
#define DLIB_DNn_MISC_h #define DLIB_DNn_MISC_h
#include "../cuda/tensor.h"
namespace dlib namespace dlib
{ {
...@@ -22,6 +24,47 @@ namespace dlib ...@@ -22,6 +24,47 @@ namespace dlib
float weight = 1.f; float weight = 1.f;
}; };
// ----------------------------------------------------------------------------------------
inline double log1pexp(double x)
{
    // Numerically stable computation of log(1+exp(x)). The thresholds pick, for
    // each range of x, the formula that avoids overflow/underflow and loses the
    // least precision.
    using std::exp;
    using namespace std; // Do this instead of using std::log1p because some compilers
                         // error out otherwise (E.g. gcc 4.9 in cygwin)
    if (x > 33.3)
        return x;              // exp(-x) underflows relative to x; log(1+exp(x)) == x
    if (x > 18)
        return x + exp(-x);    // first-order correction is still representable
    if (x > -37)
        return log1p(exp(x));  // general case, exp(x) neither overflows nor underflows
    return exp(x);             // log1p(t) ~= t for tiny t
}
// ----------------------------------------------------------------------------------------
template <typename T>
T safe_log(T input, T epsilon = 1e-10)
{
    // Prevent trying to calculate the logarithm of a very small number (let alone zero)
    // by clamping the argument from below at epsilon before taking the log.
    if (input < epsilon)
        input = epsilon;
    return std::log(input);
}
// ----------------------------------------------------------------------------------------
inline size_t tensor_index(
    const tensor& t,
    const long sample,
    const long k,
    const long r,
    const long c
)
{
    // Flattens a (sample, k, row, column) coordinate into an offset into the
    // tensor's row-major host buffer.
    // See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
    //
    // NOTE: declared inline rather than static. This function lives in a header,
    // so a static (internal-linkage) definition would be duplicated in every
    // translation unit, and referencing it from other header-defined inline
    // functions would make those functions' definitions differ across TUs (an
    // ODR hazard). inline gives one shared definition instead.
    return ((sample * t.k() + k) * t.nr() + r) * t.nc() + c;
}
// ----------------------------------------------------------------------------------------
} }
#endif // DLIB_DNn_MISC_h #endif // DLIB_DNn_MISC_h
......
...@@ -11,23 +11,6 @@ ...@@ -11,23 +11,6 @@
namespace dlib namespace dlib
{ {
// ----------------------------------------------------------------------------------------
inline double log1pexp(double x)
{
using std::exp;
using namespace std; // Do this instead of using std::log1p because some compilers
// error out otherwise (E.g. gcc 4.9 in cygwin)
if (x <= -37)
return exp(x);
else if (-37 < x && x <= 18)
return log1p(exp(x));
else if (18 < x && x <= 33.3)
return x + exp(-x);
else
return x;
}
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
inline void randomize_parameters ( inline void randomize_parameters (
......
...@@ -18,6 +18,8 @@ namespace dlib ...@@ -18,6 +18,8 @@ namespace dlib
ensures ensures
- returns log(1+exp(x)) - returns log(1+exp(x))
(except computes it using a numerically accurate method) (except computes it using a numerically accurate method)
NOTE: For technical reasons, it is defined in misc.h.
!*/ !*/
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
......
...@@ -2683,9 +2683,14 @@ namespace ...@@ -2683,9 +2683,14 @@ namespace
cpu::compute_loss_mean_squared_per_channel_and_pixel cpu_compute; cpu::compute_loss_mean_squared_per_channel_and_pixel cpu_compute;
double cuda_loss, cpu_loss; double cuda_loss, cpu_loss;
const tensor& output_tensor = net.subnet().get_output(); const tensor& output_tensor = net.subnet().get_output();
tensor& grad = net.subnet().get_gradient_input(); resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor);
cuda_compute(labels.begin(), output_tensor, grad, cuda_loss); cuda_compute(labels.begin(), output_tensor, cuda_grad, cuda_loss);
cpu_compute(labels.begin(), output_tensor, grad, cpu_loss); cpu_compute(labels.begin(), output_tensor, cpu_grad, cpu_loss);
DLIB_TEST(cuda_grad.size() == cpu_grad.size());
for (size_t i = 0; i < cuda_grad.size(); ++i)
{
DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8);
}
const auto err = abs(cuda_loss - cpu_loss) / cpu_loss; const auto err = abs(cuda_loss - cpu_loss) / cpu_loss;
DLIB_TEST_MSG(err < 1e-6, "multi channel cuda and cpu losses differ"); DLIB_TEST_MSG(err < 1e-6, "multi channel cuda and cpu losses differ");
#endif #endif
...@@ -2883,6 +2888,23 @@ namespace ...@@ -2883,6 +2888,23 @@ namespace
const int num_correct_required = static_cast<int>(::std::ceil(0.9 * num_correct_max)); const int num_correct_required = static_cast<int>(::std::ceil(0.9 * num_correct_max));
DLIB_TEST_MSG(num_correct >= num_correct_required, DLIB_TEST_MSG(num_correct >= num_correct_required,
"Number of correctly classified elements = " << num_correct << ", required = " << num_correct_required); "Number of correctly classified elements = " << num_correct << ", required = " << num_correct_required);
#if DLIB_USE_CUDA
cuda::compute_loss_binary_log_per_pixel cuda_compute;
cpu::compute_loss_binary_log_per_pixel cpu_compute;
double cuda_loss, cpu_loss;
const tensor& output_tensor = net.subnet().get_output();
resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor);
cuda_compute(y.begin(), output_tensor, cuda_grad, cuda_loss);
cpu_compute(y.begin(), output_tensor, cpu_grad, cpu_loss);
DLIB_TEST(cuda_grad.size() == cpu_grad.size());
for (size_t i = 0; i < cuda_grad.size(); ++i)
{
DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8);
}
const auto err = abs(cuda_loss - cpu_loss) / cpu_loss;
DLIB_TEST_MSG(err < 1e-6, "binary log per pixel cuda and cpu losses differ");
#endif
} }
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
...@@ -3217,6 +3239,23 @@ namespace ...@@ -3217,6 +3239,23 @@ namespace
const int num_correct_required = static_cast<int>(::std::ceil(0.9 * num_correct_max)); const int num_correct_required = static_cast<int>(::std::ceil(0.9 * num_correct_max));
DLIB_TEST_MSG(num_correct >= num_correct_required, DLIB_TEST_MSG(num_correct >= num_correct_required,
"Number of correctly classified elements = " << num_correct << ", required = " << num_correct_required); "Number of correctly classified elements = " << num_correct << ", required = " << num_correct_required);
#if DLIB_USE_CUDA
cuda::compute_loss_multiclass_log_per_pixel cuda_compute;
cpu::compute_loss_multiclass_log_per_pixel cpu_compute;
double cuda_loss, cpu_loss;
const tensor& output_tensor = net.subnet().get_output();
resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor);
cuda_compute(y.begin(), output_tensor, cuda_grad, cuda_loss);
cpu_compute(y.begin(), output_tensor, cpu_grad, cpu_loss);
DLIB_TEST(cuda_grad.size() == cpu_grad.size());
for (size_t i = 0; i < cuda_grad.size(); ++i)
{
DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8);
}
const auto err = abs(cuda_loss - cpu_loss) / cpu_loss;
DLIB_TEST_MSG(err < 1e-6, "multiclass log per pixel cuda and cpu losses differ");
#endif
} }
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
...@@ -3317,9 +3356,14 @@ namespace ...@@ -3317,9 +3356,14 @@ namespace
cpu::compute_loss_multiclass_log_per_pixel_weighted cpu_compute; cpu::compute_loss_multiclass_log_per_pixel_weighted cpu_compute;
double cuda_loss, cpu_loss; double cuda_loss, cpu_loss;
const tensor& output_tensor = net.subnet().get_output(); const tensor& output_tensor = net.subnet().get_output();
tensor& grad = net.subnet().get_gradient_input(); resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor);
cuda_compute(y_weighted.begin(), output_tensor, grad, cuda_loss); cuda_compute(y_weighted.begin(), output_tensor, cuda_grad, cuda_loss);
cpu_compute(y_weighted.begin(), output_tensor, grad, cpu_loss); cpu_compute(y_weighted.begin(), output_tensor, cpu_grad, cpu_loss);
DLIB_TEST(cuda_grad.size() == cpu_grad.size());
for (size_t i = 0; i < cuda_grad.size(); ++i)
{
DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8);
}
const auto err = abs(cuda_loss - cpu_loss) / cpu_loss; const auto err = abs(cuda_loss - cpu_loss) / cpu_loss;
DLIB_TEST_MSG(err < 1e-6, "multi class log per pixel weighted cuda and cpu losses differ"); DLIB_TEST_MSG(err < 1e-6, "multi class log per pixel weighted cuda and cpu losses differ");
#endif #endif
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment