"git@developer.sourcefind.cn:OpenDAS/torch-sparce.git" did not exist on "b8fb93bd1af845459c0587ac91dff02bc1589c9b"
Unverified Commit c45d166a authored by Adrià Arrufat's avatar Adrià Arrufat Committed by GitHub
Browse files

Test cuda losses (#2199)

* add cuda test for loss_binary_log_per_pixel and some needed refactoring

* add cuda test for loss_multiclass_log_per_pixel

* forgot to add cpu version in loss

* remove a line I added by mistake

* fix typos

* declare label_to_ignore as static

* use tensor_index function instead of index method

* test cuda and cpu gradients values

* use DLIB_TEST instead of DLIB_CASSERT
parent d78d273a
......@@ -522,6 +522,132 @@ namespace dlib
// -----------------------------------------------------------------------------------
class compute_loss_binary_log_per_pixel
{
    /*! The point of this class is to compute the loss for loss_binary_log_per_pixel_
        on the cpu to provide an analogous implementation of the cuda version
    !*/
public:
    compute_loss_binary_log_per_pixel()
    {
    }

    template <typename const_label_iterator>
    void operator()(
        const_label_iterator truth,
        const tensor& output_tensor,
        tensor& grad,
        double& loss
    ) const
    {
        // Seed the gradient with sigmoid(output); it is then adjusted in place below.
        sigmoid(grad, output_tensor);
        // The reported loss is averaged over the mini-batch and over every pixel of the output.
        const double scale = 1.0/(output_tensor.num_samples()*output_tensor.nr()*output_tensor.nc());
        loss = 0;
        float* const g = grad.host();
        const float* const out_data = output_tensor.host();
        for (long sample = 0; sample < output_tensor.num_samples(); ++sample, ++truth)
        {
            for (long row = 0; row < output_tensor.nr(); ++row)
            {
                for (long col = 0; col < output_tensor.nc(); ++col)
                {
                    const float y = (*truth)(row, col);
                    const size_t idx = tensor_index(output_tensor, sample, 0, row, col);
                    if (y > 0.f)
                    {
                        // Positive label: per-pixel loss is y*log(1+exp(-out)).
                        const float pos_term = log1pexp(-out_data[idx]);
                        loss += y*scale*pos_term;
                        g[idx] = y*scale*(g[idx]-1);
                    }
                    else if (y < 0.f)
                    {
                        // Negative label: per-pixel loss is -y*(out + log(1+exp(-out))).
                        const float neg_term = -(-out_data[idx]-log1pexp(-out_data[idx]));
                        loss += -y*scale*neg_term;
                        g[idx] = -y*scale*g[idx];
                    }
                    else
                    {
                        // A zero label contributes nothing to the loss or the gradient.
                        g[idx] = 0.f;
                    }
                }
            }
        }
    }
};
// -----------------------------------------------------------------------------------
class compute_loss_multiclass_log_per_pixel
{
    /*! The point of this class is to compute the loss for loss_multiclass_log_per_pixel_
        on the cpu to provide an analogous implementation of the cuda version
    !*/
public:
    compute_loss_multiclass_log_per_pixel()
    {
    }

    template <typename const_label_iterator>
    void operator()(
        const_label_iterator truth,
        const tensor& output_tensor,
        tensor& grad,
        double& loss
    ) const
    {
        // Seed the gradient with softmax(output); it is then adjusted in place below.
        softmax(grad, output_tensor);
        // The reported loss is averaged over the mini-batch and over every pixel of the output.
        const double scale = 1.0 / (output_tensor.num_samples() * output_tensor.nr() * output_tensor.nc());
        loss = 0;
        float* const g = grad.host();
        for (long sample = 0; sample < output_tensor.num_samples(); ++sample, ++truth)
        {
            for (long row = 0; row < output_tensor.nr(); ++row)
            {
                for (long col = 0; col < output_tensor.nc(); ++col)
                {
                    const uint16_t y = (*truth)(row, col);
                    // The network must produce a number of outputs that is equal to the number
                    // of labels when using this type of loss.
                    DLIB_CASSERT(static_cast<long>(y) < output_tensor.k() || y == label_to_ignore,
                                 "y: " << y << ", output_tensor.k(): " << output_tensor.k());
                    for (long k = 0; k < output_tensor.k(); ++k)
                    {
                        const size_t idx = tensor_index(output_tensor, sample, k, row, col);
                        if (k == y)
                        {
                            // Channel matching the true label: cross-entropy term plus
                            // the softmax-minus-one gradient.
                            loss += scale*-safe_log(g[idx]);
                            g[idx] = scale*(g[idx] - 1);
                        }
                        else if (y == label_to_ignore)
                        {
                            // Ignored pixels contribute no gradient.
                            g[idx] = 0.f;
                        }
                        else
                        {
                            g[idx] = scale*g[idx];
                        }
                    }
                }
            }
        }
    }

private:
    // uint16_t max is reserved as the "don't care" label value.
    static const uint16_t label_to_ignore = std::numeric_limits<uint16_t>::max();
};
// -----------------------------------------------------------------------------------
class compute_loss_multiclass_log_per_pixel_weighted
{
......@@ -580,20 +706,6 @@ namespace dlib
}
}
private:
template <typename T>
T safe_log(T input, T epsilon = 1e-10) const
{
// Prevent trying to calculate the logarithm of a very small number (let alone zero)
return std::log(std::max(input, epsilon));
}
static size_t tensor_index(const tensor& t, long sample, long k, long row, long column)
{
// See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column;
}
};
// -----------------------------------------------------------------------------------
......@@ -634,7 +746,7 @@ namespace dlib
for (long c = 0; c < output_tensor.nc(); ++c)
{
const float y = (*truth)[k].operator()(r, c);
const size_t idx = ((i * output_tensor.k() + k) * output_tensor.nr() + r) * output_tensor.nc() + c;
const size_t idx = tensor_index(output_tensor, i, k, r, c);
const float temp1 = y - out_data[idx];
const float temp2 = scale*temp1;
loss += temp2*temp1;
......
......@@ -242,13 +242,6 @@ namespace dlib
};
template <typename T>
T safe_log(T input, T epsilon = 1e-10)
{
// Prevent trying to calculate the logarithm of a very small number (let alone zero)
return std::log(std::max(input, epsilon));
}
template <typename SUBNET>
using loss_binary_log = add_loss_layer<loss_binary_log_, SUBNET>;
......@@ -2759,7 +2752,7 @@ namespace dlib
{
for (long c = 0; c < output_tensor.nc(); ++c)
{
iter->operator()(r, c) = out_data[tensor_index(output_tensor, i, r, c)];
iter->operator()(r, c) = out_data[tensor_index(output_tensor, i, 0, r, c)];
}
}
}
......@@ -2796,49 +2789,13 @@ namespace dlib
"output size = " << output_tensor.nr() << " x " << output_tensor.nc());
}
#ifdef DLIB_USE_CUDA
double loss;
#ifdef DLIB_USE_CUDA
cuda_compute(truth, output_tensor, grad, loss);
return loss;
#else
tt::sigmoid(grad, output_tensor);
// The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
const double scale = 1.0/(output_tensor.num_samples()*output_tensor.nr()*output_tensor.nc());
double loss = 0;
float* const g = grad.host();
const float* const out_data = output_tensor.host();
for (long i = 0; i < output_tensor.num_samples(); ++i, ++truth)
{
for (long r = 0; r < output_tensor.nr(); ++r)
{
for (long c = 0; c < output_tensor.nc(); ++c)
{
const float y = truth->operator()(r, c);
const size_t idx = tensor_index(output_tensor, i, r, c);
if (y > 0.f)
{
const float temp = log1pexp(-out_data[idx]);
loss += y*scale*temp;
g[idx] = y*scale*(g[idx]-1);
}
else if (y < 0.f)
{
const float temp = -(-out_data[idx]-log1pexp(-out_data[idx]));
loss += -y*scale*temp;
g[idx] = -y*scale*g[idx];
}
else
{
g[idx] = 0.f;
}
}
}
}
return loss;
cpu_compute(truth, output_tensor, grad, loss);
#endif
return loss;
}
friend void serialize(const loss_binary_log_per_pixel_& , std::ostream& out)
......@@ -2866,16 +2823,11 @@ namespace dlib
}
private:
static size_t tensor_index(const tensor& t, long sample, long row, long column)
{
DLIB_ASSERT(t.k() == 1);
// See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
return (sample * t.nr() + row) * t.nc() + column;
}
#ifdef DLIB_USE_CUDA
cuda::compute_loss_binary_log_per_pixel cuda_compute;
#else
cpu::compute_loss_binary_log_per_pixel cpu_compute;
#endif
};
......@@ -2982,51 +2934,13 @@ namespace dlib
}
#ifdef DLIB_USE_CUDA
double loss;
#ifdef DLIB_USE_CUDA
cuda_compute(truth, output_tensor, grad, loss);
return loss;
#else
tt::softmax(grad, output_tensor);
// The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
const double scale = 1.0 / (output_tensor.num_samples() * output_tensor.nr() * output_tensor.nc());
double loss = 0;
float* const g = grad.host();
for (long i = 0; i < output_tensor.num_samples(); ++i, ++truth)
{
for (long r = 0; r < output_tensor.nr(); ++r)
{
for (long c = 0; c < output_tensor.nc(); ++c)
{
const uint16_t y = truth->operator()(r, c);
// The network must produce a number of outputs that is equal to the number
// of labels when using this type of loss.
DLIB_CASSERT(static_cast<long>(y) < output_tensor.k() || y == label_to_ignore,
"y: " << y << ", output_tensor.k(): " << output_tensor.k());
for (long k = 0; k < output_tensor.k(); ++k)
{
const size_t idx = tensor_index(output_tensor, i, k, r, c);
if (k == y)
{
loss += scale*-safe_log(g[idx]);
g[idx] = scale*(g[idx] - 1);
}
else if (y == label_to_ignore)
{
g[idx] = 0.f;
}
else
{
g[idx] = scale*g[idx];
}
}
}
}
}
return loss;
cpu_compute(truth, output_tensor, grad, loss);
#endif
return loss;
}
friend void serialize(const loss_multiclass_log_per_pixel_& , std::ostream& out)
......@@ -3054,15 +2968,11 @@ namespace dlib
}
private:
static size_t tensor_index(const tensor& t, long sample, long k, long row, long column)
{
// See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column;
}
#ifdef DLIB_USE_CUDA
cuda::compute_loss_multiclass_log_per_pixel cuda_compute;
#else
cpu::compute_loss_multiclass_log_per_pixel cpu_compute;
#endif
};
......@@ -3158,11 +3068,7 @@ namespace dlib
}
private:
static size_t tensor_index(const tensor& t, long sample, long k, long row, long column)
{
// See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column;
}
#ifdef DLIB_USE_CUDA
cuda::compute_loss_multiclass_log_per_pixel_weighted cuda_compute;
#else
......@@ -3294,12 +3200,6 @@ namespace dlib
out << "<loss_mean_squared_per_pixel/>";
}
private:
static size_t tensor_index(const tensor& t, long sample, long k, long row, long column)
{
// See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column;
}
};
template <typename SUBNET>
......@@ -3419,11 +3319,7 @@ namespace dlib
}
private:
static size_t tensor_index(const tensor& t, long sample, long k, long row, long column)
{
// See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column;
}
#ifdef DLIB_USE_CUDA
cuda::compute_loss_mean_squared_per_channel_and_pixel cuda_compute;
#else
......
......@@ -3,6 +3,8 @@
#ifndef DLIB_DNn_MISC_h
#define DLIB_DNn_MISC_h
#include "../cuda/tensor.h"
namespace dlib
{
......@@ -22,6 +24,47 @@ namespace dlib
float weight = 1.f;
};
// ----------------------------------------------------------------------------------------
inline double log1pexp(double x)
{
    using std::exp;
    using namespace std; // Do this instead of using std::log1p because some compilers
                         // error out otherwise (E.g. gcc 4.9 in cygwin)
    // Numerically stable log(1+exp(x)): pick the formula best suited to each
    // range of x, working down from large values.
    if (x > 33.3)
        return x;               // exp(-x) is negligible relative to x here
    if (x > 18)
        return x + exp(-x);     // first-order expansion of log(1+exp(x)) - x
    if (x > -37)
        return log1p(exp(x));   // the straightforward formula is accurate here
    return exp(x);              // log1p(t) ~= t when t = exp(x) is tiny
}
// ----------------------------------------------------------------------------------------
// Returns log(max(input, epsilon)), i.e. the logarithm evaluated on a value
// clamped away from zero so we never compute log of a vanishingly small number.
template <typename T>
T safe_log(T input, T epsilon = 1e-10)
{
    if (input < epsilon)
        input = epsilon;
    return std::log(input);
}
// ----------------------------------------------------------------------------------------
// Returns the flat index into t's host buffer for the element at the given
// (sample, k, r, c) coordinates, assuming the row-major layout used by dlib
// tensors (sample-major, then channel, then row, then column).
// NOTE: declared inline rather than static — this lives in a header
// (misc.h), and a namespace-scope static would give every translation unit
// its own internal-linkage copy; inline is the conventional header linkage
// (and matches log1pexp above).
inline size_t tensor_index(
    const tensor& t,
    const long sample,
    const long k,
    const long r,
    const long c
)
{
    return ((sample * t.k() + k) * t.nr() + r) * t.nc() + c;
}
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_DNn_MISC_h
......
......@@ -11,23 +11,6 @@
namespace dlib
{
// ----------------------------------------------------------------------------------------
inline double log1pexp(double x)
{
using std::exp;
using namespace std; // Do this instead of using std::log1p because some compilers
// error out otherwise (E.g. gcc 4.9 in cygwin)
if (x <= -37)
return exp(x);
else if (-37 < x && x <= 18)
return log1p(exp(x));
else if (18 < x && x <= 33.3)
return x + exp(-x);
else
return x;
}
// ----------------------------------------------------------------------------------------
inline void randomize_parameters (
......
......@@ -18,6 +18,8 @@ namespace dlib
ensures
- returns log(1+exp(x))
(except computes it using a numerically accurate method)
NOTE: For technical reasons, it is defined in misc.h.
!*/
// ----------------------------------------------------------------------------------------
......
......@@ -2683,9 +2683,14 @@ namespace
cpu::compute_loss_mean_squared_per_channel_and_pixel cpu_compute;
double cuda_loss, cpu_loss;
const tensor& output_tensor = net.subnet().get_output();
tensor& grad = net.subnet().get_gradient_input();
cuda_compute(labels.begin(), output_tensor, grad, cuda_loss);
cpu_compute(labels.begin(), output_tensor, grad, cpu_loss);
resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor);
cuda_compute(labels.begin(), output_tensor, cuda_grad, cuda_loss);
cpu_compute(labels.begin(), output_tensor, cpu_grad, cpu_loss);
DLIB_TEST(cuda_grad.size() == cpu_grad.size());
for (size_t i = 0; i < cuda_grad.size(); ++i)
{
DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8);
}
const auto err = abs(cuda_loss - cpu_loss) / cpu_loss;
DLIB_TEST_MSG(err < 1e-6, "multi channel cuda and cpu losses differ");
#endif
......@@ -2883,6 +2888,23 @@ namespace
const int num_correct_required = static_cast<int>(::std::ceil(0.9 * num_correct_max));
DLIB_TEST_MSG(num_correct >= num_correct_required,
"Number of correctly classified elements = " << num_correct << ", required = " << num_correct_required);
#if DLIB_USE_CUDA
cuda::compute_loss_binary_log_per_pixel cuda_compute;
cpu::compute_loss_binary_log_per_pixel cpu_compute;
double cuda_loss, cpu_loss;
const tensor& output_tensor = net.subnet().get_output();
resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor);
cuda_compute(y.begin(), output_tensor, cuda_grad, cuda_loss);
cpu_compute(y.begin(), output_tensor, cpu_grad, cpu_loss);
DLIB_TEST(cuda_grad.size() == cpu_grad.size());
for (size_t i = 0; i < cuda_grad.size(); ++i)
{
DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8);
}
const auto err = abs(cuda_loss - cpu_loss) / cpu_loss;
DLIB_TEST_MSG(err < 1e-6, "binary log per pixel cuda and cpu losses differ");
#endif
}
// ----------------------------------------------------------------------------------------
......@@ -3217,6 +3239,23 @@ namespace
const int num_correct_required = static_cast<int>(::std::ceil(0.9 * num_correct_max));
DLIB_TEST_MSG(num_correct >= num_correct_required,
"Number of correctly classified elements = " << num_correct << ", required = " << num_correct_required);
#if DLIB_USE_CUDA
cuda::compute_loss_multiclass_log_per_pixel cuda_compute;
cpu::compute_loss_multiclass_log_per_pixel cpu_compute;
double cuda_loss, cpu_loss;
const tensor& output_tensor = net.subnet().get_output();
resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor);
cuda_compute(y.begin(), output_tensor, cuda_grad, cuda_loss);
cpu_compute(y.begin(), output_tensor, cpu_grad, cpu_loss);
DLIB_TEST(cuda_grad.size() == cpu_grad.size());
for (size_t i = 0; i < cuda_grad.size(); ++i)
{
DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8);
}
const auto err = abs(cuda_loss - cpu_loss) / cpu_loss;
DLIB_TEST_MSG(err < 1e-6, "multiclass log per pixel cuda and cpu losses differ");
#endif
}
// ----------------------------------------------------------------------------------------
......@@ -3317,9 +3356,14 @@ namespace
cpu::compute_loss_multiclass_log_per_pixel_weighted cpu_compute;
double cuda_loss, cpu_loss;
const tensor& output_tensor = net.subnet().get_output();
tensor& grad = net.subnet().get_gradient_input();
cuda_compute(y_weighted.begin(), output_tensor, grad, cuda_loss);
cpu_compute(y_weighted.begin(), output_tensor, grad, cpu_loss);
resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor);
cuda_compute(y_weighted.begin(), output_tensor, cuda_grad, cuda_loss);
cpu_compute(y_weighted.begin(), output_tensor, cpu_grad, cpu_loss);
DLIB_TEST(cuda_grad.size() == cpu_grad.size());
for (size_t i = 0; i < cuda_grad.size(); ++i)
{
DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8);
}
const auto err = abs(cuda_loss - cpu_loss) / cpu_loss;
DLIB_TEST_MSG(err < 1e-6, "multi class log per pixel weighted cuda and cpu losses differ");
#endif
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment