"...git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "3f329a426a09d0bf3f96095301042a5903bc78eb"
Unverified Commit 55e9c890 authored by Adrià Arrufat, committed by GitHub

Add cuda implementation for loss_mean_squared_per_channel_and_pixel (#2053)



* wip: attempt to use cuda for loss mse channel

* wip: maybe this is a step in the right direction

* Try to fix dereferencing the truth data (#1)

* Try to fix dereferencing the truth data

* Fix memory layout

* fix loss scaling and update tests

* rename temp1 to temp

* re-add lambda captures for output_width and output_height

clangd was complaining about these and suggested removing them
in the first place:

```
Lambda capture 'output_height' is not required to be captured for this use (fix available)
Lambda capture 'output_width' is not required to be captured for this use (fix available)
```
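
For reference, the diagnostic comes from C++'s ODR-use rules: a const integral local initialized with a constant expression can have its value read inside a lambda without being captured, so clangd sees the explicit capture as redundant even when a build configuration still wants it spelled out. A minimal standalone sketch (not part of this patch):

```
#include <cstdio>

int main()
{
    const long output_width = 64;  // const integral, constant initializer
    // clangd: "Lambda capture 'output_width' is not required to be captured
    // for this use" -- reading the value here does not ODR-use the variable.
    auto print = [output_width] { std::printf("%ld\n", output_width); };
    print();
}
```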

* add a weighted_label typedef to loss_multiclass_log_weighted_ for consistency

* update docs for weighted losses

* refactor multi channel loss and add cpu-cuda tests

* make operator() const

* make error relative to the loss value
Co-authored-by: Juha Reunanen <juha.reunanen@tomaattinen.com>
parent b42722a7
@@ -521,6 +521,57 @@ namespace dlib

    // -----------------------------------------------------------------------------------

        class compute_loss_mean_squared_per_channel_and_pixel
        {
            /*!
                The point of this class is to compute the loss for
                loss_mean_squared_per_channel_and_pixel_ on the cpu to provide an
                analogous implementation of the cuda version.
            !*/
        public:

            compute_loss_mean_squared_per_channel_and_pixel(
            )
            {
            }

            template <
                typename const_label_iterator
                >
            void operator()(
                const_label_iterator truth,
                const tensor& output_tensor,
                tensor& grad,
                double& loss
            ) const
            {
                // The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
                const double scale = 1.0 / (output_tensor.num_samples() * output_tensor.k() * output_tensor.nr() * output_tensor.nc());
                loss = 0;
                float* const g = grad.host();
                const float* out_data = output_tensor.host();
                for (long i = 0; i < output_tensor.num_samples(); ++i, ++truth)
                {
                    for (long k = 0; k < output_tensor.k(); ++k)
                    {
                        for (long r = 0; r < output_tensor.nr(); ++r)
                        {
                            for (long c = 0; c < output_tensor.nc(); ++c)
                            {
                                const float y = (*truth)[k].operator()(r, c);
                                const size_t idx = ((i * output_tensor.k() + k) * output_tensor.nr() + r) * output_tensor.nc() + c;
                                const float temp1 = y - out_data[idx];
                                const float temp2 = scale*temp1;
                                loss += temp2*temp1;
                                g[idx] = -temp2;
                            }
                        }
                    }
                }
            }
        };

    // -----------------------------------------------------------------------------------

    }
}
...
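
For orientation, a hypothetical standalone use of the functor above might look like this; the tensor sizes, the zero-valued targets, and the use of dlib's tt::tensor_rand to fill the output are illustrative assumptions, not code from the patch:

```
#include <dlib/dnn.h>
#include <array>
#include <vector>

int main()
{
    using namespace dlib;
    // Two samples with K=2 channels of 3x3 outputs, plus a matching gradient tensor.
    resizable_tensor output(2, 2, 3, 3), grad(2, 2, 3, 3);
    tt::tensor_rand rnd;
    rnd.fill_gaussian(output);

    // One std::array<matrix<float>, K> label per sample; all-zero targets here.
    std::vector<std::array<matrix<float>, 2>> labels(2);
    for (auto& label : labels)
        for (auto& channel : label)
            channel = zeros_matrix<float>(3, 3);

    double loss;
    cpu::compute_loss_mean_squared_per_channel_and_pixel compute;
    compute(labels.begin(), output, grad, loss);  // fills grad, sets loss
    return 0;
}
```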
@@ -1833,6 +1833,21 @@ namespace dlib

            warp_reduce_atomic_add(*loss_out, loss);
        }

    // ----------------------------------------------------------------------------------------

        __global__ void _cuda_compute_loss_mean_squared_per_channel_and_pixel(float* loss_out, float* g, const float* truth, const float* out_data, size_t n, const float scale)
        {
            float loss = 0;
            for (auto i : grid_stride_range(0, n))
            {
                const float y = truth[i];
                const float temp = y - out_data[i];
                loss += temp * temp;
                g[i] = -temp * scale;
            }
            warp_reduce_atomic_add(*loss_out, loss);
        }

    // ----------------------------------------------------------------------------------------

        void compute_loss_binary_log_per_pixel::
@@ -1882,6 +1897,28 @@

            loss = scale*floss;
        }

        void compute_loss_mean_squared_per_channel_and_pixel::
        do_work(
            cuda_data_ptr<float> loss_work_buffer,
            cuda_data_ptr<const float> truth_buffer,
            const tensor& subnetwork_output,
            tensor& gradient,
            double& loss
        )
        {
            CHECK_CUDA(cudaMemset(loss_work_buffer, 0, sizeof(float)));
            // The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
            const double scale = 1.0 / (subnetwork_output.num_samples() * subnetwork_output.k() * subnetwork_output.nr() * subnetwork_output.nc());
            launch_kernel(_cuda_compute_loss_mean_squared_per_channel_and_pixel, max_jobs(gradient.size()),
                          loss_work_buffer.data(), gradient.device(), truth_buffer.data(), subnetwork_output.device(), gradient.size(), scale);
            float floss;
            dlib::cuda::memcpy(&floss, loss_work_buffer);
            loss = scale*floss;
        }

    // ----------------------------------------------------------------------------------------

    }
...
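
dlib's grid_stride_range and warp_reduce_atomic_add helpers hide the usual CUDA boilerplate. Roughly, the kernel above amounts to the following plain-CUDA sketch (simplified: one atomicAdd per thread instead of dlib's warp-level reduction, so it is equivalent up to floating-point summation order but slower):

```
__global__ void mse_per_channel_and_pixel(float* loss_out, float* g,
                                          const float* truth, const float* out_data,
                                          size_t n, const float scale)
{
    float loss = 0;
    // Grid-stride loop: each thread visits every (blockDim.x*gridDim.x)-th
    // element, so the kernel handles any n regardless of launch configuration.
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += (size_t)blockDim.x * gridDim.x)
    {
        const float temp = truth[i] - out_data[i];
        loss += temp * temp;
        g[i] = -temp * scale;
    }
    // Fold each thread's partial sum into the single accumulator.
    atomicAdd(loss_out, loss);
}
```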
@@ -557,6 +557,70 @@ namespace dlib

            mutable cuda_data_void_ptr buf;
        };

        class compute_loss_mean_squared_per_channel_and_pixel
        {
            /*!
                The point of this class is to compute the loss computed by
                loss_mean_squared_per_channel_and_pixel_, but to do so with CUDA.
            !*/
        public:

            compute_loss_mean_squared_per_channel_and_pixel(
            )
            {
            }

            template <
                typename const_label_iterator
                >
            void operator() (
                const_label_iterator truth,
                const tensor& subnetwork_output,
                tensor& gradient,
                double& loss
            ) const
            {
                const auto image_size = subnetwork_output.nr()*subnetwork_output.nc()*subnetwork_output.k();
                const size_t bytes_per_plane = image_size*sizeof(float);
                // Allocate a cuda buffer to store all the truth images and also one float
                // for the scalar loss output.
                buf = device_global_buffer(subnetwork_output.num_samples()*bytes_per_plane + sizeof(float));

                cuda_data_ptr<float> loss_buf = static_pointer_cast<float>(buf, 1);
                buf = buf+sizeof(float);

                const size_t bytes_per_channel = subnetwork_output.nr()*subnetwork_output.nc()*sizeof(float);
                // copy the truth data into a cuda buffer.
                for (long i = 0; i < subnetwork_output.num_samples(); ++i, ++truth)
                {
                    const auto& t = *truth;
                    DLIB_ASSERT(t.size() == subnetwork_output.k());
                    for (size_t j = 0; j < t.size(); ++j)
                    {
                        DLIB_ASSERT(t[j].nr() == subnetwork_output.nr());
                        DLIB_ASSERT(t[j].nc() == subnetwork_output.nc());
                        memcpy(buf + i*bytes_per_plane + j*bytes_per_channel, &t[j](0,0), bytes_per_channel);
                    }
                }

                auto truth_buf = static_pointer_cast<const float>(buf, subnetwork_output.num_samples()*image_size);
                do_work(loss_buf, truth_buf, subnetwork_output, gradient, loss);
            }

        private:

            static void do_work(
                cuda_data_ptr<float> loss_work_buffer,
                cuda_data_ptr<const float> truth_buffer,
                const tensor& subnetwork_output,
                tensor& gradient,
                double& loss
            );

            mutable cuda_data_void_ptr buf;
        };

    // ------------------------------------------------------------------------------------
    // ------------------------------------------------------------------------------------
    // ------------------------------------------------------------------------------------

...
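
The packing above places one float for the scalar loss at the front of the allocation, followed by each sample's channels back to back in row-major order. A small sketch of the resulting float offsets (hypothetical helper, not part of the patch):

```
#include <cstddef>

// Offset (in floats) of truth element (sample, k, r, c) inside the packed
// buffer, counted from the start of the allocation; the leading 1 skips
// the loss float.
size_t packed_truth_offset(long sample, long k, long r, long c,
                           long K, long nr, long nc)
{
    const size_t channel = (size_t)nr * nc;     // floats per channel
    const size_t plane   = (size_t)K * channel; // floats per sample
    return 1 + sample * plane + k * channel + r * nc + c;
}
```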
@@ -389,7 +389,8 @@ namespace dlib
     {
     public:

-        typedef weighted_label<unsigned long> training_label_type;
+        typedef dlib::weighted_label<unsigned long> weighted_label;
+        typedef weighted_label training_label_type;
         typedef unsigned long output_label_type;

         template <
@@ -3300,30 +3301,12 @@
                         "output size = " << output_tensor.nr() << " x " << output_tensor.nc());
                 }
             }

-            // The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
-            const double scale = 1.0 / (output_tensor.num_samples() * output_tensor.k() * output_tensor.nr() * output_tensor.nc());
-            double loss = 0;
-            float* const g = grad.host();
-            const float* out_data = output_tensor.host();
-            for (long i = 0; i < output_tensor.num_samples(); ++i, ++truth)
-            {
-                for (long k = 0; k < output_tensor.k(); ++k)
-                {
-                    for (long r = 0; r < output_tensor.nr(); ++r)
-                    {
-                        for (long c = 0; c < output_tensor.nc(); ++c)
-                        {
-                            const float y = (*truth)[k].operator()(r, c);
-                            const size_t idx = tensor_index(output_tensor, i, k, r, c);
-                            const float temp1 = y - out_data[idx];
-                            const float temp2 = scale*temp1;
-                            loss += temp2*temp1;
-                            g[idx] = -temp2;
-                        }
-                    }
-                }
-            }
+            double loss;
+#ifdef DLIB_USE_CUDA
+            cuda_compute(truth, output_tensor, grad, loss);
+#else
+            cpu_compute(truth, output_tensor, grad, loss);
+#endif
             return loss;
         }
@@ -3357,6 +3340,11 @@
             // See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
             return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column;
         }

+#ifdef DLIB_USE_CUDA
+        cuda::compute_loss_mean_squared_per_channel_and_pixel cuda_compute;
+#else
+        cpu::compute_loss_mean_squared_per_channel_and_pixel cpu_compute;
+#endif
     };

     template <long num_channels, typename SUBNET>
...
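
The members added here follow dlib's usual dispatch idiom: the cpu:: and cuda:: namespaces each expose a functor with the same name and call signature, and the loss layer holds whichever one the build selects, so the calling code is identical either way. A toy reduction of the pattern (illustrative types, not dlib code):

```
namespace cpu  { struct compute_square { double operator()(double x) const { return x * x; } }; }
namespace cuda { struct compute_square { double operator()(double x) const { return x * x; } }; }

class loss_layer
{
public:
    double loss(double x) const { return compute(x); }

private:
    // The type is chosen at compile time; callers never see the difference.
#ifdef DLIB_USE_CUDA
    cuda::compute_square compute;
#else
    cpu::compute_square compute;
#endif
};
```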
@@ -423,7 +423,8 @@ namespace dlib
     public:

-        typedef weighted_label<unsigned long> training_label_type;
+        typedef dlib::weighted_label<unsigned long> weighted_label;
+        typedef weighted_label training_label_type;
         typedef unsigned long output_label_type;

         template <
@@ -1535,7 +1536,8 @@ namespace dlib
         !*/

     public:

-        typedef matrix<weighted_label<uint16_t>> training_label_type;
+        typedef dlib::weighted_label<uint16_t> weighted_label;
+        typedef matrix<weighted_label> training_label_type;
         typedef matrix<uint16_t> output_label_type;

         template <
...
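
With the nested typedef, user code can name the weighted label type through the loss class instead of spelling out dlib::weighted_label<...>. A hypothetical snippet (assuming weighted_label's (label, weight) constructor; the values are illustrative):

```
#include <dlib/dnn.h>

int main()
{
    // Label 3 with weight 2, reachable through the loss type after this patch.
    dlib::loss_multiclass_log_weighted_::weighted_label lbl(3, 2.0f);
    dlib::loss_multiclass_log_weighted_::training_label_type same_type = lbl;
    (void)same_type;
    return 0;
}
```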
@@ -2580,7 +2580,7 @@ namespace

         print_spinner();
         const int num_samples = 1000;
-        const long num_channels = 2;
+        const long num_channels = 10;
         const long dimension = 3;
         ::std::vector<matrix<float>> inputs;
         ::std::vector<::std::array<matrix<float>, num_channels>> labels;
@@ -2628,9 +2628,21 @@ namespace
         trainer.set_iterations_without_progress_threshold(500);
         trainer.set_min_learning_rate(1e-6);
         trainer.set_mini_batch_size(50);
+        trainer.set_max_num_epochs(100);
         trainer.train(inputs, labels);

         const auto error_after = compute_error();
         DLIB_TEST_MSG(error_after < error_before, "multi channel error increased after training");
+#ifdef DLIB_USE_CUDA
+        cuda::compute_loss_mean_squared_per_channel_and_pixel cuda_compute;
+        cpu::compute_loss_mean_squared_per_channel_and_pixel cpu_compute;
+        double cuda_loss, cpu_loss;
+        const tensor& output_tensor = net.subnet().get_output();
+        tensor& grad = net.subnet().get_gradient_input();
+        cuda_compute(labels.begin(), output_tensor, grad, cuda_loss);
+        cpu_compute(labels.begin(), output_tensor, grad, cpu_loss);
+        const auto err = ::std::abs(cuda_loss - cpu_loss) / cpu_loss;
+        DLIB_TEST_MSG(err < 1e-6, "multi channel cuda and cpu losses differ");
+#endif
     }

// ----------------------------------------------------------------------------------------
...
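
Per the "make error relative to the loss value" commit above, the test compares the two losses with a relative rather than absolute tolerance: floating-point rounding error scales with the magnitude of the accumulated sum, so a fixed epsilon would be too strict for large losses and too lax for tiny ones. A minimal sketch of the check:

```
#include <cassert>
#include <cmath>

// Relative closeness, mirroring the comparison in the test above
// (the helper itself is illustrative, not dlib code).
bool close_relative(double a, double b, double tol = 1e-6)
{
    return std::abs(a - b) / std::abs(b) < tol;
}

int main()
{
    assert(close_relative(1234.5678901, 1234.5678902)); // ~8e-11 relative gap
    assert(!close_relative(1.0, 1.1));                  // 10% apart
    return 0;
}
```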