"...git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "3f329a426a09d0bf3f96095301042a5903bc78eb"
Unverified Commit 55e9c890 authored by Adrià Arrufat, committed by GitHub

Add cuda implementation for loss_mean_squared_per_channel_and_pixel (#2053)



* wip: attempt to use cuda for loss mse channel

* wip: maybe this is a step in the right direction

* Try to fix dereferencing the truth data (#1)

* Try to fix dereferencing the truth data

* Fix memory layout

* fix loss scaling and update tests

* rename temp1 to temp

* re-add lambda captures for output_width and output_height

clangd was complaining about these and suggested removing them
in the first place:

```
Lambda capture 'output_height' is not required to be captured for this use (fix available)
Lambda capture 'output_width' is not required to be captured for this use (fix available)
```
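
For reference, the diagnostic comes from C++'s ODR-use rules: a const integral local initialized with a constant expression can have its value read inside a lambda without being captured, so clangd sees the explicit capture as redundant even when a build configuration still wants it spelled out. A minimal standalone sketch (not part of this patch):

```
#include <cstdio>

int main()
{
    const long output_width = 64;  // const integral, constant initializer
    // clangd: "Lambda capture 'output_width' is not required to be captured
    // for this use" -- reading the value here does not ODR-use the variable.
    auto print = [output_width] { std::printf("%ld\n", output_width); };
    print();
}
```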

* add a weighted_label typedef to loss_multiclass_log_weighted_ for consistency

* update docs for weighted losses

* refactor multi channel loss and add cpu-cuda tests

* make operator() const

* make error relative to the loss value
Co-authored-by: Juha Reunanen <juha.reunanen@tomaattinen.com>
parent b42722a7
@@ -521,6 +521,57 @@ namespace dlib

    // -----------------------------------------------------------------------------------

        class compute_loss_mean_squared_per_channel_and_pixel
        {
            /*!
                The point of this class is to compute the loss for
                loss_mean_squared_per_channel_and_pixel_ on the cpu to provide an
                analogous implementation of the cuda version.
            !*/
        public:

            compute_loss_mean_squared_per_channel_and_pixel(
            )
            {
            }

            template <
                typename const_label_iterator
                >
            void operator()(
                const_label_iterator truth,
                const tensor& output_tensor,
                tensor& grad,
                double& loss
            ) const
            {
                // The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
                const double scale = 1.0 / (output_tensor.num_samples() * output_tensor.k() * output_tensor.nr() * output_tensor.nc());
                loss = 0;
                float* const g = grad.host();
                const float* out_data = output_tensor.host();
                for (long i = 0; i < output_tensor.num_samples(); ++i, ++truth)
                {
                    for (long k = 0; k < output_tensor.k(); ++k)
                    {
                        for (long r = 0; r < output_tensor.nr(); ++r)
                        {
                            for (long c = 0; c < output_tensor.nc(); ++c)
                            {
                                const float y = (*truth)[k].operator()(r, c);
                                const size_t idx = ((i * output_tensor.k() + k) * output_tensor.nr() + r) * output_tensor.nc() + c;
                                const float temp1 = y - out_data[idx];
                                const float temp2 = scale*temp1;
                                loss += temp2*temp1;
                                g[idx] = -temp2;
                            }
                        }
                    }
                }
            }
        };

    // -----------------------------------------------------------------------------------

    }
}
...
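
For orientation, a hypothetical standalone use of the functor above might look like this; the tensor sizes, the zero-valued targets, and the use of dlib's tt::tensor_rand to fill the output are illustrative assumptions, not code from the patch:

```
#include <dlib/dnn.h>
#include <array>
#include <vector>

int main()
{
    using namespace dlib;
    // Two samples with K=2 channels of 3x3 outputs, plus a matching gradient tensor.
    resizable_tensor output(2, 2, 3, 3), grad(2, 2, 3, 3);
    tt::tensor_rand rnd;
    rnd.fill_gaussian(output);

    // One std::array<matrix<float>, K> label per sample; all-zero targets here.
    std::vector<std::array<matrix<float>, 2>> labels(2);
    for (auto& label : labels)
        for (auto& channel : label)
            channel = zeros_matrix<float>(3, 3);

    double loss;
    cpu::compute_loss_mean_squared_per_channel_and_pixel compute;
    compute(labels.begin(), output, grad, loss);  // fills grad, sets loss
    return 0;
}
```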
@@ -1833,6 +1833,21 @@ namespace dlib

            warp_reduce_atomic_add(*loss_out, loss);
        }

    // ----------------------------------------------------------------------------------------

        __global__ void _cuda_compute_loss_mean_squared_per_channel_and_pixel(float* loss_out, float* g, const float* truth, const float* out_data, size_t n, const float scale)
        {
            float loss = 0;
            for (auto i : grid_stride_range(0, n))
            {
                const float y = truth[i];
                const float temp = y - out_data[i];
                loss += temp * temp;
                g[i] = -temp * scale;
            }
            warp_reduce_atomic_add(*loss_out, loss);
        }

    // ----------------------------------------------------------------------------------------

        void compute_loss_binary_log_per_pixel::
@@ -1882,6 +1897,28 @@

            loss = scale*floss;
        }

        void compute_loss_mean_squared_per_channel_and_pixel::
        do_work(
            cuda_data_ptr<float> loss_work_buffer,
            cuda_data_ptr<const float> truth_buffer,
            const tensor& subnetwork_output,
            tensor& gradient,
            double& loss
        )
        {
            CHECK_CUDA(cudaMemset(loss_work_buffer, 0, sizeof(float)));
            // The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
            const double scale = 1.0 / (subnetwork_output.num_samples() * subnetwork_output.k() * subnetwork_output.nr() * subnetwork_output.nc());
            launch_kernel(_cuda_compute_loss_mean_squared_per_channel_and_pixel, max_jobs(gradient.size()),
                          loss_work_buffer.data(), gradient.device(), truth_buffer.data(), subnetwork_output.device(), gradient.size(), scale);
            float floss;
            dlib::cuda::memcpy(&floss, loss_work_buffer);
            loss = scale*floss;
        }

    // ----------------------------------------------------------------------------------------

    }
...
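
dlib's grid_stride_range and warp_reduce_atomic_add helpers hide the usual CUDA boilerplate. Roughly, the kernel above amounts to the following plain-CUDA sketch (simplified: one atomicAdd per thread instead of dlib's warp-level reduction, so it is equivalent up to floating-point summation order but slower):

```
__global__ void mse_per_channel_and_pixel(float* loss_out, float* g,
                                          const float* truth, const float* out_data,
                                          size_t n, const float scale)
{
    float loss = 0;
    // Grid-stride loop: each thread visits every (blockDim.x*gridDim.x)-th
    // element, so the kernel handles any n regardless of launch configuration.
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += (size_t)blockDim.x * gridDim.x)
    {
        const float temp = truth[i] - out_data[i];
        loss += temp * temp;
        g[i] = -temp * scale;
    }
    // Fold each thread's partial sum into the single accumulator.
    atomicAdd(loss_out, loss);
}
```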
@@ -557,6 +557,70 @@ namespace dlib

            mutable cuda_data_void_ptr buf;
        };

        class compute_loss_mean_squared_per_channel_and_pixel
        {
            /*!
                The point of this class is to compute the loss computed by
                loss_mean_squared_per_channel_and_pixel_, but to do so with CUDA.
            !*/
        public:

            compute_loss_mean_squared_per_channel_and_pixel(
            )
            {
            }

            template <
                typename const_label_iterator
                >
            void operator() (
                const_label_iterator truth,
                const tensor& subnetwork_output,
                tensor& gradient,
                double& loss
            ) const
            {
                const auto image_size = subnetwork_output.nr()*subnetwork_output.nc()*subnetwork_output.k();
                const size_t bytes_per_plane = image_size*sizeof(float);
                // Allocate a cuda buffer to store all the truth images and also one float
                // for the scalar loss output.
                buf = device_global_buffer(subnetwork_output.num_samples()*bytes_per_plane + sizeof(float));

                cuda_data_ptr<float> loss_buf = static_pointer_cast<float>(buf, 1);
                buf = buf+sizeof(float);

                const size_t bytes_per_channel = subnetwork_output.nr()*subnetwork_output.nc()*sizeof(float);
                // copy the truth data into a cuda buffer.
                for (long i = 0; i < subnetwork_output.num_samples(); ++i, ++truth)
                {
                    const auto& t = *truth;
                    DLIB_ASSERT(t.size() == subnetwork_output.k());
                    for (size_t j = 0; j < t.size(); ++j)
                    {
                        DLIB_ASSERT(t[j].nr() == subnetwork_output.nr());
                        DLIB_ASSERT(t[j].nc() == subnetwork_output.nc());
                        memcpy(buf + i*bytes_per_plane + j*bytes_per_channel, &t[j](0,0), bytes_per_channel);
                    }
                }

                auto truth_buf = static_pointer_cast<const float>(buf, subnetwork_output.num_samples()*image_size);
                do_work(loss_buf, truth_buf, subnetwork_output, gradient, loss);
            }

        private:

            static void do_work(
                cuda_data_ptr<float> loss_work_buffer,
                cuda_data_ptr<const float> truth_buffer,
                const tensor& subnetwork_output,
                tensor& gradient,
                double& loss
            );

            mutable cuda_data_void_ptr buf;
        };

    // ------------------------------------------------------------------------------------
    // ------------------------------------------------------------------------------------
    // ------------------------------------------------------------------------------------

...
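
The packing above places one float for the scalar loss at the front of the allocation, followed by each sample's channels back to back in row-major order. A small sketch of the resulting float offsets (hypothetical helper, not part of the patch):

```
#include <cstddef>

// Offset (in floats) of truth element (sample, k, r, c) inside the packed
// buffer, counted from the start of the allocation; the leading 1 skips
// the loss float.
size_t packed_truth_offset(long sample, long k, long r, long c,
                           long K, long nr, long nc)
{
    const size_t channel = (size_t)nr * nc;     // floats per channel
    const size_t plane   = (size_t)K * channel; // floats per sample
    return 1 + sample * plane + k * channel + r * nc + c;
}
```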
@@ -389,7 +389,8 @@ namespace dlib
     {
     public:

-        typedef weighted_label<unsigned long> training_label_type;
+        typedef dlib::weighted_label<unsigned long> weighted_label;
+        typedef weighted_label training_label_type;
         typedef unsigned long output_label_type;

         template <
@@ -3300,30 +3301,12 @@
                         "output size = " << output_tensor.nr() << " x " << output_tensor.nc());
                 }
             }

-            // The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
-            const double scale = 1.0 / (output_tensor.num_samples() * output_tensor.k() * output_tensor.nr() * output_tensor.nc());
-            double loss = 0;
-            float* const g = grad.host();
-            const float* out_data = output_tensor.host();
-            for (long i = 0; i < output_tensor.num_samples(); ++i, ++truth)
-            {
-                for (long k = 0; k < output_tensor.k(); ++k)
-                {
-                    for (long r = 0; r < output_tensor.nr(); ++r)
-                    {
-                        for (long c = 0; c < output_tensor.nc(); ++c)
-                        {
-                            const float y = (*truth)[k].operator()(r, c);
-                            const size_t idx = tensor_index(output_tensor, i, k, r, c);
-                            const float temp1 = y - out_data[idx];
-                            const float temp2 = scale*temp1;
-                            loss += temp2*temp1;
-                            g[idx] = -temp2;
-                        }
-                    }
-                }
-            }
+            double loss;
+#ifdef DLIB_USE_CUDA
+            cuda_compute(truth, output_tensor, grad, loss);
+#else
+            cpu_compute(truth, output_tensor, grad, loss);
+#endif
             return loss;
         }
@@ -3357,6 +3340,11 @@
             // See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
             return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column;
         }

+#ifdef DLIB_USE_CUDA
+        cuda::compute_loss_mean_squared_per_channel_and_pixel cuda_compute;
+#else
+        cpu::compute_loss_mean_squared_per_channel_and_pixel cpu_compute;
+#endif
     };

     template <long num_channels, typename SUBNET>
...
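
The members added here follow dlib's usual dispatch idiom: the cpu:: and cuda:: namespaces each expose a functor with the same name and call signature, and the loss layer holds whichever one the build selects, so the calling code is identical either way. A toy reduction of the pattern (illustrative types, not dlib code):

```
namespace cpu  { struct compute_square { double operator()(double x) const { return x * x; } }; }
namespace cuda { struct compute_square { double operator()(double x) const { return x * x; } }; }

class loss_layer
{
public:
    double loss(double x) const { return compute(x); }

private:
    // The type is chosen at compile time; callers never see the difference.
#ifdef DLIB_USE_CUDA
    cuda::compute_square compute;
#else
    cpu::compute_square compute;
#endif
};
```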
@@ -423,7 +423,8 @@ namespace dlib
     public:

-        typedef weighted_label<unsigned long> training_label_type;
+        typedef dlib::weighted_label<unsigned long> weighted_label;
+        typedef weighted_label training_label_type;
         typedef unsigned long output_label_type;

         template <
@@ -1535,7 +1536,8 @@ namespace dlib
         !*/

     public:

-        typedef matrix<weighted_label<uint16_t>> training_label_type;
+        typedef dlib::weighted_label<uint16_t> weighted_label;
+        typedef matrix<weighted_label> training_label_type;
         typedef matrix<uint16_t> output_label_type;

         template <
...
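
With the nested typedef, user code can name the weighted label type through the loss class instead of spelling out dlib::weighted_label<...>. A hypothetical snippet (assuming weighted_label's (label, weight) constructor; the values are illustrative):

```
#include <dlib/dnn.h>

int main()
{
    // Label 3 with weight 2, reachable through the loss type after this patch.
    dlib::loss_multiclass_log_weighted_::weighted_label lbl(3, 2.0f);
    dlib::loss_multiclass_log_weighted_::training_label_type same_type = lbl;
    (void)same_type;
    return 0;
}
```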
@@ -2580,7 +2580,7 @@ namespace

         print_spinner();
         const int num_samples = 1000;
-        const long num_channels = 2;
+        const long num_channels = 10;
         const long dimension = 3;
         ::std::vector<matrix<float>> inputs;
         ::std::vector<::std::array<matrix<float>, num_channels>> labels;
@@ -2628,9 +2628,21 @@ namespace
         trainer.set_iterations_without_progress_threshold(500);
         trainer.set_min_learning_rate(1e-6);
         trainer.set_mini_batch_size(50);
+        trainer.set_max_num_epochs(100);
         trainer.train(inputs, labels);

         const auto error_after = compute_error();
         DLIB_TEST_MSG(error_after < error_before, "multi channel error increased after training");
+#ifdef DLIB_USE_CUDA
+        cuda::compute_loss_mean_squared_per_channel_and_pixel cuda_compute;
+        cpu::compute_loss_mean_squared_per_channel_and_pixel cpu_compute;
+        double cuda_loss, cpu_loss;
+        const tensor& output_tensor = net.subnet().get_output();
+        tensor& grad = net.subnet().get_gradient_input();
+        cuda_compute(labels.begin(), output_tensor, grad, cuda_loss);
+        cpu_compute(labels.begin(), output_tensor, grad, cpu_loss);
+        const auto err = ::std::abs(cuda_loss - cpu_loss) / cpu_loss;
+        DLIB_TEST_MSG(err < 1e-6, "multi channel cuda and cpu losses differ");
+#endif
     }

// ----------------------------------------------------------------------------------------
...
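
Per the "make error relative to the loss value" commit above, the test compares the two losses with a relative rather than absolute tolerance: floating-point rounding error scales with the magnitude of the accumulated sum, so a fixed epsilon would be too strict for large losses and too lax for tiny ones. A minimal sketch of the check:

```
#include <cassert>
#include <cmath>

// Relative closeness, mirroring the comparison in the test above
// (the helper itself is illustrative, not dlib code).
bool close_relative(double a, double b, double tol = 1e-6)
{
    return std::abs(a - b) / std::abs(b) < tol;
}

int main()
{
    assert(close_relative(1234.5678901, 1234.5678902)); // ~8e-11 relative gap
    assert(!close_relative(1.0, 1.1));                  // 10% apart
    return 0;
}
```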