"git@developer.sourcefind.cn:OpenDAS/torch-sparce.git" did not exist on "b8fb93bd1af845459c0587ac91dff02bc1589c9b"
Unverified Commit c45d166a authored by Adrià Arrufat's avatar Adrià Arrufat Committed by GitHub
Browse files

Test cuda losses (#2199)

* add cuda test for loss_binary_log_per_pixel and some needed refactoring

* add cuda test for loss_multiclass_log_per_pixel

* forgot to add cpu version in loss

* remove a line I added by mistake

* fix typos

* declare label_to_ignore as static

* use tensor_index function instead of index method

* test cuda and cpu gradients values

* use DLIB_TEST instead of DLIB_CASSERT
parent d78d273a
......@@ -522,6 +522,132 @@ namespace dlib
// -----------------------------------------------------------------------------------
class compute_loss_binary_log_per_pixel
{
    /*! The point of this class is to compute the loss for loss_binary_log_per_pixel_
        on the cpu to provide an analogous implementation of the cuda version
    !*/
public:
    compute_loss_binary_log_per_pixel()
    {
    }

    template <typename const_label_iterator>
    void operator()(
        const_label_iterator truth,
        const tensor& output_tensor,
        tensor& grad,
        double& loss
    ) const
    {
        // Seed the gradient with sigmoid(output); it is then adjusted in place below.
        sigmoid(grad, output_tensor);
        // The reported loss is averaged over the mini-batch and over every pixel of the output.
        const double scale = 1.0/(output_tensor.num_samples()*output_tensor.nr()*output_tensor.nc());
        loss = 0;
        float* const g = grad.host();
        const float* const out_data = output_tensor.host();
        for (long sample = 0; sample < output_tensor.num_samples(); ++sample, ++truth)
        {
            for (long row = 0; row < output_tensor.nr(); ++row)
            {
                for (long col = 0; col < output_tensor.nc(); ++col)
                {
                    const float y = (*truth)(row, col);
                    const size_t idx = tensor_index(output_tensor, sample, 0, row, col);
                    if (y > 0.f)
                    {
                        // Positive label: per-pixel loss is y*log(1+exp(-out)).
                        const float pos_term = log1pexp(-out_data[idx]);
                        loss += y*scale*pos_term;
                        g[idx] = y*scale*(g[idx]-1);
                    }
                    else if (y < 0.f)
                    {
                        // Negative label: per-pixel loss is -y*(out + log(1+exp(-out))).
                        const float neg_term = -(-out_data[idx]-log1pexp(-out_data[idx]));
                        loss += -y*scale*neg_term;
                        g[idx] = -y*scale*g[idx];
                    }
                    else
                    {
                        // A zero label contributes nothing to the loss or the gradient.
                        g[idx] = 0.f;
                    }
                }
            }
        }
    }
};
// -----------------------------------------------------------------------------------
class compute_loss_multiclass_log_per_pixel
{
    /*! The point of this class is to compute the loss for loss_multiclass_log_per_pixel_
        on the cpu to provide an analogous implementation of the cuda version
    !*/
public:
    compute_loss_multiclass_log_per_pixel()
    {
    }

    template <typename const_label_iterator>
    void operator()(
        const_label_iterator truth,
        const tensor& output_tensor,
        tensor& grad,
        double& loss
    ) const
    {
        // Seed the gradient with softmax(output); it is then adjusted in place below.
        softmax(grad, output_tensor);
        // The reported loss is averaged over the mini-batch and over every pixel of the output.
        const double scale = 1.0 / (output_tensor.num_samples() * output_tensor.nr() * output_tensor.nc());
        loss = 0;
        float* const g = grad.host();
        for (long sample = 0; sample < output_tensor.num_samples(); ++sample, ++truth)
        {
            for (long row = 0; row < output_tensor.nr(); ++row)
            {
                for (long col = 0; col < output_tensor.nc(); ++col)
                {
                    const uint16_t y = (*truth)(row, col);
                    // The network must produce a number of outputs that is equal to the number
                    // of labels when using this type of loss.
                    DLIB_CASSERT(static_cast<long>(y) < output_tensor.k() || y == label_to_ignore,
                                 "y: " << y << ", output_tensor.k(): " << output_tensor.k());
                    for (long k = 0; k < output_tensor.k(); ++k)
                    {
                        const size_t idx = tensor_index(output_tensor, sample, k, row, col);
                        if (k == y)
                        {
                            // Channel matching the true label: cross-entropy term plus
                            // the softmax-minus-one gradient.
                            loss += scale*-safe_log(g[idx]);
                            g[idx] = scale*(g[idx] - 1);
                        }
                        else if (y == label_to_ignore)
                        {
                            // Ignored pixels contribute no gradient.
                            g[idx] = 0.f;
                        }
                        else
                        {
                            g[idx] = scale*g[idx];
                        }
                    }
                }
            }
        }
    }

private:
    // uint16_t max is reserved as the "don't care" label value.
    static const uint16_t label_to_ignore = std::numeric_limits<uint16_t>::max();
};
// -----------------------------------------------------------------------------------
class compute_loss_multiclass_log_per_pixel_weighted
{
......@@ -580,20 +706,6 @@ namespace dlib
}
}
private:
template <typename T>
T safe_log(T input, T epsilon = 1e-10) const
{
// Prevent trying to calculate the logarithm of a very small number (let alone zero)
return std::log(std::max(input, epsilon));
}
static size_t tensor_index(const tensor& t, long sample, long k, long row, long column)
{
// See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column;
}
};
// -----------------------------------------------------------------------------------
......@@ -634,7 +746,7 @@ namespace dlib
for (long c = 0; c < output_tensor.nc(); ++c)
{
const float y = (*truth)[k].operator()(r, c);
const size_t idx = ((i * output_tensor.k() + k) * output_tensor.nr() + r) * output_tensor.nc() + c;
const size_t idx = tensor_index(output_tensor, i, k, r, c);
const float temp1 = y - out_data[idx];
const float temp2 = scale*temp1;
loss += temp2*temp1;
......
......@@ -242,13 +242,6 @@ namespace dlib
};
template <typename T>
T safe_log(T input, T epsilon = 1e-10)
{
// Prevent trying to calculate the logarithm of a very small number (let alone zero)
return std::log(std::max(input, epsilon));
}
template <typename SUBNET>
using loss_binary_log = add_loss_layer<loss_binary_log_, SUBNET>;
......@@ -2759,7 +2752,7 @@ namespace dlib
{
for (long c = 0; c < output_tensor.nc(); ++c)
{
iter->operator()(r, c) = out_data[tensor_index(output_tensor, i, r, c)];
iter->operator()(r, c) = out_data[tensor_index(output_tensor, i, 0, r, c)];
}
}
}
......@@ -2796,49 +2789,13 @@ namespace dlib
"output size = " << output_tensor.nr() << " x " << output_tensor.nc());
}
#ifdef DLIB_USE_CUDA
double loss;
#ifdef DLIB_USE_CUDA
cuda_compute(truth, output_tensor, grad, loss);
return loss;
#else
tt::sigmoid(grad, output_tensor);
// The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
const double scale = 1.0/(output_tensor.num_samples()*output_tensor.nr()*output_tensor.nc());
double loss = 0;
float* const g = grad.host();
const float* const out_data = output_tensor.host();
for (long i = 0; i < output_tensor.num_samples(); ++i, ++truth)
{
for (long r = 0; r < output_tensor.nr(); ++r)
{
for (long c = 0; c < output_tensor.nc(); ++c)
{
const float y = truth->operator()(r, c);
const size_t idx = tensor_index(output_tensor, i, r, c);
if (y > 0.f)
{
const float temp = log1pexp(-out_data[idx]);
loss += y*scale*temp;
g[idx] = y*scale*(g[idx]-1);
}
else if (y < 0.f)
{
const float temp = -(-out_data[idx]-log1pexp(-out_data[idx]));
loss += -y*scale*temp;
g[idx] = -y*scale*g[idx];
}
else
{
g[idx] = 0.f;
}
}
}
}
return loss;
cpu_compute(truth, output_tensor, grad, loss);
#endif
return loss;
}
friend void serialize(const loss_binary_log_per_pixel_& , std::ostream& out)
......@@ -2866,16 +2823,11 @@ namespace dlib
}
private:
static size_t tensor_index(const tensor& t, long sample, long row, long column)
{
DLIB_ASSERT(t.k() == 1);
// See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
return (sample * t.nr() + row) * t.nc() + column;
}
#ifdef DLIB_USE_CUDA
cuda::compute_loss_binary_log_per_pixel cuda_compute;
#else
cpu::compute_loss_binary_log_per_pixel cpu_compute;
#endif
};
......@@ -2982,51 +2934,13 @@ namespace dlib
}
#ifdef DLIB_USE_CUDA
double loss;
#ifdef DLIB_USE_CUDA
cuda_compute(truth, output_tensor, grad, loss);
return loss;
#else
tt::softmax(grad, output_tensor);
// The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
const double scale = 1.0 / (output_tensor.num_samples() * output_tensor.nr() * output_tensor.nc());
double loss = 0;
float* const g = grad.host();
for (long i = 0; i < output_tensor.num_samples(); ++i, ++truth)
{
for (long r = 0; r < output_tensor.nr(); ++r)
{
for (long c = 0; c < output_tensor.nc(); ++c)
{
const uint16_t y = truth->operator()(r, c);
// The network must produce a number of outputs that is equal to the number
// of labels when using this type of loss.
DLIB_CASSERT(static_cast<long>(y) < output_tensor.k() || y == label_to_ignore,
"y: " << y << ", output_tensor.k(): " << output_tensor.k());
for (long k = 0; k < output_tensor.k(); ++k)
{
const size_t idx = tensor_index(output_tensor, i, k, r, c);
if (k == y)
{
loss += scale*-safe_log(g[idx]);
g[idx] = scale*(g[idx] - 1);
}
else if (y == label_to_ignore)
{
g[idx] = 0.f;
}
else
{
g[idx] = scale*g[idx];
}
}
}
}
}
return loss;
cpu_compute(truth, output_tensor, grad, loss);
#endif
return loss;
}
friend void serialize(const loss_multiclass_log_per_pixel_& , std::ostream& out)
......@@ -3054,15 +2968,11 @@ namespace dlib
}
private:
static size_t tensor_index(const tensor& t, long sample, long k, long row, long column)
{
// See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column;
}
#ifdef DLIB_USE_CUDA
cuda::compute_loss_multiclass_log_per_pixel cuda_compute;
#else
cpu::compute_loss_multiclass_log_per_pixel cpu_compute;
#endif
};
......@@ -3158,11 +3068,7 @@ namespace dlib
}
private:
static size_t tensor_index(const tensor& t, long sample, long k, long row, long column)
{
// See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column;
}
#ifdef DLIB_USE_CUDA
cuda::compute_loss_multiclass_log_per_pixel_weighted cuda_compute;
#else
......@@ -3294,12 +3200,6 @@ namespace dlib
out << "<loss_mean_squared_per_pixel/>";
}
private:
static size_t tensor_index(const tensor& t, long sample, long k, long row, long column)
{
// See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column;
}
};
template <typename SUBNET>
......@@ -3419,11 +3319,7 @@ namespace dlib
}
private:
static size_t tensor_index(const tensor& t, long sample, long k, long row, long column)
{
// See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
return ((sample * t.k() + k) * t.nr() + row) * t.nc() + column;
}
#ifdef DLIB_USE_CUDA
cuda::compute_loss_mean_squared_per_channel_and_pixel cuda_compute;
#else
......
......@@ -3,6 +3,8 @@
#ifndef DLIB_DNn_MISC_h
#define DLIB_DNn_MISC_h
#include "../cuda/tensor.h"
namespace dlib
{
......@@ -22,6 +24,47 @@ namespace dlib
float weight = 1.f;
};
// ----------------------------------------------------------------------------------------
inline double log1pexp(double x)
{
    using std::exp;
    using namespace std; // Do this instead of using std::log1p because some compilers
                         // error out otherwise (E.g. gcc 4.9 in cygwin)
    // Numerically stable log(1+exp(x)): pick the formula best suited to each
    // range of x, working down from large values.
    if (x > 33.3)
        return x;               // exp(-x) is negligible relative to x here
    if (x > 18)
        return x + exp(-x);     // first-order expansion of log(1+exp(x)) - x
    if (x > -37)
        return log1p(exp(x));   // the straightforward formula is accurate here
    return exp(x);              // log1p(t) ~= t when t = exp(x) is tiny
}
// ----------------------------------------------------------------------------------------
// Returns log(max(input, epsilon)), i.e. the logarithm evaluated on a value
// clamped away from zero so we never compute log of a vanishingly small number.
template <typename T>
T safe_log(T input, T epsilon = 1e-10)
{
    if (input < epsilon)
        input = epsilon;
    return std::log(input);
}
// ----------------------------------------------------------------------------------------
// Returns the flat index into t's host buffer for the element at the given
// (sample, k, r, c) coordinates, assuming the row-major layout used by dlib
// tensors (sample-major, then channel, then row, then column).
// NOTE: declared inline rather than static — this lives in a header
// (misc.h), and a namespace-scope static would give every translation unit
// its own internal-linkage copy; inline is the conventional header linkage
// (and matches log1pexp above).
inline size_t tensor_index(
    const tensor& t,
    const long sample,
    const long k,
    const long r,
    const long c
)
{
    return ((sample * t.k() + k) * t.nr() + r) * t.nc() + c;
}
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_DNn_MISC_h
......
......@@ -11,23 +11,6 @@
namespace dlib
{
// ----------------------------------------------------------------------------------------
inline double log1pexp(double x)
{
using std::exp;
using namespace std; // Do this instead of using std::log1p because some compilers
// error out otherwise (E.g. gcc 4.9 in cygwin)
if (x <= -37)
return exp(x);
else if (-37 < x && x <= 18)
return log1p(exp(x));
else if (18 < x && x <= 33.3)
return x + exp(-x);
else
return x;
}
// ----------------------------------------------------------------------------------------
inline void randomize_parameters (
......
......@@ -18,6 +18,8 @@ namespace dlib
ensures
- returns log(1+exp(x))
(except computes it using a numerically accurate method)
NOTE: For technical reasons, it is defined in misc.h.
!*/
// ----------------------------------------------------------------------------------------
......
......@@ -2683,9 +2683,14 @@ namespace
cpu::compute_loss_mean_squared_per_channel_and_pixel cpu_compute;
double cuda_loss, cpu_loss;
const tensor& output_tensor = net.subnet().get_output();
tensor& grad = net.subnet().get_gradient_input();
cuda_compute(labels.begin(), output_tensor, grad, cuda_loss);
cpu_compute(labels.begin(), output_tensor, grad, cpu_loss);
resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor);
cuda_compute(labels.begin(), output_tensor, cuda_grad, cuda_loss);
cpu_compute(labels.begin(), output_tensor, cpu_grad, cpu_loss);
DLIB_TEST(cuda_grad.size() == cpu_grad.size());
for (size_t i = 0; i < cuda_grad.size(); ++i)
{
DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8);
}
const auto err = abs(cuda_loss - cpu_loss) / cpu_loss;
DLIB_TEST_MSG(err < 1e-6, "multi channel cuda and cpu losses differ");
#endif
......@@ -2883,6 +2888,23 @@ namespace
const int num_correct_required = static_cast<int>(::std::ceil(0.9 * num_correct_max));
DLIB_TEST_MSG(num_correct >= num_correct_required,
"Number of correctly classified elements = " << num_correct << ", required = " << num_correct_required);
#if DLIB_USE_CUDA
cuda::compute_loss_binary_log_per_pixel cuda_compute;
cpu::compute_loss_binary_log_per_pixel cpu_compute;
double cuda_loss, cpu_loss;
const tensor& output_tensor = net.subnet().get_output();
resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor);
cuda_compute(y.begin(), output_tensor, cuda_grad, cuda_loss);
cpu_compute(y.begin(), output_tensor, cpu_grad, cpu_loss);
DLIB_TEST(cuda_grad.size() == cpu_grad.size());
for (size_t i = 0; i < cuda_grad.size(); ++i)
{
DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8);
}
const auto err = abs(cuda_loss - cpu_loss) / cpu_loss;
DLIB_TEST_MSG(err < 1e-6, "binary log per pixel cuda and cpu losses differ");
#endif
}
// ----------------------------------------------------------------------------------------
......@@ -3217,6 +3239,23 @@ namespace
const int num_correct_required = static_cast<int>(::std::ceil(0.9 * num_correct_max));
DLIB_TEST_MSG(num_correct >= num_correct_required,
"Number of correctly classified elements = " << num_correct << ", required = " << num_correct_required);
#if DLIB_USE_CUDA
cuda::compute_loss_multiclass_log_per_pixel cuda_compute;
cpu::compute_loss_multiclass_log_per_pixel cpu_compute;
double cuda_loss, cpu_loss;
const tensor& output_tensor = net.subnet().get_output();
resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor);
cuda_compute(y.begin(), output_tensor, cuda_grad, cuda_loss);
cpu_compute(y.begin(), output_tensor, cpu_grad, cpu_loss);
DLIB_TEST(cuda_grad.size() == cpu_grad.size());
for (size_t i = 0; i < cuda_grad.size(); ++i)
{
DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8);
}
const auto err = abs(cuda_loss - cpu_loss) / cpu_loss;
DLIB_TEST_MSG(err < 1e-6, "multiclass log per pixel cuda and cpu losses differ");
#endif
}
// ----------------------------------------------------------------------------------------
......@@ -3317,9 +3356,14 @@ namespace
cpu::compute_loss_multiclass_log_per_pixel_weighted cpu_compute;
double cuda_loss, cpu_loss;
const tensor& output_tensor = net.subnet().get_output();
tensor& grad = net.subnet().get_gradient_input();
cuda_compute(y_weighted.begin(), output_tensor, grad, cuda_loss);
cpu_compute(y_weighted.begin(), output_tensor, grad, cpu_loss);
resizable_tensor cuda_grad(output_tensor), cpu_grad(output_tensor);
cuda_compute(y_weighted.begin(), output_tensor, cuda_grad, cuda_loss);
cpu_compute(y_weighted.begin(), output_tensor, cpu_grad, cpu_loss);
DLIB_TEST(cuda_grad.size() == cpu_grad.size());
for (size_t i = 0; i < cuda_grad.size(); ++i)
{
DLIB_TEST(::std::abs(*(cuda_grad.begin() + i) - *(cpu_grad.begin() + i)) < 1e-8);
}
const auto err = abs(cuda_loss - cpu_loss) / cpu_loss;
DLIB_TEST_MSG(err < 1e-6, "multi class log per pixel weighted cuda and cpu losses differ");
#endif
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment