Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dlib
Commits
49a5d39d
"examples/vscode:/vscode.git/clone" did not exist on "8aac1f99d7af5873db7d23c07fba370d0f5061a6"
Commit
49a5d39d
authored
Apr 04, 2018
by
Davis King
Browse files
Made loss_multiclass_log_per_pixel use CUDA.
parent
6c962dd9
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
139 additions
and
0 deletions
+139
-0
dlib/dnn/cuda_dlib.cu
dlib/dnn/cuda_dlib.cu
+66
-0
dlib/dnn/cuda_dlib.h
dlib/dnn/cuda_dlib.h
+61
-0
dlib/dnn/loss.h
dlib/dnn/loss.h
+12
-0
No files found.
dlib/dnn/cuda_dlib.cu
View file @
49a5d39d
...
...
@@ -3,6 +3,7 @@
#include "cuda_utils.h"
#include "cuda_dlib.h"
#include "cudnn_dlibapi.h"
namespace
dlib
...
...
@@ -1623,6 +1624,71 @@ namespace dlib
}
}
// ----------------------------------------------------------------------------------------
__device__
float
cuda_safe_log
(
float
x
,
float
epsilon
=
1e-10
)
{
// Prevent trying to calculate the logarithm of a very small number (let alone zero)
if
(
x
>=
epsilon
)
return
::
log
(
x
);
else
return
::
log
(
epsilon
);
}
    // Kernel computing the multiclass-log-per-pixel loss and its gradient in one
    // pass over the softmax output `g`.
    //
    // Layout assumptions (standard dlib tensor order: samples x k x nr x nc,
    // flattened row-major):
    //   - n           == total number of elements in g (num_samples*nk*plane_size)
    //   - plane_size  == nr*nc               (one channel's worth of pixels)
    //   - sample_size == nk*plane_size       (one sample's worth of elements)
    //   - nk          == number of classes (channels)
    //   - truth holds one uint16 label per pixel, num_samples planes of
    //     plane_size each (no class dimension).
    //
    // On exit, g is overwritten in place with the scaled gradient and the
    // per-thread loss partial sums are atomically accumulated into *loss_out
    // (which the caller must have zeroed beforehand).  The caller divides the
    // accumulated loss by the pixel count, so only -log(p) is summed here.
    __global__ void _cuda_compute_loss_multiclass_log_per_pixel(
        float* loss_out,
        float* g,
        const uint16_t* truth,
        size_t n,
        size_t plane_size,
        size_t sample_size,
        size_t nk,
        uint16_t label_to_ignore,
        const float scale
    )
    {
        float loss = 0;
        // Grid-stride loop: correct for any launch configuration chosen by
        // max_jobs(), including n not being a multiple of the grid size.
        for (auto i : grid_stride_range(0, n))
        {
            // Decompose the flat index i into:
            //   k   = which class channel this element belongs to
            //   idx = pixel index into the truth planes:
            //         (pixel within the plane) + plane_size * (sample number)
            const size_t k = (i/plane_size)%nk;
            const size_t idx = (i%plane_size) + plane_size*(i/sample_size);
            const size_t y = truth[idx];
            if (k == y)
            {
                // This element is the probability of the correct class:
                // contribute -log(p) to the loss and use the softmax-cross-
                // entropy gradient p-1 for the true class.
                loss -= cuda_safe_log(g[i]);
                g[i] = scale*(g[i]-1);
            }
            else if (y == label_to_ignore)
            {
                // Ignored pixel: contributes nothing to loss or gradient.
                // (k == y can never hold here since label_to_ignore is
                // uint16 max, which exceeds any valid class index < nk.)
                g[i] = 0.f;
            }
            else
            {
                // Wrong-class element: gradient is just the scaled probability.
                g[i] = scale*g[i];
            }
        }
        // Reduce the per-thread partial losses within each warp, then one
        // atomic add per warp into the scalar output — avoids hammering a
        // single address with one atomic per thread.
        warp_reduce_atomic_add(*loss_out, loss);
    }
    // Host-side driver: computes the softmax of the network output, launches the
    // loss/gradient kernel, and reads back the scalar loss.
    //
    //   loss_cuda_work_buffer - device pointer to a single float used to
    //                           accumulate the loss across all kernel threads
    //   truth_buffer          - device pointer to num_samples planes of
    //                           nr*nc uint16 labels (already copied to the GPU
    //                           by operator())
    //   subnetwork_output     - raw network output (one channel per class)
    //   gradient              - on exit, holds the loss gradient (same
    //                           dimensions as subnetwork_output)
    //   loss                  - on exit, the average loss over all pixels in
    //                           the mini-batch
    void compute_loss_multiclass_log_per_pixel::do_work(
        float* loss_cuda_work_buffer,
        const uint16_t* truth_buffer,
        const tensor& subnetwork_output,
        tensor& gradient,
        double& loss
    )
    {
        // The kernel accumulates into this scalar with atomics, so it must
        // start at zero.
        CHECK_CUDA(cudaMemset(loss_cuda_work_buffer, 0, sizeof(float)));
        // gradient temporarily holds the softmax probabilities; the kernel
        // then overwrites it in place with the actual gradient.
        softmax(gradient, subnetwork_output);
        static const uint16_t label_to_ignore = std::numeric_limits<uint16_t>::max();

        // The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
        const double scale = 1.0/(subnetwork_output.num_samples()*subnetwork_output.nr()*subnetwork_output.nc());

        // Kernel arguments, in order: loss scalar, softmax/gradient data, truth
        // labels, total element count n, plane_size (nr*nc), sample_size
        // (nr*nc*k), nk, ignore label, gradient scale.  This ordering must
        // match _cuda_compute_loss_multiclass_log_per_pixel's signature.
        launch_kernel(_cuda_compute_loss_multiclass_log_per_pixel, max_jobs(gradient.size()),
            loss_cuda_work_buffer, gradient.device(), truth_buffer, gradient.size(),
            gradient.nr()*gradient.nc(), gradient.nr()*gradient.nc()*gradient.k(),
            gradient.k(), label_to_ignore, scale);

        // cudaMemcpy here is synchronous, so it also serves as the barrier
        // that guarantees the kernel has finished before we read the result.
        float floss;
        CHECK_CUDA(cudaMemcpy(&floss, loss_cuda_work_buffer, sizeof(float), cudaMemcpyDefault));
        // The kernel sums raw -log(p) terms; apply the averaging scale here.
        loss = scale*floss;
    }
// ----------------------------------------------------------------------------------------
}
...
...
dlib/dnn/cuda_dlib.h
View file @
49a5d39d
...
...
@@ -409,6 +409,67 @@ namespace dlib
size_t
count_k
);
// ----------------------------------------------------------------------------------------
    class compute_loss_multiclass_log_per_pixel
    {
        /*!
            The point of this class is to compute the loss computed by
            loss_multiclass_log_per_pixel, but to do so with CUDA.
        !*/
    public:

        compute_loss_multiclass_log_per_pixel(
        )
        {
            // Shared, reusable device scratch buffer — avoids a cudaMalloc per
            // loss evaluation.
            work = device_global_buffer();
        }

        // Computes the loss and gradient for one mini-batch.
        //   truth             - iterator over num_samples matrix<uint16_t>
        //                       label images, each nr x nc (asserted below)
        //   subnetwork_output - raw network output, one channel per class
        //   gradient          - on exit, the loss gradient
        //   loss              - on exit, the average per-pixel loss
        template <typename const_label_iterator>
        void operator() (
            const_label_iterator truth,
            const tensor& subnetwork_output,
            tensor& gradient,
            double& loss
        ) const
        {
            const size_t bytes_per_plane = subnetwork_output.nr()*subnetwork_output.nc()*sizeof(uint16_t);
            // Allocate a cuda buffer to store all the truth images and also one float
            // for the scalar loss output.
            // Buffer layout: [ float loss | plane 0 | plane 1 | ... ]
            cuda_data_void_ptr buf = work->get(subnetwork_output.num_samples()*bytes_per_plane + sizeof(float));
            cuda_data_void_ptr loss_buf = buf;
            // Advance past the loss float so buf now points at the truth area.
            buf = buf+sizeof(float);

            // copy the truth data into a cuda buffer.
            for (long i = 0; i < subnetwork_output.num_samples(); ++i, ++truth)
            {
                const matrix<uint16_t>& t = *truth;
                // Each label image must match the network output's spatial size.
                DLIB_ASSERT(t.nr() == subnetwork_output.nr());
                DLIB_ASSERT(t.nc() == subnetwork_output.nc());
                // memcpy overload for cuda_data_void_ptr: host-to-device copy
                // of one label plane into its slot in the device buffer.
                memcpy(buf + i*bytes_per_plane, &t(0,0), bytes_per_plane);
            }

            do_work(static_cast<float*>(loss_buf.data()), static_cast<uint16_t*>(buf.data()), subnetwork_output, gradient, loss);
        }

    private:

        // Non-template implementation (defined in cuda_dlib.cu) that zeroes the
        // loss scalar, runs softmax, launches the loss kernel, and copies the
        // resulting loss back to the host.
        static void do_work(
            float* loss_cuda_work_buffer,
            const uint16_t* truth_buffer,
            const tensor& subnetwork_output,
            tensor& gradient,
            double& loss
        );

        // Reusable device memory for the truth planes + loss scalar.
        std::shared_ptr<resizable_cuda_buffer> work;
    };
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
...
...
dlib/dnn/loss.h
View file @
49a5d39d
...
...
@@ -2407,6 +2407,13 @@ namespace dlib
"output size = "
<<
output_tensor
.
nr
()
<<
" x "
<<
output_tensor
.
nc
());
}
#ifdef DLIB_USE_CUDA
double
loss
;
cuda_compute
(
truth
,
output_tensor
,
grad
,
loss
);
return
loss
;
#else
tt
::
softmax
(
grad
,
output_tensor
);
// The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
...
...
@@ -2445,6 +2452,7 @@ namespace dlib
}
}
return
loss
;
#endif
}
friend
void
serialize
(
const
loss_multiclass_log_per_pixel_
&
,
std
::
ostream
&
out
)
...
...
@@ -2478,6 +2486,10 @@ namespace dlib
return
((
sample
*
t
.
k
()
+
k
)
*
t
.
nr
()
+
row
)
*
t
.
nc
()
+
column
;
}
#ifdef DLIB_USE_CUDA
cuda
::
compute_loss_multiclass_log_per_pixel
cuda_compute
;
#endif
};
template
<
typename
SUBNET
>
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment