"git@developer.sourcefind.cn:change/sglang.git" did not exist on "3efbdf68b91e29245e41702b9cbe60aca7cd6351"
Commit cbdeb160 authored by Davis King

Made add() faster by calling my own version for the simple pointwise add case.

parent 30005b7e
@@ -210,6 +210,26 @@ namespace dlib
            launch_kernel(_cuda_affine_transform4,max_jobs(dest.size()),dest.device(), src1.device(), src2.device(), dest.size(), A, B, C);
        }

    // ----------------------------------------------------------------------------------------

        __global__ void _cuda_add_scaled(float* d, const float* s, size_t n, float scale)
        {
            for (auto i : grid_stride_range(0, n))
            {
                d[i] += scale*s[i];
            }
        }

        void add_scaled(
            tensor& dest,
            const float scale,
            const tensor& src
        )
        {
            DLIB_CASSERT(dest.size()==src.size(),"");
            launch_kernel(_cuda_add_scaled,max_jobs(dest.size()),dest.device(), src.device(), dest.size(), scale);
        }

    // ----------------------------------------------------------------------------------------

        __global__ void _cuda_affine_transform5(
...
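The new kernel is a standard grid-stride loop: each thread walks the index range in strides of the total thread count, so a single launch covers a tensor of any size. As a point of reference, here is a minimal standalone CUDA sketch of the same pointwise update written without dlib's grid_stride_range()/launch_kernel() helpers; the kernel name and launch configuration below are illustrative, not part of dlib:

// Illustrative standalone version of the pointwise update d[i] += scale*s[i].
// dlib wraps this same grid-stride pattern in grid_stride_range() and launch_kernel().
__global__ void add_scaled_sketch(float* d, const float* s, size_t n, float scale)
{
    for (size_t i = blockIdx.x*blockDim.x + threadIdx.x; i < n; i += (size_t)gridDim.x*blockDim.x)
        d[i] += scale*s[i];
}

// Example launch: 256 threads per block, enough blocks to cover n (capped at 1024);
// the grid-stride loop picks up whatever a single pass of the grid doesn't reach.
//   size_t blocks = std::min<size_t>((n + 255)/256, 1024);
//   add_scaled_sketch<<<blocks, 256>>>(d, s, n, scale);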
@@ -65,6 +65,13 @@ namespace dlib
            const float D
        );

        // Note that this function isn't in the tt:: namespace because add_scaled() is
        // called by cuda::add() so we don't need a tt:: version of add_scaled().
        void add_scaled(
            tensor& dest,
            const float scale,
            const tensor& src
        );

    // -----------------------------------------------------------------------------------
...
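Although add_scaled() stays out of the tt:: namespace, it is still an ordinary function in dlib::cuda, so CUDA-enabled code can call it directly on any two tensors of equal size. A small usage sketch, assuming a CUDA build of dlib; the variable names and values are made up for illustration:

#include <dlib/dnn.h>

// Hypothetical call site: accumulate a scaled update into params on the GPU.
// Requires params.size() == grad.size(); the tensors are synced to the device
// when their .device() pointers are taken inside add_scaled().
dlib::resizable_tensor params, grad;
params.set_size(1, 1, 10, 10);
grad.copy_size(params);
params = 1;     // fill with 1.0f
grad = 0.5;     // fill with 0.5f
dlib::cuda::add_scaled(params, -0.01f, grad);   // params[i] += -0.01f*grad[i]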
@@ -12,6 +12,7 @@
#include <string>
#include "cuda_utils.h"
#include "cpu_dlib.h"
#include "cuda_dlib.h"

static const char* cudnn_get_error_string(cudnnStatus_t s)
{
@@ -213,6 +214,14 @@ namespace dlib
                <<"\n\t src.nc(): " << src.nc()
                );

            if (dest.size() == src.size() && beta == 1)
            {
                // Call the dlib function in this case since it's faster than the one that
                // comes with cuDNN (at least as of cuDNN v4).
                add_scaled(dest, alpha, src);
                return;
            }

            CHECK_CUDNN(cudnnAddTensor_v3(context(),
                &alpha,
                descriptor(src),
...
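For context, this add() implements dest = beta*dest + alpha*src, where src is broadcast over dest when their dimensions differ. The new branch only takes the dlib kernel in the plain pointwise case (same total size and beta == 1), which reduces to dest[i] += alpha*src[i]; every other combination still goes through cudnnAddTensor. A rough host-side sketch of that dispatch over raw float buffers (a hypothetical helper, not dlib's API):

#include <cstddef>

// Reference semantics of the dispatch above, written against raw float arrays.
// The broadcasting branch is left abstract here; in dlib it is handled by cuDNN.
void add_reference(float beta, float* dest, std::size_t dest_n,
                   float alpha, const float* src, std::size_t src_n)
{
    if (dest_n == src_n && beta == 1)
    {
        // Fast path taken by the new code: a simple pointwise accumulate,
        // exactly what _cuda_add_scaled() does on the GPU.
        for (std::size_t i = 0; i < dest_n; ++i)
            dest[i] += alpha*src[i];
    }
    else
    {
        // General case: dest = beta*dest + alpha*broadcast(src).
        // dlib delegates this to cudnnAddTensor().
    }
}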