OpenDAS / dlib · Commits

Commit 669a1e17, authored Jan 23, 2016 by Davis King

Added affine_transform_conv() and multiply_conv() as well as a CPU
implementation of assign_conv_bias_gradient().

Parent: e44b2aa2
Showing 6 changed files with 328 additions and 3 deletions.
dlib/dnn/cpu_dlib.cpp      +124  -1
dlib/dnn/cpu_dlib.h        +20   -0
dlib/dnn/cuda_dlib.cu      +88   -0
dlib/dnn/cuda_dlib.h       +15   -0
dlib/dnn/tensor_tools.cpp  +30   -2
dlib/dnn/tensor_tools.h    +51   -0
dlib/dnn/cpu_dlib.cpp
@@ -54,6 +54,57 @@ namespace dlib
    }
}

void multiply_conv (
    tensor& dest,
    const tensor& src1,
    const tensor& src2
)
{
    auto d = dest.host();
    auto s1 = src1.host();
    auto s2 = src2.host();
    if (have_same_dimensions(dest,src1))
    {
        DLIB_CASSERT(src2.num_samples() == 1 && src2.nr() == 1 &&
                     src2.nc() == 1 && src2.k() == src1.k(), "");
        for (long n = 0; n < dest.num_samples(); ++n)
        {
            for (long k = 0; k < dest.k(); ++k)
            {
                for (long r = 0; r < dest.nr(); ++r)
                {
                    for (long c = 0; c < dest.nc(); ++c)
                    {
                        *d++ = (*s1++)*s2[k];
                    }
                }
            }
        }
    }
    else
    {
        DLIB_CASSERT(have_same_dimensions(src1,src2), "");
        DLIB_CASSERT(dest.num_samples() == 1 && dest.nr() == 1 &&
                     dest.nc() == 1 && dest.k() == src1.k(), "");
        for (long k = 0; k < src1.k(); ++k)
            d[k] = 0;
        for (long n = 0; n < src1.num_samples(); ++n)
        {
            for (long k = 0; k < src1.k(); ++k)
            {
                for (long r = 0; r < src1.nr(); ++r)
                {
                    for (long c = 0; c < src1.nc(); ++c)
                    {
                        d[k] += (*s1++)*(*s2++);
                    }
                }
            }
        }
    }
}
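Both branches walk src1 in row-major (n,k,r,c) order, so the raw pointer increments visit each element exactly once. A minimal standalone sketch of the first branch's per-channel scaling, using plain arrays instead of dlib::tensor (the layout is assumed, not taken from this commit):

#include <cassert>

int main()
{
    // 1 sample, 2 channels, 1x2 image planes, laid out as (n,k,r,c).
    const float src1[] = {1, 2, 3, 4};  // channel 0: {1,2}, channel 1: {3,4}
    const float src2[] = {10, 100};     // one scale factor per channel
    float dest[4];
    float* d = dest;
    const float* s1 = src1;
    for (long k = 0; k < 2; ++k)        // channels
        for (long c = 0; c < 2; ++c)    // columns within the plane
            *d++ = (*s1++)*src2[k];
    assert(dest[0] == 10 && dest[1] == 20 && dest[2] == 300 && dest[3] == 400);
    return 0;
}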
void add (
    float beta,
    tensor& dest,
    ...
@@ -196,6 +247,44 @@ namespace dlib
    }
}

// ------------------------------------------------------------------------------------

void assign_conv_bias_gradient (
    tensor& grad,
    const tensor& gradient_input
)
{
    DLIB_CASSERT(
        grad.num_samples() == 1 &&
        grad.k() >= 1 &&
        grad.nr() == 1 &&
        grad.nc() == 1 &&
        gradient_input.k() == grad.k() &&
        gradient_input.size() > 0 &&
        is_same_object(grad, gradient_input) == false,
        "");
    auto g = grad.host();
    auto gi = gradient_input.host();
    for (long k = 0; k < gradient_input.k(); ++k)
        g[k] = 0;
    for (long n = 0; n < gradient_input.num_samples(); ++n)
    {
        for (long k = 0; k < gradient_input.k(); ++k)
        {
            for (long r = 0; r < gradient_input.nr(); ++r)
            {
                for (long c = 0; c < gradient_input.nc(); ++c)
                {
                    g[k] += (*gi++);
                }
            }
        }
    }
}
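In effect grad(k) ends up holding the sum of gradient_input(n,k,r,c) over every sample and spatial position, i.e. the gradient of a per-channel bias term. A tiny plain-array sketch of that reduction (same assumed layout as above):

#include <cassert>

int main()
{
    // 2 samples, 2 channels, 1x2 image planes, laid out as (n,k,r,c).
    const float gi[] = {1, 2,  3, 4,    // sample 0: channel 0, channel 1
                        5, 6,  7, 8};   // sample 1: channel 0, channel 1
    float g[2] = {0, 0};
    const float* p = gi;
    for (long n = 0; n < 2; ++n)
        for (long k = 0; k < 2; ++k)
            for (long c = 0; c < 2; ++c)
                g[k] += *p++;
    assert(g[0] == 1+2+5+6);  // == 14
    assert(g[1] == 3+4+7+8);  // == 22
    return 0;
}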
// -----------------------------------------------------------------------------------

void affine_transform (
    ...
@@ -293,6 +382,41 @@ namespace dlib
    }
}

// -----------------------------------------------------------------------------------

void affine_transform_conv (
    tensor& dest,
    const tensor& src,
    const tensor& A,
    const tensor& B
)
{
    DLIB_CASSERT(have_same_dimensions(dest, src), "");
    DLIB_CASSERT(have_same_dimensions(A, B), "");
    DLIB_CASSERT(A.num_samples() == 1 && A.nr() == 1 &&
                 A.nc() == 1 && A.k() == src.k(), "");
    auto d = dest.host();
    auto s = src.host();
    const auto a = A.host();
    const auto b = B.host();
    for (long n = 0; n < dest.num_samples(); ++n)
    {
        for (long k = 0; k < dest.k(); ++k)
        {
            for (long r = 0; r < dest.nr(); ++r)
            {
                for (long c = 0; c < dest.nc(); ++c)
                {
                    *d++ = a[k]*(*s++) + b[k];
                }
            }
        }
    }
}
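affine_transform_conv() applies one scale/offset pair per channel, which is exactly the shape of a batch-norm folded for inference. As a hedged illustration of one plausible use (the gamma/beta/mean/var folding shown is the standard formula, not code from this commit):

#include <cmath>

// Fold batch-norm statistics into per-channel A and B so that
// A[k]*x + B[k] == gamma[k]*(x - mean[k])/std::sqrt(var[k] + eps) + beta[k].
void fold_bn_into_affine(
    const float* gamma, const float* beta,
    const float* mean, const float* var,
    float eps, long num_channels,
    float* A, float* B)
{
    for (long k = 0; k < num_channels; ++k)
    {
        A[k] = gamma[k]/std::sqrt(var[k] + eps);
        B[k] = beta[k] - A[k]*mean[k];
    }
}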
// -----------------------------------------------------------------------------------

void batch_normalize_inference (
    ...
@@ -1238,7 +1362,6 @@ namespace dlib
}
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
...

dlib/dnn/cpu_dlib.h
@@ -25,6 +25,12 @@ namespace dlib
    const tensor& src2
);

void multiply_conv (
    tensor& dest,
    const tensor& src1,
    const tensor& src2
);

void add (
    float beta,
    tensor& dest,
    ...
@@ -43,6 +49,11 @@ namespace dlib
    const tensor& src2
);

void assign_conv_bias_gradient (
    tensor& grad,
    const tensor& gradient_input
);

// -----------------------------------------------------------------------------------

void affine_transform (
    ...
@@ -81,6 +92,15 @@ namespace dlib
    const tensor& B
);

// -----------------------------------------------------------------------------------

void affine_transform_conv (
    tensor& dest,
    const tensor& src,
    const tensor& A,
    const tensor& B
);

// -----------------------------------------------------------------------------------

void batch_normalize_inference (
    ...

dlib/dnn/cuda_dlib.cu
@@ -94,6 +94,68 @@ namespace dlib
    }
}

// ------------------------------------------------------------------------------------

__global__ void _cuda_multiply_conv(
    float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks
)
{
    for (auto i : grid_stride_range(0, n))
    {
        auto k = (i/bs)%ks;
        d[i] = s1[i]*s2[k];
    }
}
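The flat index arithmetic mirrors a row-major (n,k,r,c) tensor layout: with bs = nr()*nc() elements per image plane and ks = k() channels, element i belongs to channel (i/bs)%ks. A small host-side check of that mapping (the layout is assumed, not part of the commit):

#include <cassert>
#include <cstddef>

int main()
{
    const std::size_t ns = 2, ks = 3, nr = 4, nc = 5;
    const std::size_t bs = nr*nc;  // elements per image plane
    std::size_t i = 0;
    for (std::size_t n = 0; n < ns; ++n)
        for (std::size_t k = 0; k < ks; ++k)
            for (std::size_t r = 0; r < nr; ++r)
                for (std::size_t c = 0; c < nc; ++c, ++i)
                    assert((i/bs)%ks == k);  // channel recovered from flat offset
    return 0;
}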
__global__ void _cuda_multiply_conv2(
    float* d, const float* s1, size_t n, const float* s2, size_t bs, size_t ks
)
{
    // zero initialize d before we begin.
    for (auto i : grid_stride_range(0, ks))
        d[i] = 0;
    __syncthreads();
    // loop over all the image planes
    for (auto i : grid_stride_range_y(0, n))
    {
        // sum all the elements in the i-th image plane
        float temp = 0;
        for (auto j : grid_stride_range(i*bs, (i+1)*bs))
            temp += s1[j]*s2[j];
        auto k = i%ks;
        // and store the sum into d[k]
        warp_reduce_atomic_add(d[k], temp);
    }
}
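warp_reduce_atomic_add() itself is not shown in this diff; presumably it sums temp across the 32 x-threads of a warp and has a single lane issue one atomicAdd, which is why the launcher below insists on an x block size of 32. A hypothetical sketch of such a helper (modern shuffle intrinsics, not dlib's actual code):

// Hypothetical warp reduction + atomic add, assuming all 32 x-threads
// of a warp call it with their partial sums.
__device__ void warp_reduce_atomic_add_sketch(float& out, float val)
{
    for (int offset = 16; offset > 0; offset /= 2)
        val += __shfl_down_sync(0xffffffff, val, offset);
    if (threadIdx.x % 32 == 0)
        atomicAdd(&out, val);
}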
void multiply_conv (
    tensor& dest,
    const tensor& src1,
    const tensor& src2
)
{
    if (have_same_dimensions(dest,src1))
    {
        DLIB_CASSERT(src2.num_samples() == 1 && src2.nr() == 1 &&
                     src2.nc() == 1 && src2.k() == src1.k(), "");
        if (dest.size() == 0)
            return;
        launch_kernel(_cuda_multiply_conv, max_jobs(dest.size()),
                      dest.device(), src1.device(), src1.size(),
                      src2.device(), src1.nr()*src1.nc(), src1.k());
    }
    else
    {
        DLIB_CASSERT(have_same_dimensions(src1,src2), "");
        DLIB_CASSERT(dest.num_samples() == 1 && dest.nr() == 1 &&
                     dest.nc() == 1 && dest.k() == src1.k(), "");
        if (dest.size() == 0)
            return;

        dim3 blocks(10,1);
        // x size must be 32 because we are using warp_reduce_atomic_add() in the kernel.
        dim3 threads(32,32);
        _cuda_multiply_conv2<<<blocks,threads>>>(
            dest.device(), src1.device(), src1.num_samples()*src1.k(),
            src2.device(), src1.nr()*src1.nc(), src1.k());
    }
}
// ------------------------------------------------------------------------------------
__global__ void _cuda_add1(float* d, const float* s1, const float* s2, size_t n)
    ...
@@ -302,6 +364,32 @@ namespace dlib
    }
}

// -----------------------------------------------------------------------------------

__global__ void _cuda_affine_transform_conv(
    float* d, const float* s, size_t n,
    const float* A, const float* B, size_t bs, size_t ks
)
{
    for (auto i : grid_stride_range(0, n))
    {
        auto k = (i/bs)%ks;
        d[i] = A[k]*s[i] + B[k];
    }
}
void affine_transform_conv (
    tensor& dest,
    const tensor& src,
    const tensor& A,
    const tensor& B
)
{
    DLIB_CASSERT(have_same_dimensions(dest, src), "");
    DLIB_CASSERT(have_same_dimensions(A, B), "");
    DLIB_CASSERT(A.num_samples() == 1 && A.nr() == 1 &&
                 A.nc() == 1 && A.k() == src.k(), "");
    launch_kernel(_cuda_affine_transform_conv, max_jobs(dest.size()),
                  dest.device(), src.device(), src.size(),
                  A.device(), B.device(), src.nr()*src.nc(), src.k());
}
// -----------------------------------------------------------------------------------
__global__ void _add_bias_gradient(float* out, const float* in, size_t n, size_t total_n)
    ...
dlib/dnn/cuda_dlib.h
@@ -30,6 +30,12 @@ namespace dlib
    const tensor& src2
);

void multiply_conv (
    tensor& dest,
    const tensor& src1,
    const tensor& src2
);

void add (
    tensor& dest,
    const tensor& src1,
    ...
@@ -82,6 +88,15 @@ namespace dlib
    const tensor& B
);

// -----------------------------------------------------------------------------------

void affine_transform_conv (
    tensor& dest,
    const tensor& src,
    const tensor& A,
    const tensor& B
);

// -----------------------------------------------------------------------------------

void assign_bias_gradient (
    ...

dlib/dnn/tensor_tools.cpp
@@ -115,6 +115,19 @@ namespace dlib { namespace tt
}

void multiply_conv (
    tensor& dest,
    const tensor& src1,
    const tensor& src2
)
{
#ifdef DLIB_USE_CUDA
    cuda::multiply_conv(dest, src1, src2);
#else
    cpu::multiply_conv(dest, src1, src2);
#endif
}
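The tt:: wrappers keep the CUDA/CPU split behind a single compile-time switch, so callers never name cuda:: or cpu:: directly. A hedged usage sketch with dlib::resizable_tensor (shapes chosen arbitrarily, assuming the three-argument signature introduced by this commit):

#include <dlib/dnn.h>

int main()
{
    dlib::resizable_tensor dest(2,3,4,4), src1(2,3,4,4), src2(1,3,1,1);
    src1 = 1;  // fill every element with 1
    src2 = 2;  // per-channel scale of 2
    dlib::tt::multiply_conv(dest, src1, src2);
    // first mode: dest has src1's dimensions, so dest(n,k,r,c) == 1*2 == 2
    return 0;
}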
// ----------------------------------------------------------------------------------------

void affine_transform (
    ...
@@ -181,6 +194,22 @@ namespace dlib { namespace tt
#endif
}

// ----------------------------------------------------------------------------------------

void affine_transform_conv (
    tensor& dest,
    const tensor& src,
    const tensor& A,
    const tensor& B
)
{
#ifdef DLIB_USE_CUDA
    cuda::affine_transform_conv(dest, src, A, B);
#else
    cpu::affine_transform_conv(dest, src, A, B);
#endif
}
// ----------------------------------------------------------------------------------------

void batch_normalize_inference (
    ...
@@ -362,8 +391,7 @@ namespace dlib { namespace tt
 #ifdef DLIB_USE_CUDA
     cuda::assign_conv_bias_gradient(grad, gradient_input);
 #else
-    // TODO
-    DLIB_CASSERT(false, "");
+    cpu::assign_conv_bias_gradient(grad, gradient_input);
 #endif
 }
...
dlib/dnn/tensor_tools.h
@@ -118,6 +118,34 @@ namespace dlib { namespace tt
      with num_samples()==1 which is then assigned to #dest.
!*/

void multiply_conv (
    tensor& dest,
    const tensor& src1,
    const tensor& src2
);
/*!
    requires
        - if (have_same_dimensions(dest, src1) == true) then
            - src2.num_samples() == 1
            - src2.nr() == 1
            - src2.nc() == 1
            - src2.k() == src1.k()
        - else
            - have_same_dimensions(src1, src2) == true
            - dest.num_samples() == 1
            - dest.nr() == 1
            - dest.nc() == 1
            - dest.k() == src1.k()
    ensures
        - Performs #dest == src1*src2
          In particular, if the elements of dest, src1, and src2 were indexed by
          (n,k,r,c) then we would have:
            - if (have_same_dimensions(dest,src1)) then
                - #dest(n,k,r,c) == src1(n,k,r,c)*src2(k)
            - else
                - #dest(k) == sum over {n,r,c} of src1(n,k,r,c)*src2(n,k,r,c)
!*/
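A hedged sketch of the second mode, where dest acts as a per-channel accumulator (shapes arbitrary, same assumptions as the earlier usage sketch):

#include <dlib/dnn.h>

int main()
{
    dlib::resizable_tensor src1(2,3,4,4), src2(2,3,4,4), dest(1,3,1,1);
    src1 = 2;
    src2 = 3;
    dlib::tt::multiply_conv(dest, src1, src2);
    // second mode: dest(k) == sum over {n,r,c} of 2*3 == 2*4*4*6 == 192
    return 0;
}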
// ----------------------------------------------------------------------------------------

void affine_transform (
    ...
@@ -196,6 +224,29 @@ namespace dlib { namespace tt
        - #dest.host()[i] == A.host()[i]*src.host()[i] + B.host()[i]
!*/

// ----------------------------------------------------------------------------------------

void affine_transform_conv (
    tensor& dest,
    const tensor& src,
    const tensor& A,
    const tensor& B
);
/*!
    requires
        - have_same_dimensions(dest,src) == true
        - have_same_dimensions(A, B) == true
        - A.num_samples() == 1
        - A.nr() == 1
        - A.nc() == 1
        - A.k() == src.k()
    ensures
        - Performs #dest == A*src + B
          In particular, if the elements of dest and src were indexed by (n,k,r,c)
          then we would have:
            #dest(n,k,r,c) == A(k)*src(n,k,r,c) + B(k)
!*/
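And a matching hedged usage sketch for affine_transform_conv() (same assumptions as the earlier tensor sketches):

#include <dlib/dnn.h>

int main()
{
    dlib::resizable_tensor dest(2,3,5,5), src(2,3,5,5), A(1,3,1,1), B(1,3,1,1);
    src = 1;
    A = 2;
    B = 0.5;
    dlib::tt::affine_transform_conv(dest, src, A, B);
    // every element of dest is now 2*1 + 0.5 == 2.5
    return 0;
}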
// ----------------------------------------------------------------------------------------

void batch_normalize_inference (
    ...