dlib · Commit e2a2a26a
Authored Nov 11, 2015 by Davis King
Parent: eada4be8

Removed unnecessary zero initialization of parameter gradients in core.h.

Showing 5 changed files with 15 additions and 15 deletions:

    dlib/dnn/core.h             +5 -7
    dlib/dnn/cudnn_dlibapi.cpp  +2 -2
    dlib/dnn/cudnn_dlibapi.h    +2 -2
    dlib/dnn/layers.h           +4 -2
    dlib/dnn/layers_abstract.h  +2 -2
dlib/dnn/core.h
@@ -365,7 +365,6 @@ namespace dlib
             {
                 dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
                 params_grad.copy_size(details.get_layer_params());
-                params_grad = 0;
                 details.backward(get_output(), get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
                 // Don't try to adjust the parameters if this layer doesn't have any.
                 if (params_grad.size() != 0)
@@ -601,7 +600,6 @@ namespace dlib
             {
                 subnet_wrapper wsub(x, grad_final_ignored);
                 params_grad.copy_size(details.get_layer_params());
-                params_grad = 0;
                 details.backward(get_output(), get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
                 // Don't try to adjust the parameters if this layer doesn't have any.
                 if (params_grad.size() != 0)
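The two hunks above simply drop the params_grad = 0; that used to run between copy_size() and details.backward(...). Once layers assign their parameter gradient into params_grad instead of accumulating into it (the change made further down in layers.h and layers_abstract.h), that zero fill is wasted work. A minimal caller-side sketch of the idea, using a plain std::vector stand-in rather than dlib's tensor (toy_layer is hypothetical, purely for illustration):

```cpp
#include <vector>

// Caller-side view of the core.h change: when a layer's backward() assigns
// into params_grad, zero filling it first is wasted work.  toy_layer is a
// hypothetical stand-in, not dlib's interface.
struct toy_layer
{
    void backward(const std::vector<float>& gradient_input,
                  std::vector<float>& params_grad) const
    {
        // Overwrites every element, so the caller's prior contents never matter.
        params_grad.assign(gradient_input.begin(), gradient_input.end());
    }
};

int main()
{
    toy_layer layer;
    std::vector<float> gradient_input = {0.1f, 0.2f};
    std::vector<float> params_grad = {99.0f, 99.0f};   // deliberately not zeroed

    // The old calling sequence first did the equivalent of "params_grad = 0;"
    // (the line removed above); the new sequence just calls backward() directly.
    layer.backward(gradient_input, params_grad);
}
```

Because backward() overwrites every element, whatever was in params_grad beforehand never influences the result.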
@@ -1605,11 +1603,11 @@ namespace dlib
         // Now tell the layer to compute all the gradients.  In the rest of this function
         // we will just be checking that these gradients were computed correctly by
         // comparing them to a central differences approximation.
-        resizable_tensor params_grad, random_noise;
+        resizable_tensor params_grad;
         params_grad.copy_size(l.get_layer_params());
-        random_noise.copy_size(l.get_layer_params());
-        randomize_parameters(random_noise, 5, rnd);
-        params_grad = random_noise;
+        // Set the params grad to something crazy so that it's very obvious if it doesn't
+        // get fully assigned.
+        params_grad = std::numeric_limits<float>::infinity();
         l.backward(output, input_grad, subnetwork, params_grad);
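In the gradient-checking helper, the old code seeded params_grad with random_noise and subtracted the noise back out afterwards (see the next hunk); the new code seeds it with infinity, so any element a layer forgets to assign stays infinite and blows up the relative-error test. A small sketch of that sentinel trick, with a deliberately buggy backward pass (hypothetical names, not dlib code):

```cpp
#include <cmath>
#include <iostream>
#include <limits>
#include <vector>

// A deliberately buggy backward pass that forgets to assign one element.
void buggy_backward(std::vector<float>& params_grad)
{
    params_grad[0] = 0.5f;   // params_grad[1] is never written
}

int main()
{
    // Seed with infinity, as the updated test harness does.
    std::vector<float> params_grad(2, std::numeric_limits<float>::infinity());
    buggy_backward(params_grad);

    for (std::size_t i = 0; i < params_grad.size(); ++i)
    {
        // Any element still holding the sentinel is an obvious failure.
        if (std::isinf(params_grad[i]))
            std::cout << "parameter gradient " << i << " was never assigned\n";
    }
}
```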
@@ -1631,7 +1629,7 @@ namespace dlib
             // Compute a reference derivative via a central differences approximation and
             // compare it to the one output by the layer and make sure they match.
             double reference_derivative = (dot(out2,input_grad)-dot(out3,input_grad))/(2*eps);
-            double output_derivative = params_grad.host()[i]-random_noise.host()[i];
+            double output_derivative = params_grad.host()[i];
             double relative_error = (reference_derivative - output_derivative)/(reference_derivative + 1e-100);
             if (std::abs(relative_error) > 0.01)
             {
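With the noise seed gone, the derivative the layer reports is read straight out of params_grad.host()[i] and compared against a central-differences estimate. The comparison itself is unchanged; here is a self-contained sketch of the same check applied to a scalar function f(x) = x*x, using the 1% relative-error tolerance from the diff:

```cpp
#include <cassert>
#include <cmath>

int main()
{
    // f(x) = x*x, so the true derivative is 2*x.
    auto f = [](double x) { return x * x; };

    const double x = 3.0;
    const double eps = 1e-4;

    // Central differences approximation, same form as the dlib test code.
    const double reference_derivative = (f(x + eps) - f(x - eps)) / (2 * eps);
    // What a correct backward() would report for this toy function.
    const double output_derivative = 2 * x;

    const double relative_error =
        (reference_derivative - output_derivative) / (reference_derivative + 1e-100);
    assert(std::abs(relative_error) <= 0.01);
}
```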
dlib/dnn/cudnn_dlibapi.cpp
@@ -221,7 +221,7 @@ namespace dlib
                 gradient_input.size() > 0,"");

             const float alpha = 1;
-            const float beta = 1;
+            const float beta = 0;
             check(cudnnConvolutionBackwardBias(context(),
                                                &alpha,
                                                descriptor(gradient_input),
@@ -483,7 +483,7 @@ namespace dlib
         )
         {
             const float alpha = 1;
-            const float beta = 1;
+            const float beta = 0;
             check(cudnnConvolutionBackwardFilter_v3(context(),
                                                     &alpha,
                                                     descriptor(data),
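Both cuDNN calls blend their result into the destination roughly as dst = alpha*result + beta*dst, so beta = 1 accumulated into whatever the gradient tensor already held, while beta = 0 overwrites it. Switching to beta = 0 is the cuDNN-side counterpart of the assign-instead-of-add convention. A plain C++ illustration of that blending rule (not an actual cuDNN call):

```cpp
#include <cassert>
#include <vector>

// Illustration of cuDNN-style output blending: dst = alpha*result + beta*dst.
void blend(float alpha, float beta,
           const std::vector<float>& result, std::vector<float>& dst)
{
    for (std::size_t i = 0; i < dst.size(); ++i)
        dst[i] = alpha * result[i] + beta * dst[i];
}

int main()
{
    std::vector<float> result = {1.0f, 2.0f};
    std::vector<float> dst    = {10.0f, 10.0f};   // stale contents

    // beta == 1 would have produced {11, 12}: the stale values accumulate.
    // beta == 0 overwrites them, matching the new convention.
    blend(1.0f, 0.0f, result, dst);
    assert(dst[0] == 1.0f && dst[1] == 2.0f);
}
```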
dlib/dnn/cudnn_dlibapi.h
@@ -128,7 +128,7 @@ namespace dlib
                 - let OUT be the output of add(1,OUT,1,BIAS)
                 - let f(gradient_input,BIAS) == dot(gradient_input,OUT)
                 - Then this function computes the gradient of f() with respect to BIAS and
-                  adds it to grad.
+                  assigns it to grad.
         !*/

     // ------------------------------------------------------------------------------------
@@ -219,7 +219,7 @@ namespace dlib
                 - let OUT be the output of (*this)(OUT,data,filters).
                 - let f(data,filters) == dot(OUT, gradient_input)
                 - This function finds the gradient of f() with respect to filters
-                  and adds this gradient to filters_gradient.
+                  and assigns this gradient to filters_gradient.
         !*/

     private:
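The contract wording changes from "adds it to grad" to "assigns it to grad", matching the beta = 0 change in cudnn_dlibapi.cpp. For a per-channel bias added to OUT, the gradient of dot(gradient_input, OUT) with respect to each bias element is the sum of gradient_input over every position that bias touches. A rough CPU sketch under an assumed NCHW layout (the function and layout here are illustrative, not dlib's API):

```cpp
#include <cassert>
#include <vector>

// Hypothetical NCHW layout: one bias value per channel, gradient assigned
// (not added) into grad.  Not dlib's API, just the math from the contract.
void bias_gradient(const std::vector<float>& gradient_input,
                   long n, long c, long h, long w,
                   std::vector<float>& grad)   // grad has c elements
{
    for (long k = 0; k < c; ++k)
    {
        float sum = 0;
        for (long s = 0; s < n; ++s)
            for (long p = 0; p < h * w; ++p)
                sum += gradient_input[(s * c + k) * h * w + p];
        grad[k] = sum;   // assignment, per the updated wording
    }
}

int main()
{
    const long n = 2, c = 3, h = 4, w = 4;
    std::vector<float> gradient_input(n * c * h * w, 1.0f);
    std::vector<float> grad(c, -1.0f);          // stale values get overwritten

    bias_gradient(gradient_input, n, c, h, w, grad);
    assert(grad[0] == n * h * w);               // each channel sums n*h*w ones
}
```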
dlib/dnn/layers.h
@@ -92,7 +92,7 @@ namespace dlib
         void backward(const tensor& , const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
         {
             // compute the gradient of the parameters.
-            params_grad += trans(mat(sub.get_output()))*mat(gradient_input);
+            params_grad = trans(mat(sub.get_output()))*mat(gradient_input);

             // compute the gradient for the data
             sub.get_gradient_input() += mat(gradient_input)*trans(mat(params));
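For this fully connected layer the parameter gradient is trans(input)*gradient_input and the data gradient is gradient_input*trans(params); the only behavioral change is that the first product is now assigned rather than added. A reduced, hand-expanded version of those two products for one sample with two inputs and one output (toy scalars, not dlib's mat expressions):

```cpp
#include <cassert>

int main()
{
    // One sample, two inputs, one output: out = x0*w0 + x1*w1.
    const float x0 = 2.0f, x1 = 3.0f;    // sub.get_output()
    const float w0 = 0.5f, w1 = -1.0f;   // params
    const float g  = 1.5f;               // gradient_input for the single output

    // params_grad = trans(input) * gradient_input   (assigned, not added)
    const float params_grad_w0 = x0 * g;
    const float params_grad_w1 = x1 * g;

    // sub.get_gradient_input() += gradient_input * trans(params)   (still added)
    float sub_grad_x0 = 0.25f;           // pre-existing contribution from another layer
    float sub_grad_x1 = 0.25f;
    sub_grad_x0 += g * w0;
    sub_grad_x1 += g * w1;

    assert(params_grad_w0 == 3.0f && params_grad_w1 == 4.5f);
    assert(sub_grad_x0 == 1.0f && sub_grad_x1 == -1.25f);
}
```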
@@ -161,7 +161,9 @@ namespace dlib
             for (unsigned long i = 0; i < sub.get_output().size(); ++i)
             {
                 if (in[i] > 0)
-                    out[i] += grad[i];
+                    out[i] = grad[i];
+                else
+                    out[i] = 0;
             }
         }
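Because the loop now assigns into out instead of adding, it has to produce a value for every element, hence the new else branch that writes an explicit zero; skipping it would leave stale data in the positions whose input was not positive. A tiny standalone check of that behavior with toy buffers:

```cpp
#include <cassert>
#include <vector>

int main()
{
    // in: the layer's recorded input, grad: incoming gradient,
    // out: destination buffer deliberately pre-filled with stale values.
    std::vector<float> in   = { 2.0f, -1.0f };
    std::vector<float> grad = { 5.0f,  7.0f };
    std::vector<float> out  = { 9.0f,  9.0f };

    for (std::size_t i = 0; i < in.size(); ++i)
    {
        if (in[i] > 0)
            out[i] = grad[i];
        else
            out[i] = 0;      // without this branch, out[1] would keep its stale 9
    }

    assert(out[0] == 5.0f);
    assert(out[1] == 0.0f);
}
```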
dlib/dnn/layers_abstract.h
@@ -201,8 +201,8 @@ namespace dlib
                   draw inputs from the immediate sub layer, sub.subnet(), or
                   any earlier layer.  So you must consider the gradients with
                   respect to all inputs drawn from sub)
-                Finally, backward() adds these gradients into the output by performing:
-                    - params_grad += PARAMETER_GRADIENT
+                Finally, backward() outputs these gradients by performing:
+                    - params_grad = PARAMETER_GRADIENT
                     - for all valid I:
                         - layer<I>(sub).get_gradient_input() += DATA_GRADIENT_I
         !*/
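The updated contract is the authoritative statement of the change: parameter gradients are assigned (params_grad = PARAMETER_GRADIENT) while data gradients flowing to earlier layers keep using +=. The data side has to accumulate because several layers may draw inputs from the same sub layer, and each must add its contribution rather than overwrite a sibling's. A short sketch of why that += matters (toy buffers, not dlib types):

```cpp
#include <cassert>
#include <vector>

// Two downstream layers both draw input from the same sub layer, so each must
// add its gradient contribution into the shared buffer rather than overwrite it.
void send_gradient_to_sub(const std::vector<float>& contribution,
                          std::vector<float>& sub_gradient_input)
{
    for (std::size_t i = 0; i < contribution.size(); ++i)
        sub_gradient_input[i] += contribution[i];   // += as the contract requires
}

int main()
{
    std::vector<float> sub_gradient_input(2, 0.0f);

    send_gradient_to_sub({1.0f, 2.0f}, sub_gradient_input);  // from layer A
    send_gradient_to_sub({3.0f, 4.0f}, sub_gradient_input);  // from layer B

    // Had the second call assigned instead of added, layer A's contribution
    // would have been lost.
    assert(sub_gradient_input[0] == 4.0f && sub_gradient_input[1] == 6.0f);
}
```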