Made spatially_filter_image_separable() use SIMD instructions when filtering

float data.

Made spatially_filter_image_separable() use SIMD instructions when filtering
float data.
306c9c5b · Davis King · 9b9ffcba · 306c9c5b · 306c9c5b
Commit 306c9c5b authored Nov 10, 2013 by Davis King
Showing with 310 additions and 82 deletions

dlib/image_transforms/spatial_filtering.h dlib/image_transforms/spatial_filtering.h +307 -82

dlib/image_transforms/spatial_filtering_abstract.h dlib/image_transforms/spatial_filtering_abstract.h +3 -0

No files found.
--- a/dlib/image_transforms/spatial_filtering.h
+++ b/dlib/image_transforms/spatial_filtering.h
@@ -10,6 +10,7 @@
 #include "../array2d.h"
 #include "../matrix.h"
 #include "../geometry/border_enumerator.h"
+#include "../simd.h"
 #include <limits>

 namespace dlib
@@ -208,119 +209,277 @@ namespace dlib

 // ----------------------------------------------------------------------------------------

-    template <
-        typename in_image_type,
-        typename out_image_type,
-        typename EXP1,
-        typename EXP2,
-        typename T
-        >
-    typename enable_if_c<pixel_traits<typename out_image_type::type>::grayscale,rectangle>::type 
-    spatially_filter_image_separable (
-        const in_image_type& in_img,
-        out_image_type& out_img,
-        const matrix_exp<EXP1>& row_filter,
-        const matrix_exp<EXP2>& col_filter,
-        T scale,
-        bool use_abs = false,
-        bool add_to = false
-    )
+    namespace impl
    {
-        COMPILE_TIME_ASSERT( pixel_traits<typename in_image_type::type>::has_alpha == false );
-        COMPILE_TIME_ASSERT( pixel_traits<typename out_image_type::type>::has_alpha == false );
-
-        DLIB_ASSERT(scale != 0 && row_filter.size() != 0 && col_filter.size() != 0 &&
-                    is_vector(row_filter) &&
-                    is_vector(col_filter),
-            "\tvoid spatially_filter_image_separable()"
-            << "\n\t Invalid inputs were given to this function."
-            << "\n\t scale: "<< scale
-            << "\n\t row_filter.size(): "<< row_filter.size()
-            << "\n\t col_filter.size(): "<< col_filter.size()
-            << "\n\t is_vector(row_filter): "<< is_vector(row_filter)
-            << "\n\t is_vector(col_filter): "<< is_vector(col_filter)
+        template <
+            typename in_image_type,
+            typename out_image_type,
+            typename EXP1,
+            typename EXP2,
+            typename T
+            >
+        rectangle grayscale_spatially_filter_image_separable (
+            const in_image_type& in_img,
+            out_image_type& out_img,
+            const matrix_exp<EXP1>& _row_filter,
+            const matrix_exp<EXP2>& _col_filter,
+            T scale,
+            bool use_abs,
+            bool add_to 
+        )
+        {
+            const_temp_matrix<EXP1> row_filter(_row_filter);
+            const_temp_matrix<EXP2> col_filter(_col_filter);
+            COMPILE_TIME_ASSERT( pixel_traits<typename in_image_type::type>::has_alpha == false );
+            COMPILE_TIME_ASSERT( pixel_traits<typename out_image_type::type>::has_alpha == false );
+
+            DLIB_ASSERT(scale != 0 && row_filter.size() != 0 && col_filter.size() != 0 &&
+                is_vector(row_filter) &&
+                is_vector(col_filter),
+                "\tvoid spatially_filter_image_separable()"
+                << "\n\t Invalid inputs were given to this function."
+                << "\n\t scale: "<< scale
+                << "\n\t row_filter.size(): "<< row_filter.size()
+                << "\n\t col_filter.size(): "<< col_filter.size()
+                << "\n\t is_vector(row_filter): "<< is_vector(row_filter)
+                << "\n\t is_vector(col_filter): "<< is_vector(col_filter)
            );
-        DLIB_ASSERT(is_same_object(in_img, out_img) == false,
-            "\tvoid spatially_filter_image_separable()"
-            << "\n\tYou must give two different image objects"
+            DLIB_ASSERT(is_same_object(in_img, out_img) == false,
+                "\tvoid spatially_filter_image_separable()"
+                << "\n\tYou must give two different image objects"
            );



-        // if there isn't any input image then don't do anything
-        if (in_img.size() == 0)
-        {
-            out_img.clear();
-            return rectangle();
-        }
+            // if there isn't any input image then don't do anything
+            if (in_img.size() == 0)
+            {
+                out_img.clear();
+                return rectangle();
+            }

-        out_img.set_size(in_img.nr(),in_img.nc());
+            out_img.set_size(in_img.nr(),in_img.nc());


-        // figure out the range that we should apply the filter to
-        const long first_row = (col_filter.size()-1)/2;
-        const long first_col = (row_filter.size()-1)/2;
-        const long last_row = in_img.nr() - (col_filter.size()/2);
-        const long last_col = in_img.nc() - (row_filter.size()/2);
+            // figure out the range that we should apply the filter to
+            const long first_row = (col_filter.size()-1)/2;
+            const long first_col = (row_filter.size()-1)/2;
+            const long last_row = in_img.nr() - (col_filter.size()/2);
+            const long last_col = in_img.nc() - (row_filter.size()/2);

-        const rectangle non_border = rectangle(first_col, first_row, last_col-1, last_row-1);
-        if (!add_to)
-            zero_border_pixels(out_img, non_border); 
+            const rectangle non_border = rectangle(first_col, first_row, last_col-1, last_row-1);
+            if (!add_to)
+                zero_border_pixels(out_img, non_border); 

-        typedef typename out_image_type::mem_manager_type mem_manager_type;
-        typedef typename EXP1::type ptype;
+            typedef typename out_image_type::mem_manager_type mem_manager_type;
+            typedef typename EXP1::type ptype;

-        array2d<ptype,mem_manager_type> temp_img;
-        temp_img.set_size(in_img.nr(), in_img.nc());
+            array2d<ptype,mem_manager_type> temp_img;
+            temp_img.set_size(in_img.nr(), in_img.nc());

-        // apply the row filter
-        for (long r = 0; r < in_img.nr(); ++r)
-        {
-            for (long c = first_col; c < last_col; ++c)
+            // apply the row filter
+            for (long r = 0; r < in_img.nr(); ++r)
            {
-                ptype p;
-                ptype temp = 0;
-                for (long n = 0; n < row_filter.size(); ++n)
+                for (long c = first_col; c < last_col; ++c)
                {
-                    // pull out the current pixel and put it into p
-                    p = get_pixel_intensity(in_img[r][c-first_col+n]);
-                    temp += p*row_filter(n);
+                    ptype p;
+                    ptype temp = 0;
+                    for (long n = 0; n < row_filter.size(); ++n)
+                    {
+                        // pull out the current pixel and put it into p
+                        p = get_pixel_intensity(in_img[r][c-first_col+n]);
+                        temp += p*row_filter(n);
+                    }
+                    temp_img[r][c] = temp;
                }
-                temp_img[r][c] = temp;
            }
-        }

-        // apply the column filter 
-        for (long r = first_row; r < last_row; ++r)
-        {
-            for (long c = first_col; c < last_col; ++c)
+            // apply the column filter 
+            for (long r = first_row; r < last_row; ++r)
            {
-                ptype temp = 0;
-                for (long m = 0; m < col_filter.size(); ++m)
+                for (long c = first_col; c < last_col; ++c)
                {
-                    temp += temp_img[r-first_row+m][c]*col_filter(m);
+                    ptype temp = 0;
+                    for (long m = 0; m < col_filter.size(); ++m)
+                    {
+                        temp += temp_img[r-first_row+m][c]*col_filter(m);
+                    }
+
+                    temp /= scale;
+
+                    if (use_abs && temp < 0)
+                    {
+                        temp = -temp;
+                    }
+
+                    // save this pixel to the output image
+                    if (add_to == false)
+                    {
+                        assign_pixel(out_img[r][c], temp);
+                    }
+                    else
+                    {
+                        assign_pixel(out_img[r][c], temp + out_img[r][c]);
+                    }
                }
+            }
+            return non_border;
+        }

-                temp /= scale;
+    // ------------------------------------------------------------------------------------
+
+        // This overload is optimized to use SIMD instructions when filtering float images with
+        // float filters.
+        template <
+            typename in_image_type,
+            typename out_image_type,
+            typename EXP1,
+            typename EXP2
+            >
+        rectangle float_spatially_filter_image_separable (
+            const in_image_type& in_img,
+            out_image_type& out_img,
+            const matrix_exp<EXP1>& _row_filter,
+            const matrix_exp<EXP2>& _col_filter,
+            bool add_to 
+        )
+        {
+            const_temp_matrix<EXP1> row_filter(_row_filter);
+            const_temp_matrix<EXP2> col_filter(_col_filter);
+            DLIB_ASSERT(row_filter.size() != 0 && col_filter.size() != 0 &&
+                is_vector(row_filter) &&
+                is_vector(col_filter),
+                "\tvoid spatially_filter_image_separable()"
+                << "\n\t Invalid inputs were given to this function."
+                << "\n\t row_filter.size(): "<< row_filter.size()
+                << "\n\t col_filter.size(): "<< col_filter.size()
+                << "\n\t is_vector(row_filter): "<< is_vector(row_filter)
+                << "\n\t is_vector(col_filter): "<< is_vector(col_filter)
+            );
+            DLIB_ASSERT(is_same_object(in_img, out_img) == false,
+                "\tvoid spatially_filter_image_separable()"
+                << "\n\tYou must give two different image objects"
+            );

-                if (use_abs && temp < 0)
+
+
+            // if there isn't any input image then don't do anything
+            if (in_img.size() == 0)
+            {
+                out_img.clear();
+                return rectangle();
+            }
+
+            out_img.set_size(in_img.nr(),in_img.nc());
+
+            // figure out the range that we should apply the filter to
+            const long first_row = (col_filter.size()-1)/2;
+            const long first_col = (row_filter.size()-1)/2;
+            const long last_row = in_img.nr() - (col_filter.size()/2);
+            const long last_col = in_img.nc() - (row_filter.size()/2);
+
+            const rectangle non_border = rectangle(first_col, first_row, last_col-1, last_row-1);
+            if (!add_to)
+                zero_border_pixels(out_img, non_border); 
+
+            typedef typename in_image_type::mem_manager_type mem_manager_type;
+
+            array2d<float,mem_manager_type> temp_img;
+            temp_img.set_size(in_img.nr(), in_img.nc());
+
+            // apply the row filter
+            for (long r = 0; r < in_img.nr(); ++r)
+            {
+                long c = first_col;
+                for (; c < last_col-3; c+=4)
                {
-                    temp = -temp;
+                    simd4f p, temp = 0;
+                    for (long n = 0; n < row_filter.size(); ++n)
+                    {
+                        // pull out the current pixel and put it into p
+                        p.load(&in_img[r][c-first_col+n]);
+                        temp += p*row_filter(n);
+                    }
+                    temp.store(&temp_img[r][c]);
                }
+                for (; c < last_col; ++c)
+                {
+                    float p;
+                    float temp = 0;
+                    for (long n = 0; n < row_filter.size(); ++n)
+                    {
+                        // pull out the current pixel and put it into p
+                        p = in_img[r][c-first_col+n];
+                        temp += p*row_filter(n);
+                    }
+                    temp_img[r][c] = temp;
+                }
+            }

-                // save this pixel to the output image
-                if (add_to == false)
+            // apply the column filter 
+            for (long r = first_row; r < last_row; ++r)
+            {
+                long c = first_col;
+                for (; c < last_col-3; c+=4)
                {
-                    assign_pixel(out_img[r][c], temp);
+                    simd4f p, temp = 0;
+                    for (long m = 0; m < col_filter.size(); ++m)
+                    {
+                        p.load(&temp_img[r-first_row+m][c]);
+                        temp += p*col_filter(m);
+                    }
+
+                    // save this pixel to the output image
+                    if (add_to == false)
+                    {
+                        temp.store(&out_img[r][c]);
+                    }
+                    else
+                    {
+                        p.load(&out_img[r][c]);
+                        temp += p;
+                        temp.store(&out_img[r][c]);
+                    }
                }
-                else
+                for (; c < last_col; ++c)
                {
-                    assign_pixel(out_img[r][c], temp + out_img[r][c]);
+                    float temp = 0;
+                    for (long m = 0; m < col_filter.size(); ++m)
+                    {
+                        temp += temp_img[r-first_row+m][c]*col_filter(m);
+                    }
+
+                    // save this pixel to the output image
+                    if (add_to == false)
+                    {
+                        out_img[r][c] = temp;
+                    }
+                    else
+                    {
+                        out_img[r][c] += temp;
+                    }
                }
            }
+            return non_border;
        }
-        return non_border;
-    }
+
+    } // namespace impl
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename in_image_type,
+        typename out_image_type,
+        typename EXP1,
+        typename EXP2
+        >
+    struct is_float_filtering
+    {
+        const static bool value = is_same_type<typename in_image_type::type,float>::value &&
+                                  is_same_type<typename out_image_type::type,float>::value &&
+                                  is_same_type<typename EXP1::type,float>::value &&
+                                  is_same_type<typename EXP2::type,float>::value;
+    };

 // ----------------------------------------------------------------------------------------

@@ -331,15 +490,77 @@ namespace dlib
        typename EXP2,
        typename T
        >
-    typename disable_if_c<pixel_traits<typename out_image_type::type>::grayscale,rectangle>::type 
+    typename enable_if_c<pixel_traits<typename out_image_type::type>::grayscale && 
+                         is_float_filtering<in_image_type,out_image_type,EXP1,EXP2>::value,rectangle>::type 
    spatially_filter_image_separable (
        const in_image_type& in_img,
        out_image_type& out_img,
        const matrix_exp<EXP1>& row_filter,
        const matrix_exp<EXP2>& col_filter,
+        T scale,
+        bool use_abs = false,
+        bool add_to = false
+    )
+    {
+        if (use_abs == false)
+        {
+            if (scale == 1)
+                return impl::float_spatially_filter_image_separable(in_img, out_img, row_filter, col_filter, add_to);
+            else
+                return impl::float_spatially_filter_image_separable(in_img, out_img, row_filter/scale, col_filter, add_to);
+        }
+        else
+        {
+            return impl::grayscale_spatially_filter_image_separable(in_img, out_img, row_filter, col_filter, scale, true, add_to);
+        }
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename in_image_type,
+        typename out_image_type,
+        typename EXP1,
+        typename EXP2,
+        typename T
+        >
+    typename enable_if_c<pixel_traits<typename out_image_type::type>::grayscale && 
+                         !is_float_filtering<in_image_type,out_image_type,EXP1,EXP2>::value,rectangle>::type 
+    spatially_filter_image_separable (
+        const in_image_type& in_img,
+        out_image_type& out_img,
+        const matrix_exp<EXP1>& _row_filter,
+        const matrix_exp<EXP2>& _col_filter,
+        T scale,
+        bool use_abs = false,
+        bool add_to = false
+    )
+    {
+        const_temp_matrix<EXP1> row_filter(_row_filter);
+        const_temp_matrix<EXP2> col_filter(_col_filter);
+        return impl::grayscale_spatially_filter_image_separable(in_img,out_img, row_filter, col_filter, scale, use_abs, add_to);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename in_image_type,
+        typename out_image_type,
+        typename EXP1,
+        typename EXP2,
+        typename T
+        >
+    typename disable_if_c<pixel_traits<typename out_image_type::type>::grayscale,rectangle>::type 
+    spatially_filter_image_separable (
+        const in_image_type& in_img,
+        out_image_type& out_img,
+        const matrix_exp<EXP1>& _row_filter,
+        const matrix_exp<EXP2>& _col_filter,
        T scale
    )
    {
+        const_temp_matrix<EXP1> row_filter(_row_filter);
+        const_temp_matrix<EXP2> col_filter(_col_filter);
        COMPILE_TIME_ASSERT( pixel_traits<typename in_image_type::type>::has_alpha == false );
        COMPILE_TIME_ASSERT( pixel_traits<typename out_image_type::type>::has_alpha == false );

@@ -428,6 +649,8 @@ namespace dlib
        return non_border;
    }

+// ----------------------------------------------------------------------------------------
+
    template <
        typename in_image_type,
        typename out_image_type,
@@ -444,6 +667,8 @@ namespace dlib
        return spatially_filter_image_separable(in_img,out_img,row_filter,col_filter,1);
    }

+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
 // ----------------------------------------------------------------------------------------

    template <

--- a/dlib/image_transforms/spatial_filtering_abstract.h
+++ b/dlib/image_transforms/spatial_filtering_abstract.h
@@ -121,6 +121,9 @@ namespace dlib
            - #out_img.nr() == in_img.nr()
            - returns a rectangle which indicates what pixels in #out_img are considered 
              non-border pixels and therefore contain output from the filter.
+            - if (use_abs == false && all images and filers contain float types) then
+                - This function will use SIMD instructions and is particularly fast.  So if
+                  you can use this form of the function it can give a decent speed boost.
    !*/

 // ----------------------------------------------------------------------------------------