Fleshed out the AVX SIMD support

80d36f43 · Davis King · 4fec4476 · 80d36f43 · 80d36f43 · 80d36f43
Commit 80d36f43 authored Nov 10, 2013 by Davis King
Showing with 609 additions and 15 deletions

dlib/simd.h dlib/simd.h +1 -0

dlib/simd/simd8f.h dlib/simd/simd8f.h +244 -0

dlib/simd/simd8i.h dlib/simd/simd8i.h +339 -0

dlib/simd/simd_check.h dlib/simd/simd_check.h +25 -15

No files found.
--- a/dlib/simd.h
+++ b/dlib/simd.h
@@ -6,6 +6,7 @@
 #include "simd/simd4f.h"
 #include "simd/simd4i.h"
 #include "simd/simd8f.h"
+#include "simd/simd8i.h"

 #endif // DLIB_SIMd_H__

--- a/dlib/simd/simd8f.h
+++ b/dlib/simd/simd8f.h
@@ -5,6 +5,7 @@

 #include "simd_check.h"
 #include "simd4f.h"
+#include "simd8i.h"


 namespace dlib
@@ -24,6 +25,7 @@ namespace dlib
        inline simd8f(float r0, float r1, float r2, float r3, float r4, float r5, float r6, float r7) 
        { x = _mm256_setr_ps(r0,r1,r2,r3,r4,r5,r6,r7); }

+        // Build from eight 32bit integers, converting every lane to float.
+        inline simd8f(const simd8i& val) { x = _mm256_cvtepi32_ps(val); }
        simd8f(const __m256& val):x(val) {}
        simd8f& operator=(const __m256& val)
        {
@@ -32,6 +34,9 @@ namespace dlib
        }
        inline operator __m256() const { return x; }

+        // Conversion to packed 32bit integers; the float lanes are truncated.
+        inline operator __m256i() const
+        {
+            return _mm256_cvttps_epi32(x);
+        }
+
        void load_aligned(const type* ptr)  { x = _mm256_load_ps(ptr); }
        void store_aligned(type* ptr) const { _mm256_store_ps(ptr, x); }
        void load(const type* ptr)          { x = _mm256_loadu_ps(ptr); }
@@ -51,6 +56,33 @@ namespace dlib
    private:
        __m256 x;
    };
+
+
+    // Mask type returned by the simd8f comparison operators, stored in a
+    // single 256bit AVX register.
+    class simd8f_bool
+    {
+    public:
+        typedef float type;
+
+        simd8f_bool() {}
+
+        // Wrap an existing 256bit mask.
+        simd8f_bool(const __m256& val) { x = val; }
+
+        // Join two 4-lane masks: low becomes the lower 128 bits and high the
+        // upper 128 bits.
+        simd8f_bool(const simd4f_bool& low, const simd4f_bool& high)
+            : x(_mm256_insertf128_ps(_mm256_castps128_ps256(low), high, 1))
+        {
+        }
+
+        simd8f_bool& operator=(const __m256& val)
+        {
+            x = val;
+            return *this;
+        }
+
+        operator __m256() const
+        {
+            return x;
+        }
+
+    private:
+        __m256 x;
+    };
+
 #else
    class simd8f
    {
@@ -62,6 +94,16 @@ namespace dlib
        simd8f(float f) :_low(f),_high(f) {}
        simd8f(float r0, float r1, float r2, float r3, float r4, float r5, float r6, float r7) :
            _low(r0,r1,r2,r3), _high(r4,r5,r6,r7) {}
+        // Build from a simd8i by converting each of its 4-lane halves.
+        simd8f(const simd8i& val) :
+            _low(val.low()),
+            _high(val.high())
+        {
+        }
+
+        // truncate to 32bit integers
+        // Returns both halves packed into a simd8i::rawarray, the type accepted
+        // by simd8i's rawarray constructor.  Each assignment below relies on a
+        // simd4f -> simd4i conversion that is defined outside this file
+        // (presumably the truncating one from simd4f.h -- TODO confirm).
+        operator simd8i::rawarray() const 
+        { 
+            simd8i::rawarray temp;
+            temp.low = _low;
+            temp.high = _high;
+            return temp;
+        }

        void load_aligned(const type* ptr)  { _low.load_aligned(ptr); _high.load_aligned(ptr+4); }
        void store_aligned(type* ptr) const { _low.store_aligned(ptr); _high.store_aligned(ptr+4); }
@@ -83,6 +125,21 @@ namespace dlib
    private:
        simd4f _low, _high;
    };
+
+    // Mask type returned by the simd8f comparison operators when AVX is not
+    // available: a pair of simd4f_bool halves.
+    class simd8f_bool
+    {
+    public:
+        typedef float type;
+
+        simd8f_bool() {}
+
+        simd8f_bool(const simd4f_bool& low_, const simd4f_bool& high_) :
+            _low(low_),
+            _high(high_)
+        {
+        }
+
+        // First half of the mask.
+        simd4f_bool low() const { return _low; }
+        // Second half of the mask.
+        simd4f_bool high() const { return _high; }
+
+    private:
+        simd4f_bool _low;
+        simd4f_bool _high;
+    };
 #endif

 // ----------------------------------------------------------------------------------------
@@ -110,6 +167,20 @@ namespace dlib
    inline simd8f& operator+= (simd8f& lhs, const simd8f& rhs) 
    { return lhs = lhs + rhs; return lhs;}

+// ----------------------------------------------------------------------------------------
+
+    // Lane-wise subtraction of two 8-float vectors.
+    inline simd8f operator- (const simd8f& lhs, const simd8f& rhs)
+    {
+#ifdef DLIB_HAVE_AVX
+        return _mm256_sub_ps(lhs, rhs);
+#else
+        // No AVX: subtract the two 4-float halves separately.
+        return simd8f(lhs.low()-rhs.low(),
+                      lhs.high()-rhs.high());
+#endif
+    }
+    // In-place subtraction; returns lhs so that calls can be chained.
+    inline simd8f& operator-= (simd8f& lhs, const simd8f& rhs)
+    { return lhs = lhs - rhs; }
+
 // ----------------------------------------------------------------------------------------

    inline simd8f operator* (const simd8f& lhs, const simd8f& rhs) 
@@ -124,6 +195,130 @@ namespace dlib
    inline simd8f& operator*= (simd8f& lhs, const simd8f& rhs) 
    { return lhs = lhs * rhs; return lhs;}

+// ----------------------------------------------------------------------------------------
+
+    // Lane-wise division of two 8-float vectors.
+    inline simd8f operator/ (const simd8f& lhs, const simd8f& rhs)
+    {
+#ifdef DLIB_HAVE_AVX
+        return _mm256_div_ps(lhs, rhs);
+#else
+        // No AVX: divide the two 4-float halves separately.
+        return simd8f(lhs.low()/rhs.low(),
+                      lhs.high()/rhs.high());
+#endif
+    }
+    // In-place division; returns lhs so that calls can be chained.
+    inline simd8f& operator/= (simd8f& lhs, const simd8f& rhs)
+    { return lhs = lhs / rhs; }
+
+// ----------------------------------------------------------------------------------------
+
+    // Lane-wise equality test producing a simd8f_bool mask.
+    inline simd8f_bool operator== (const simd8f& lhs, const simd8f& rhs)
+    {
+#ifdef DLIB_HAVE_AVX
+        // _CMP_EQ_OQ is comparison predicate 0.
+        const __m256 mask = _mm256_cmp_ps(lhs, rhs, _CMP_EQ_OQ);
+        return mask;
+#else
+        return simd8f_bool(lhs.low() == rhs.low(), lhs.high() == rhs.high());
+#endif
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Lane-wise inequality test producing a simd8f_bool mask.
+    inline simd8f_bool operator!= (const simd8f& lhs, const simd8f& rhs)
+    {
+#ifdef DLIB_HAVE_AVX
+        // _CMP_NEQ_UQ is comparison predicate 4.
+        const __m256 mask = _mm256_cmp_ps(lhs, rhs, _CMP_NEQ_UQ);
+        return mask;
+#else
+        return simd8f_bool(lhs.low() != rhs.low(), lhs.high() != rhs.high());
+#endif
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Lane-wise less-than test producing a simd8f_bool mask.
+    inline simd8f_bool operator< (const simd8f& lhs, const simd8f& rhs)
+    {
+#ifdef DLIB_HAVE_AVX
+        // _CMP_LT_OS is comparison predicate 1.
+        const __m256 mask = _mm256_cmp_ps(lhs, rhs, _CMP_LT_OS);
+        return mask;
+#else
+        return simd8f_bool(lhs.low() < rhs.low(), lhs.high() < rhs.high());
+#endif
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // lhs > rhs is evaluated as rhs < lhs, i.e. with the operands swapped.
+    inline simd8f_bool operator> (const simd8f& lhs, const simd8f& rhs)
+    {
+        const simd8f_bool swapped_less = (rhs < lhs);
+        return swapped_less;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Lane-wise less-than-or-equal test producing a simd8f_bool mask.
+    inline simd8f_bool operator<= (const simd8f& lhs, const simd8f& rhs)
+    {
+#ifdef DLIB_HAVE_AVX
+        // _CMP_LE_OS is comparison predicate 2.
+        const __m256 mask = _mm256_cmp_ps(lhs, rhs, _CMP_LE_OS);
+        return mask;
+#else
+        return simd8f_bool(lhs.low() <= rhs.low(), lhs.high() <= rhs.high());
+#endif
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // lhs >= rhs is evaluated as rhs <= lhs, i.e. with the operands swapped.
+    inline simd8f_bool operator>= (const simd8f& lhs, const simd8f& rhs)
+    {
+        const simd8f_bool swapped_less_equal = (rhs <= lhs);
+        return swapped_less_equal;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Lane-wise minimum of two 8-float vectors.
+    inline simd8f min (const simd8f& lhs, const simd8f& rhs)
+    {
+#ifdef DLIB_HAVE_AVX
+        const __m256 smallest = _mm256_min_ps(lhs, rhs);
+        return smallest;
+#else
+        return simd8f(min(lhs.low(), rhs.low()), min(lhs.high(), rhs.high()));
+#endif
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Lane-wise maximum of two 8-float vectors.
+    inline simd8f max (const simd8f& lhs, const simd8f& rhs)
+    {
+#ifdef DLIB_HAVE_AVX
+        const __m256 largest = _mm256_max_ps(lhs, rhs);
+        return largest;
+#else
+        return simd8f(max(lhs.low(), rhs.low()), max(lhs.high(), rhs.high()));
+#endif
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Lane-wise reciprocal.  The AVX path uses _mm256_rcp_ps, which Intel
+    // documents as an approximation rather than an exact division.
+    inline simd8f reciprocal (const simd8f& item)
+    {
+#ifdef DLIB_HAVE_AVX
+        const __m256 inverse = _mm256_rcp_ps(item);
+        return inverse;
+#else
+        return simd8f(reciprocal(item.low()), reciprocal(item.high()));
+#endif
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Lane-wise reciprocal square root.  The AVX path uses _mm256_rsqrt_ps,
+    // which Intel documents as an approximation.
+    inline simd8f reciprocal_sqrt (const simd8f& item)
+    {
+#ifdef DLIB_HAVE_AVX
+        const __m256 inverse_root = _mm256_rsqrt_ps(item);
+        return inverse_root;
+#else
+        return simd8f(reciprocal_sqrt(item.low()), reciprocal_sqrt(item.high()));
+#endif
+    }
+
 // ----------------------------------------------------------------------------------------

    inline float sum(const simd8f& item)
@@ -144,6 +339,55 @@ namespace dlib
        return sum(lhs*rhs);
    }
   
+// ----------------------------------------------------------------------------------------
+
+    // Lane-wise square root of an 8-float vector.
+    inline simd8f sqrt(const simd8f& item)
+    {
+#ifdef DLIB_HAVE_AVX
+        const __m256 root = _mm256_sqrt_ps(item);
+        return root;
+#else
+        return simd8f(sqrt(item.low()), sqrt(item.high()));
+#endif
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Lane-wise ceiling of an 8-float vector.
+    inline simd8f ceil(const simd8f& item)
+    {
+#ifdef DLIB_HAVE_AVX
+        const __m256 rounded_up = _mm256_ceil_ps(item);
+        return rounded_up;
+#else
+        return simd8f(ceil(item.low()), ceil(item.high()));
+#endif
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Lane-wise floor of an 8-float vector.
+    inline simd8f floor(const simd8f& item)
+    {
+#ifdef DLIB_HAVE_AVX
+        const __m256 rounded_down = _mm256_floor_ps(item);
+        return rounded_down;
+#else
+        return simd8f(floor(item.low()), floor(item.high()));
+#endif
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Lane-wise conditional: the result takes a where cmp is set and b
+    // elsewhere, i.e. cmp ? a : b.
+    inline simd8f select(const simd8f_bool& cmp, const simd8f& a, const simd8f& b)
+    {
+#ifdef DLIB_HAVE_AVX
+        // blendv picks its second operand where the mask is set, hence (b, a).
+        const __m256 chosen = _mm256_blendv_ps(b, a, cmp);
+        return chosen;
+#else
+        return simd8f(select(cmp.low(), a.low(), b.low()),
+                      select(cmp.high(), a.high(), b.high()));
+#endif
+    }
+
 // ----------------------------------------------------------------------------------------

 }

--- a/dlib/simd/simd8i.h
+++ b/dlib/simd/simd8i.h
+// Copyright (C) 2013  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_sIMD8I_H__
+#define DLIB_sIMD8I_H__
+
+#include "simd_check.h"
+#include "../uintn.h"
+#include "simd4i.h"
+#include <ostream>
+
+namespace dlib
+{
+
+#ifdef DLIB_HAVE_AVX
+    // Eight 32bit integers held in one 256bit AVX register.
+    class simd8i
+    {
+    public:
+        typedef int32 type;
+
+        simd8i() {}
+        // Broadcast f into all eight lanes.
+        simd8i(int32 f) { x = _mm256_set1_epi32(f); }
+        // r0 becomes lane 0 and r7 becomes lane 7.
+        simd8i(int32 r0, int32 r1, int32 r2, int32 r3,
+               int32 r4, int32 r5, int32 r6, int32 r7 )
+        { x = _mm256_setr_epi32(r0,r1,r2,r3,r4,r5,r6,r7); }
+
+        simd8i(const __m256i& val):x(val) {}
+
+        // Join two 4-lane vectors: low becomes the lower 128 bits and high the
+        // upper 128 bits.
+        simd8i(const simd4i& low, const simd4i& high)
+        {
+            x = _mm256_insertf128_si256(_mm256_castsi128_si256(low),high,1);
+        }
+
+        simd8i& operator=(const __m256i& val)
+        {
+            x = val;
+            return *this;
+        }
+
+        operator __m256i() const { return x; }
+
+        // The *_aligned variants use the aligned load/store intrinsics; load()
+        // and store() use the unaligned ones.  All of them touch 8 integers.
+        void load_aligned(const type* ptr)  { x = _mm256_load_si256(reinterpret_cast<const __m256i*>(ptr)); }
+        void store_aligned(type* ptr) const { _mm256_store_si256(reinterpret_cast<__m256i*>(ptr), x); }
+        void load(const type* ptr)          { x = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr)); }
+        void store(type* ptr)         const { _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), x); }
+
+        // Lower and upper 128bit halves.
+        simd4i low() const { return _mm256_castsi256_si128(x); }
+        simd4i high() const { return _mm256_extractf128_si256(x,1); }
+
+        // Number of lanes.  Must be 8, matching the lanes reachable through
+        // operator[] and the value reported by the non-AVX simd8i.
+        unsigned int size() const { return 8; }
+
+        // Read one lane by spilling the register to memory.  idx is not
+        // bounds checked and must be less than 8.
+        int32 operator[](unsigned int idx) const
+        {
+            int32 temp[8];
+            store(temp);
+            return temp[idx];
+        }
+
+    private:
+        __m256i x;
+    };
+#else
+    // Eight 32bit integers emulated with two simd4i halves; used when AVX is
+    // not available.
+    class simd8i
+    {
+    public:
+        typedef int32 type;
+
+        simd8i() {}
+
+        simd8i(const simd4i& low_, const simd4i& high_) :
+            _low(low_),
+            _high(high_)
+        {
+        }
+
+        // Broadcast f into all eight lanes.
+        simd8i(int32 f) :
+            _low(f),
+            _high(f)
+        {
+        }
+
+        // r0..r3 fill the low half and r4..r7 fill the high half.
+        simd8i(int32 r0, int32 r1, int32 r2, int32 r3, int32 r4, int32 r5, int32 r6, int32 r7) :
+            _low(r0, r1, r2, r3),
+            _high(r4, r5, r6, r7)
+        {
+        }
+
+        // Plain pair of halves, used as the target type of simd8f's
+        // float-to-integer conversion operator.
+        struct rawarray
+        {
+            simd4i low, high;
+        };
+
+        simd8i(const rawarray& a) :
+            _low(a.low),
+            _high(a.high)
+        {
+        }
+
+        // Each of these moves 8 integers: the low half at ptr and the high
+        // half at ptr+4.
+        void load_aligned(const type* ptr)
+        {
+            _low.load_aligned(ptr);
+            _high.load_aligned(ptr + 4);
+        }
+        void store_aligned(type* ptr) const
+        {
+            _low.store_aligned(ptr);
+            _high.store_aligned(ptr + 4);
+        }
+        void load(const type* ptr)
+        {
+            _low.load(ptr);
+            _high.load(ptr + 4);
+        }
+        void store(type* ptr) const
+        {
+            _low.store(ptr);
+            _high.store(ptr + 4);
+        }
+
+        unsigned int size() const { return 8; }
+
+        // Lanes 0-3 come from the low half, everything else from the high half.
+        int32 operator[](unsigned int idx) const
+        {
+            return (idx < 4) ? _low[idx] : _high[idx - 4];
+        }
+
+        simd4i low() const { return _low; }
+        simd4i high() const { return _high; }
+
+    private:
+        simd4i _low;
+        simd4i _high;
+    };
+
+#endif
+
+// ----------------------------------------------------------------------------------------
+
+    // Print the eight lanes as "(v0, v1, ..., v7)".
+    inline std::ostream& operator<<(std::ostream& out, const simd8i& item)
+    {
+        int32 lanes[8];
+        item.store(lanes);
+
+        out << "(" << lanes[0];
+        for (int i = 1; i < 8; ++i)
+            out << ", " << lanes[i];
+        out << ")";
+        return out;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Lane-wise addition of two 8-integer vectors.
+    inline simd8i operator+ (const simd8i& lhs, const simd8i& rhs)
+    {
+#ifdef DLIB_HAVE_AVX2
+        return _mm256_add_epi32(lhs, rhs);
+#else
+        // No AVX2 integer instructions: add the two 4-lane halves separately.
+        return simd8i(lhs.low()+rhs.low(),
+                      lhs.high()+rhs.high());
+#endif
+    }
+    // In-place addition; returns lhs so that calls can be chained.
+    inline simd8i& operator+= (simd8i& lhs, const simd8i& rhs)
+    { return lhs = lhs + rhs; }
+
+// ----------------------------------------------------------------------------------------
+
+    // Lane-wise subtraction of two 8-integer vectors.
+    inline simd8i operator- (const simd8i& lhs, const simd8i& rhs)
+    {
+#ifdef DLIB_HAVE_AVX2
+        return _mm256_sub_epi32(lhs, rhs);
+#else
+        // No AVX2 integer instructions: subtract the two halves separately.
+        return simd8i(lhs.low()-rhs.low(),
+                      lhs.high()-rhs.high());
+#endif
+    }
+    // In-place subtraction; returns lhs so that calls can be chained.
+    inline simd8i& operator-= (simd8i& lhs, const simd8i& rhs)
+    { return lhs = lhs - rhs; }
+
+// ----------------------------------------------------------------------------------------
+
+    // Lane-wise multiplication of two 8-integer vectors.  The AVX2 path uses
+    // _mm256_mullo_epi32, which keeps the low 32 bits of each product.
+    inline simd8i operator* (const simd8i& lhs, const simd8i& rhs)
+    {
+#ifdef DLIB_HAVE_AVX2
+        return _mm256_mullo_epi32(lhs, rhs);
+#else
+        // No AVX2 integer instructions: multiply the two halves separately.
+        return simd8i(lhs.low()*rhs.low(),
+                      lhs.high()*rhs.high());
+#endif
+    }
+    // In-place multiplication; returns lhs so that calls can be chained.
+    inline simd8i& operator*= (simd8i& lhs, const simd8i& rhs)
+    { return lhs = lhs * rhs; }
+
+// ----------------------------------------------------------------------------------------
+
+    // Bitwise AND of two 8-integer vectors.
+    inline simd8i operator& (const simd8i& lhs, const simd8i& rhs)
+    {
+#ifdef DLIB_HAVE_AVX2
+        return _mm256_and_si256(lhs, rhs);
+#else
+        // No AVX2 integer instructions: AND the two halves separately.
+        return simd8i(lhs.low()&rhs.low(),
+                      lhs.high()&rhs.high());
+#endif
+    }
+    // In-place AND; returns lhs so that calls can be chained.
+    inline simd8i& operator&= (simd8i& lhs, const simd8i& rhs)
+    { return lhs = lhs & rhs; }
+
+// ----------------------------------------------------------------------------------------
+
+    // Bitwise OR of two 8-integer vectors.
+    inline simd8i operator| (const simd8i& lhs, const simd8i& rhs)
+    {
+#ifdef DLIB_HAVE_AVX2
+        return _mm256_or_si256(lhs, rhs);
+#else
+        // No AVX2 integer instructions: OR the two halves separately.
+        return simd8i(lhs.low()|rhs.low(),
+                      lhs.high()|rhs.high());
+#endif
+    }
+    // In-place OR; returns lhs so that calls can be chained.
+    inline simd8i& operator|= (simd8i& lhs, const simd8i& rhs)
+    { return lhs = lhs | rhs; }
+
+// ----------------------------------------------------------------------------------------
+
+    // Bitwise XOR of two 8-integer vectors.
+    inline simd8i operator^ (const simd8i& lhs, const simd8i& rhs)
+    {
+#ifdef DLIB_HAVE_AVX2
+        return _mm256_xor_si256(lhs, rhs);
+#else
+        // No AVX2 integer instructions: XOR the two halves separately.
+        return simd8i(lhs.low()^rhs.low(),
+                      lhs.high()^rhs.high());
+#endif
+    }
+    // In-place XOR; returns lhs so that calls can be chained.
+    inline simd8i& operator^= (simd8i& lhs, const simd8i& rhs)
+    { return lhs = lhs ^ rhs; }
+
+// ----------------------------------------------------------------------------------------
+
+    // Bitwise NOT of every lane.
+    inline simd8i operator~ (const simd8i& lhs)
+    {
+#ifdef DLIB_HAVE_AVX2
+        // XOR against all-ones flips every bit; -1 is the int32 whose bits are
+        // all set.
+        const __m256i all_ones = _mm256_set1_epi32(-1);
+        return _mm256_xor_si256(lhs, all_ones);
+#else
+        return simd8i(~lhs.low(), ~lhs.high());
+#endif
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Shift every lane left by rhs bits.
+    inline simd8i operator<< (const simd8i& lhs, const int& rhs)
+    {
+#ifdef DLIB_HAVE_AVX2
+        // The shift count is passed in the low bits of a 128bit register.
+        return _mm256_sll_epi32(lhs,_mm_cvtsi32_si128(rhs));
+#else
+        return simd8i(lhs.low()<<rhs,
+                      lhs.high()<<rhs);
+#endif
+    }
+    // In-place left shift; returns lhs so that calls can be chained.
+    inline simd8i& operator<<= (simd8i& lhs, const int& rhs)
+    { return lhs = lhs << rhs; }
+
+// ----------------------------------------------------------------------------------------
+
+    // Shift every lane right by rhs bits.  The AVX2 path uses
+    // _mm256_sra_epi32, i.e. an arithmetic shift that shifts in the sign bit.
+    inline simd8i operator>> (const simd8i& lhs, const int& rhs)
+    {
+#ifdef DLIB_HAVE_AVX2
+        // The shift count is passed in the low bits of a 128bit register.
+        return _mm256_sra_epi32(lhs,_mm_cvtsi32_si128(rhs));
+#else
+        return simd8i(lhs.low()>>rhs,
+                      lhs.high()>>rhs);
+#endif
+    }
+    // In-place right shift; returns lhs so that calls can be chained.
+    inline simd8i& operator>>= (simd8i& lhs, const int& rhs)
+    { return lhs = lhs >> rhs; }
+
+// ----------------------------------------------------------------------------------------
+
+    // Lane-wise equality test; the result is returned as a simd8i mask.
+    inline simd8i operator== (const simd8i& lhs, const simd8i& rhs)
+    {
+#ifdef DLIB_HAVE_AVX2
+        const __m256i mask = _mm256_cmpeq_epi32(lhs, rhs);
+        return mask;
+#else
+        return simd8i(lhs.low() == rhs.low(), lhs.high() == rhs.high());
+#endif
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Inequality is the bitwise complement of the equality mask.
+    inline simd8i operator!= (const simd8i& lhs, const simd8i& rhs)
+    {
+        const simd8i equal_mask = (lhs == rhs);
+        return ~equal_mask;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Lane-wise greater-than test; the result is returned as a simd8i mask.
+    inline simd8i operator> (const simd8i& lhs, const simd8i& rhs)
+    {
+#ifdef DLIB_HAVE_AVX2
+        const __m256i mask = _mm256_cmpgt_epi32(lhs, rhs);
+        return mask;
+#else
+        return simd8i(lhs.low() > rhs.low(), lhs.high() > rhs.high());
+#endif
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // lhs < rhs is evaluated as rhs > lhs, i.e. with the operands swapped.
+    inline simd8i operator< (const simd8i& lhs, const simd8i& rhs)
+    {
+        const simd8i swapped_greater = (rhs > lhs);
+        return swapped_greater;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // lhs <= rhs is the bitwise complement of the lhs > rhs mask.
+    inline simd8i operator<= (const simd8i& lhs, const simd8i& rhs)
+    {
+        const simd8i greater_mask = (lhs > rhs);
+        return ~greater_mask;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // lhs >= rhs is evaluated as rhs <= lhs, i.e. with the operands swapped.
+    inline simd8i operator>= (const simd8i& lhs, const simd8i& rhs)
+    {
+        const simd8i swapped_less_equal = (rhs <= lhs);
+        return swapped_less_equal;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Lane-wise minimum of two 8-integer vectors.
+    inline simd8i min (const simd8i& lhs, const simd8i& rhs)
+    {
+#ifdef DLIB_HAVE_AVX2
+        const __m256i smallest = _mm256_min_epi32(lhs, rhs);
+        return smallest;
+#else
+        return simd8i(min(lhs.low(), rhs.low()), min(lhs.high(), rhs.high()));
+#endif
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Lane-wise maximum of two 8-integer vectors.
+    inline simd8i max (const simd8i& lhs, const simd8i& rhs)
+    {
+#ifdef DLIB_HAVE_AVX2
+        const __m256i largest = _mm256_max_epi32(lhs, rhs);
+        return largest;
+#else
+        return simd8i(max(lhs.low(), rhs.low()), max(lhs.high(), rhs.high()));
+#endif
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Sum of all eight lanes: add the two halves lane by lane, then sum the
+    // resulting four lanes.
+    inline int32 sum(const simd8i& item)
+    {
+        const simd4i folded = item.low() + item.high();
+        return sum(folded);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Lane-wise conditional: the result takes a where cmp is set and b
+    // elsewhere, i.e. cmp ? a : b.
+    inline simd8i select(const simd8i& cmp, const simd8i& a, const simd8i& b)
+    {
+#ifdef DLIB_HAVE_AVX2
+        // blendv picks its second operand where the mask is set, hence (b, a).
+        const __m256i chosen = _mm256_blendv_epi8(b, a, cmp);
+        return chosen;
+#else
+        return simd8i(select(cmp.low(), a.low(), b.low()),
+                      select(cmp.high(), a.high(), b.high()));
+#endif
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_sIMD8I_H__
+
+
--- a/dlib/simd/simd_check.h
+++ b/dlib/simd/simd_check.h
@@ -3,6 +3,7 @@
 #ifndef DLIB_SIMd_CHECK_H__
 #define DLIB_SIMd_CHECK_H__

+//#define DLIB_DO_NOT_USE_SIMD

 // figure out which SIMD instructions we can use.
 #ifndef DLIB_DO_NOT_USE_SIMD
@@ -27,29 +28,38 @@
        #ifdef __AVX__
            #define DLIB_HAVE_AVX
        #endif
+        #ifdef __AVX2__
+            #define DLIB_HAVE_AVX2
+        #endif
    #endif
 #endif

 
 // ----------------------------------------------------------------------------------------

-#ifdef DLIB_HAVE_SSE2
-    #include <xmmintrin.h>
-    #include <emmintrin.h>
-    #include <mmintrin.h>
-#endif
-#ifdef DLIB_HAVE_SSE3
-    #include <pmmintrin.h> // SSE3
-    #include <tmmintrin.h>
-#endif
-#ifdef DLIB_HAVE_SSE41
-    #include <smmintrin.h> // SSE4
-#endif
-#ifdef DLIB_HAVE_AVX
-    #include <immintrin.h> // AVX
+// Pull in the intrinsic headers.  <x86intrin.h> only exists for x86 targets,
+// so the GCC umbrella header is used only when actually compiling for x86.
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+    #include <x86intrin.h>
+#else
+    #ifdef DLIB_HAVE_SSE2
+        #include <xmmintrin.h>
+        #include <emmintrin.h>
+        #include <mmintrin.h>
+    #endif
+    #ifdef DLIB_HAVE_SSE3
+        #include <pmmintrin.h> // SSE3
+        #include <tmmintrin.h>
+    #endif
+    #ifdef DLIB_HAVE_SSE41
+        #include <smmintrin.h> // SSE4
+    #endif
+    #if defined(DLIB_HAVE_AVX) || defined(DLIB_HAVE_AVX2)
+        // AVX2 intrinsics are also provided by <immintrin.h>.  <avx2intrin.h>
+        // is a compiler-internal header: it must not be included directly and
+        // is not shipped by non-GNU compilers.
+        #include <immintrin.h> // AVX, AVX2
+    #endif
+#endif

-
 #endif // DLIB_SIMd_CHECK_H__