Commit 22164e5d authored by Joachim's avatar Joachim
Browse files

Merge remote-tracking branch 'upstream/master'

parents 462d61ef 664ef398
......@@ -21,7 +21,9 @@ mkdir build; cd build; cmake .. -DUSE_AVX_INSTRUCTIONS=1; cmake --build .
Doing so will make some things run faster.
## Compiling your own C++ programs that use dlib
The examples folder has a [CMake tutorial](https://github.com/davisking/dlib/blob/master/examples/CMakeLists.txt) that tells you what to do. There are also additional instructions on the [dlib web site](http://dlib.net/compile.html).
## Compiling dlib Python API
......
......@@ -27,29 +27,27 @@ endif()
# is used.
cmake_policy(SET CMP0023 OLD)
include(cmake_utils/add_global_compiler_switch.cmake)
# Append -D<option_name> to the global list of active preprocessor switches.
# This list is later applied to dlib's targets via target_compile_options()
# and also appended to CUDA_NVCC_FLAGS.
macro (enable_preprocessor_switch option_name)
    list(APPEND active_preprocessor_switches "-D${option_name}")
endmacro()
if (DLIB_IN_PROJECT_BUILD)
# Make sure ENABLE_ASSERTS is defined for debug builds, but only for uses
# who are building an application. If they are just building dlib as a
# stand alone library then don't set this because it will conflict with the
# settings in config.h if we did.
if (NOT CMAKE_CXX_FLAGS_DEBUG MATCHES "-DENABLE_ASSERTS")
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DENABLE_ASSERTS"
CACHE STRING "Flags used by the compiler during C++ debug builds."
FORCE)
macro (disable_preprocessor_switch option_name)
if (active_preprocessor_switches)
list(REMOVE_ITEM active_preprocessor_switches "-D${option_name}")
endif()
endif()
endmacro()
macro (toggle_preprocessor_switch option_name)
if (${option_name})
add_global_define(${option_name})
enable_preprocessor_switch(${option_name})
else()
remove_global_define(${option_name})
disable_preprocessor_switch(${option_name})
endif()
endmacro()
# Suppress superfluous randlib warnings about libdlib.a having no symbols on MacOSX.
if (APPLE)
set(CMAKE_C_ARCHIVE_CREATE "<CMAKE_AR> Scr <TARGET> <LINK_FLAGS> <OBJECTS>")
......@@ -72,8 +70,6 @@ if (NOT TARGET dlib)
"Enable this if you don't want to compile any of the dlib GUI code" )
set (DLIB_ENABLE_STACK_TRACE_STR
"Enable this if you want to turn on the DLIB_STACK_TRACE macros" )
set (DLIB_ENABLE_ASSERTS_STR
"Enable this if you want to turn on the DLIB_ASSERT macro" )
set (DLIB_USE_BLAS_STR
"Disable this if you don't want to use a BLAS library" )
set (DLIB_USE_LAPACK_STR
......@@ -91,7 +87,11 @@ if (NOT TARGET dlib)
#set (DLIB_USE_FFTW_STR "Disable this if you don't want to link against fftw" )
# Fix typo in the user-visible option description ("is you" -> "if you").
set (DLIB_USE_MKL_FFT_STR
    "Disable this if you don't want to use the MKL DFTI FFT implementation" )
set (DLIB_ENABLE_ASSERTS_STR
"Enable this if you want to turn on the DLIB_ASSERT macro" )
option(DLIB_ENABLE_ASSERTS ${DLIB_ENABLE_ASSERTS_STR} OFF)
option(DLIB_ISO_CPP_ONLY ${DLIB_ISO_CPP_ONLY_STR} OFF)
toggle_preprocessor_switch(DLIB_ISO_CPP_ONLY)
option(DLIB_NO_GUI_SUPPORT ${DLIB_NO_GUI_SUPPORT_STR} OFF)
......@@ -99,27 +99,32 @@ if (NOT TARGET dlib)
option(DLIB_ENABLE_STACK_TRACE ${DLIB_ENABLE_STACK_TRACE_STR} OFF)
toggle_preprocessor_switch(DLIB_ENABLE_STACK_TRACE)
option(DLIB_ENABLE_ASSERTS ${DLIB_ENABLE_ASSERTS_STR} OFF)
if(DLIB_ENABLE_ASSERTS)
# Set these variables so they are set in the config.h.in file when dlib
# is installed.
set (DLIB_DISABLE_ASSERTS false)
set (ENABLE_ASSERTS true)
add_global_define(ENABLE_ASSERTS)
remove_global_define(DLIB_DISABLE_ASSERTS)
enable_preprocessor_switch(ENABLE_ASSERTS)
disable_preprocessor_switch(DLIB_DISABLE_ASSERTS)
else()
# Set these variables so they are set in the config.h.in file when dlib
# is installed.
set (DLIB_DISABLE_ASSERTS true)
set (ENABLE_ASSERTS false)
remove_global_define(ENABLE_ASSERTS)
# Never force the asserts off when doing an in project build. Instead,
# let the debug/release mode setting toggle asserts on or off (or the
# DLIB_ENABLE_ASSERTS option obviously). That is, even if the
# DLIB_ENABLE_ASSERTS option is off debug mode can still cause the
# asserts to turn on when using an in project build.
disable_preprocessor_switch(ENABLE_ASSERTS)
# Never force the asserts off when doing an in project build. The only
# time this matters is when using visual studio. The visual studio IDE
# has a drop down that lets the user select either release or debug
# builds. The DLIB_ASSERT macro is setup to enable/disable automatically
# based on this drop down (via preprocessor magic). However, if
# DLIB_DISABLE_ASSERTS is defined it permanently disables asserts no
# matter what, which would defeat the visual studio drop down. So here
# we make a point to not do that kind of severe disabling when in a
# project build. It should also be pointed out that DLIB_DISABLE_ASSERTS
# is only needed when building and installing dlib as a separately
# installed library. It doesn't matter when doing an in project build.
if (NOT DLIB_IN_PROJECT_BUILD)
add_global_define(DLIB_DISABLE_ASSERTS)
enable_preprocessor_switch(DLIB_DISABLE_ASSERTS)
endif()
endif()
......@@ -234,7 +239,7 @@ if (NOT TARGET dlib)
# If both X11 and anaconda are installed, it's possible for the
# anaconda path to appear before /opt/X11, so we remove anaconda.
foreach (ITR ${X11_INCLUDE_DIR})
if ("${ITR}" MATCHES "(.*)anaconda(.*)")
if ("${ITR}" MATCHES "(.*)(ana|mini)conda(.*)")
list (REMOVE_ITEM X11_INCLUDE_DIR ${ITR})
endif ()
endforeach(ITR)
......@@ -265,7 +270,7 @@ if (NOT TARGET dlib)
message(" *** You can download XQuartz from: http://xquartz.macosforge.org/landing/ ***")
message(" *****************************************************************************")
set(DLIB_NO_GUI_SUPPORT ON CACHE STRING ${DLIB_NO_GUI_SUPPORT_STR} FORCE )
add_global_define(DLIB_NO_GUI_SUPPORT)
enable_preprocessor_switch(DLIB_NO_GUI_SUPPORT)
endif()
endif()
......@@ -295,7 +300,7 @@ if (NOT TARGET dlib)
message(" *** On Ubuntu run: sudo apt-get install libx11-dev ***")
message(" *****************************************************************************")
set(DLIB_NO_GUI_SUPPORT ON CACHE STRING ${DLIB_NO_GUI_SUPPORT_STR} FORCE )
add_global_define(DLIB_NO_GUI_SUPPORT)
enable_preprocessor_switch(DLIB_NO_GUI_SUPPORT)
endif()
endif()
......@@ -524,6 +529,7 @@ if (NOT TARGET dlib)
# magic in the standard C++ header files (since nvcc uses gcc headers on
# linux).
list(APPEND CUDA_NVCC_FLAGS "-arch=sm_30;-D__STRICT_ANSI__;-D_MWAITXINTRIN_H_INCLUDED;-D_FORCE_INLINES;${FLAGS_FOR_NVCC}")
list(APPEND CUDA_NVCC_FLAGS ${active_preprocessor_switches})
if (NOT MSVC)
list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
endif()
......@@ -568,21 +574,44 @@ if (NOT TARGET dlib)
endif()
endif()
endif()
endif()
# Find where cuSOLVER is since the FindCUDA cmake package doesn't
# bother to look for it.
get_filename_component(cuda_blas_path "${CUDA_CUBLAS_LIBRARIES}" DIRECTORY)
find_library(cusolver cusolver HINTS ${cuda_blas_path})
mark_as_advanced(cusolver)
# Also find OpenMP since cuSOLVER needs it. Importantly, we only
# look for one to link to if our use of BLAS, specifically the
# Intel MKL, hasn't already decided what to use. This is because
# it makes the MKL bug out if you link to another openmp lib other
# than Intel's when you use the MKL.
if (NOT openmp_libarires AND NOT MSVC)
find_package(OpenMP)
if (OPENMP_FOUND)
set(openmp_libarires ${OpenMP_CXX_FLAGS})
else()
message(STATUS "*** Didn't find OpenMP, which is required to use CUDA. ***")
set(CUDA_FOUND 0)
endif()
endif()
endif()
if (CUDA_FOUND AND cudnn AND cudnn_include AND COMPILER_CAN_DO_CPP_11 AND cuda_test_compile_worked AND cudnn_test_compile_worked)
set(source_files ${source_files}
dnn/cuda_dlib.cu
dnn/cudnn_dlibapi.cpp
dnn/cublas_dlibapi.cpp
dnn/cusolver_dlibapi.cu
dnn/curand_dlibapi.cpp
dnn/cuda_data_ptr.cpp
dnn/gpu_data.cpp
)
set(dlib_needed_libraries ${dlib_needed_libraries}
${CUDA_CUBLAS_LIBRARIES}
${cudnn}
${CUDA_curand_LIBRARY}
${cusolver}
${openmp_libarires}
)
include_directories(${cudnn_include})
else()
......@@ -647,6 +676,11 @@ if (NOT TARGET dlib)
PUBLIC ${dlib_needed_includes}
)
target_link_libraries(dlib PRIVATE ${dlib_needed_libraries})
if (DLIB_IN_PROJECT_BUILD)
target_compile_options(dlib PUBLIC ${active_preprocessor_switches})
else()
target_compile_options(dlib PRIVATE ${active_preprocessor_switches})
endif()
if (UNIX AND NOT DLIB_IN_PROJECT_BUILD)
if (DLIB_USE_CUDA)
cuda_add_library(dlib_shared SHARED ${source_files} )
......@@ -661,6 +695,7 @@ if (NOT TARGET dlib)
PUBLIC ${dlib_needed_includes}
)
target_link_libraries(dlib_shared PRIVATE ${dlib_needed_libraries})
target_compile_options(dlib_shared PRIVATE ${active_preprocessor_switches})
endif()
endif () ##### end of if NOT DLIB_ISO_CPP_ONLY ##########################################################
......@@ -669,6 +704,7 @@ if (NOT TARGET dlib)
if (DLIB_TEST_COMPILE_ALL_SOURCE_CPP)
ADD_LIBRARY(dlib_all_source_cpp STATIC all/source.cpp)
target_link_libraries(dlib_all_source_cpp dlib)
target_compile_options(dlib_all_source_cpp PUBLIC ${active_preprocessor_switches})
endif()
# Install the library
......
......@@ -68,7 +68,7 @@ if (PYTHON3)
if (NOT Boost_FOUND)
FIND_PACKAGE(Boost 1.41.0 COMPONENTS python)
endif()
set(Python_ADDITIONAL_VERSIONS 3.5)
set(Python_ADDITIONAL_VERSIONS 3.5 3.6)
FIND_PACKAGE(PythonLibs 3.4 REQUIRED)
else()
FIND_PACKAGE(Boost 1.41.0 COMPONENTS python)
......
......@@ -16,6 +16,8 @@
# lapack_libraries - link against these to use LAPACK library
# mkl_libraries - link against these to use the MKL library
# mkl_include_dir - add to the include path to use the MKL library
# openmp_libarires - Set to Intel's OpenMP library if and only if we
# find the MKL.
# setting this makes CMake allow normal looking if else statements
SET(CMAKE_ALLOW_LOOSE_LOOP_CONSTRUCTS true)
......@@ -91,11 +93,13 @@ if (UNIX OR MINGW)
/opt/intel/include
)
find_path(mkl_include_dir mkl_version.h ${mkl_include_search_path})
mark_as_advanced(mkl_include_dir)
# Search for the needed libraries from the MKL. We will try to link against the mkl_rt
# file first since this way avoids linking bugs in some cases.
find_library(mkl_rt mkl_rt ${mkl_search_path})
mark_as_advanced( mkl_rt )
find_library(openmp_libarires iomp5 ${mkl_search_path})
mark_as_advanced( mkl_rt openmp_libarires )
# if we found the MKL
if ( mkl_rt)
set(mkl_libraries ${mkl_rt} )
......
......@@ -80,6 +80,12 @@ namespace dlib
fout << " ignore='" << b.ignore << "'";
if (b.angle != 0)
fout << " angle='" << b.angle << "'";
if (b.age != 0)
fout << " age='" << b.age << "'";
if (b.gender == FEMALE)
fout << " gender='female'";
else if (b.gender == MALE)
fout << " gender='male'";
if (b.pose != 0)
fout << " pose='" << b.pose << "'";
if (b.detection_score != 0)
......@@ -196,6 +202,18 @@ namespace dlib
if (atts.is_in_list("occluded")) temp_box.occluded = sa = atts["occluded"];
if (atts.is_in_list("ignore")) temp_box.ignore = sa = atts["ignore"];
if (atts.is_in_list("angle")) temp_box.angle = sa = atts["angle"];
if (atts.is_in_list("age")) temp_box.age = sa = atts["age"];
if (atts.is_in_list("gender"))
{
if (atts["gender"] == "male")
temp_box.gender = MALE;
else if (atts["gender"] == "female")
temp_box.gender = FEMALE;
else if (atts["gender"] == "unknown")
temp_box.gender = UNKNOWN;
else
throw dlib::error("Invalid gender string in box attribute.");
}
if (atts.is_in_list("pose")) temp_box.pose = sa = atts["pose"];
if (atts.is_in_list("detection_score")) temp_box.detection_score = sa = atts["detection_score"];
......
......@@ -14,6 +14,15 @@ namespace dlib
namespace image_dataset_metadata
{
// ------------------------------------------------------------------------------------
// Gender annotation for a labeled box in an image dataset.  UNKNOWN is the
// default used when the XML file has no gender attribute for a box.
enum gender_t
{
    UNKNOWN,
    MALE,
    FEMALE
};
// ------------------------------------------------------------------------------------
struct box
......@@ -36,7 +45,9 @@ namespace dlib
ignore(false),
pose(0),
detection_score(0),
angle(0)
angle(0),
gender(UNKNOWN),
age(0)
{}
box (
......@@ -49,7 +60,9 @@ namespace dlib
ignore(false),
pose(0),
detection_score(0),
angle(0)
angle(0),
gender(UNKNOWN),
age(0)
{}
rectangle rect;
......@@ -72,6 +85,9 @@ namespace dlib
// image counter-clockwise by angle radians.
double angle;
gender_t gender;
double age;
bool has_label() const { return label.size() != 0; }
/*!
ensures
......
......@@ -3396,7 +3396,7 @@ namespace dlib
visitor&& v
)
{
vl_loop<i+1, num>::visit(net,v);
vl_loop_backwards<i+1, num>::visit(net,v);
v(i, layer<i>(net));
}
};
......
......@@ -486,6 +486,46 @@ namespace dlib
}
}
// ----------------------------------------------------------------------------------------
void affine_transform(
    const rectangle& rect,
    tensor& dest,
    const tensor& src1,
    const tensor& src2,
    const tensor& src3,
    float A,
    float B,
    float C
)
{
    /*!
        Viewing each tensor as a num_samples x (size()/num_samples()) matrix,
        computes, for every element inside rect only:
            dest = A*src1 + B*src2 + C*src3
        Elements outside rect are left untouched.
    !*/
    DLIB_CASSERT(dest.size() == src1.size());
    DLIB_CASSERT(dest.size() == src2.size());
    DLIB_CASSERT(dest.size() == src3.size());
    DLIB_CASSERT(dest.num_samples() == src1.num_samples());
    DLIB_CASSERT(dest.num_samples() == src2.num_samples());
    DLIB_CASSERT(dest.num_samples() == src3.num_samples());
    DLIB_CASSERT(rectangle(0,0, dest.size()/dest.num_samples()-1, dest.num_samples()-1).contains(rect));

    float* out = dest.host();
    const float* in1 = src1.host();
    const float* in2 = src2.host();
    const float* in3 = src3.host();
    // Row pitch of the matrix view of the tensors.
    const long row_pitch = dest.size()/dest.num_samples();
    for (long row = rect.top(); row <= rect.bottom(); ++row)
    {
        const long row_begin = row*row_pitch;
        for (long col = rect.left(); col <= rect.right(); ++col)
        {
            const long i = row_begin + col;
            out[i] = A*in1[i] + B*in2[i] + C*in3[i];
        }
    }
}
// -----------------------------------------------------------------------------------
void compute_adam_update (
......
......@@ -7,6 +7,7 @@
// and cudnn_dlibapi.h
#include "tensor.h"
#include "../geometry/rectangle.h"
namespace dlib
{
......@@ -111,6 +112,19 @@ namespace dlib
const tensor& B
);
// -----------------------------------------------------------------------------------
void affine_transform(
const rectangle& rect,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
float A,
float B,
float C
);
// -----------------------------------------------------------------------------------
void compute_adam_update (
......
// Copyright (C) 2017 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuDA_DATA_PTR_CPP_
#define DLIB_DNN_CuDA_DATA_PTR_CPP_
#ifdef DLIB_USE_CUDA
#include "cuda_data_ptr.h"
#include "cuda_utils.h"
namespace dlib
{
namespace cuda
{
// -----------------------------------------------------------------------------------
// Allocates an n byte buffer on the current CUDA device.  A zero-sized
// request performs no allocation and leaves pdata empty.
cuda_data_void_ptr::
cuda_data_void_ptr(
    size_t n
) : num(n)
{
    if (n == 0)
        return;

    void* data = nullptr;
    CHECK_CUDA(cudaMalloc(&data, n));
    // Hand ownership to the shared_ptr; its deleter frees the device memory.
    // A cudaFree() failure is only logged (not thrown) since this runs from
    // the destructor path.
    pdata.reset(data, [](void* ptr){
        auto err = cudaFree(ptr);
        if(err!=cudaSuccess)
            std::cerr << "cudaFree() failed. Reason: " << cudaGetErrorString(err) << std::endl;
    });
}
// ------------------------------------------------------------------------------------
// Copies the entire device buffer src into host memory at dest.
// A zero-sized buffer is a no-op.
void memcpy(
    void* dest,
    const cuda_data_void_ptr& src
)
{
    if (src.size() == 0)
        return;
    CHECK_CUDA(cudaMemcpy(dest, src.data(), src.size(), cudaMemcpyDefault));
}
// ------------------------------------------------------------------------------------
// Copies dest.size() bytes from host memory at src into the device buffer dest.
// A zero-sized buffer is a no-op.
void memcpy(
    cuda_data_void_ptr& dest,
    const void* src
)
{
    if (dest.size() == 0)
        return;
    CHECK_CUDA(cudaMemcpy(dest.data(), src, dest.size(), cudaMemcpyDefault));
}
// ------------------------------------------------------------------------------------
}
}
#endif // DLIB_USE_CUDA
#endif // DLIB_DNN_CuDA_DATA_PTR_CPP_
// Copyright (C) 2017 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuDA_DATA_PTR_H_
#define DLIB_DNN_CuDA_DATA_PTR_H_
#ifdef DLIB_USE_CUDA
#include <memory>
#include <vector>
namespace dlib
{
namespace cuda
{
// ------------------------------------------------------------------------------------
class cuda_data_void_ptr
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            This is a block of memory on a CUDA device.
    !*/
public:

    cuda_data_void_ptr() = default;

    cuda_data_void_ptr(size_t n);
    /*!
        ensures
            - This object will allocate a device memory buffer of n bytes.
            - #size() == n
    !*/

    void* data() { return pdata.get(); }
    const void* data() const { return pdata.get(); }

    operator void*() { return pdata.get(); }
    operator const void*() const { return pdata.get(); }

    // Releases this object's reference to the device buffer.
    void reset() { pdata.reset(); }

    size_t size() const { return num; }
    /*!
        ensures
            - returns the length of this buffer, in bytes.
    !*/

private:
    // number of bytes in the buffer referenced by pdata.
    size_t num = 0;
    // shared ownership of the device allocation; the deleter calls cudaFree().
    std::shared_ptr<void> pdata;
};
// ------------------------------------------------------------------------------------
void memcpy(
    void* dest,
    const cuda_data_void_ptr& src
);
/*!
    requires
        - dest == a pointer to at least src.size() bytes on the host machine.
    ensures
        - copies the GPU data from src into dest.
!*/

// ------------------------------------------------------------------------------------

void memcpy(
    cuda_data_void_ptr& dest,
    const void* src
);
/*!
    requires
        - src == a pointer to at least dest.size() bytes on the host machine.
    ensures
        - copies the host data from src to the GPU memory buffer dest.
!*/
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
template <typename T>
class cuda_data_ptr
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            This is a block of memory on a CUDA device.  It is just a type safe
            version of cuda_data_void_ptr.
    !*/
public:

    static_assert(std::is_standard_layout<T>::value, "You can only create basic standard layout types on the GPU");

    cuda_data_ptr() = default;
    cuda_data_ptr(size_t n) : num(n)
    /*!
        ensures
            - This object will allocate a device memory buffer of n T objects.
            - #size() == n
    !*/
    {
        if (n == 0)
            return;

        pdata = cuda_data_void_ptr(n*sizeof(T));
    }

    T* data() { return (T*)pdata.data(); }
    const T* data() const { return (T*)pdata.data(); }

    operator T*() { return (T*)pdata.data(); }
    operator const T*() const { return (T*)pdata.data(); }

    void reset() { pdata.reset(); }

    // Returns the number of T objects in the buffer (not bytes).
    size_t size() const { return num; }

    // Copies the device buffer src into the host vector dest, resizing dest
    // to src.size() first.
    friend void memcpy(
        std::vector<T>& dest,
        const cuda_data_ptr& src
    )
    {
        dest.resize(src.size());
        if (src.size() != 0)
            memcpy(dest.data(), src.pdata);
    }

    // Copies the host vector src into the device buffer dest, reallocating
    // dest if its size doesn't match.  NOTE: previously the parameter names
    // were swapped and the code tried to assign to the const host vector,
    // which could never compile once this template was instantiated.
    friend void memcpy(
        cuda_data_ptr& dest,
        const std::vector<T>& src
    )
    {
        if (dest.size() != src.size())
            dest = cuda_data_ptr<T>(src.size());

        if (src.size() != 0)
            memcpy(dest.pdata, src.data());
    }

private:

    size_t num = 0;
    cuda_data_void_ptr pdata;
};
// ------------------------------------------------------------------------------------
}
}
#endif // DLIB_USE_CUDA
#endif // DLIB_DNN_CuDA_DATA_PTR_H_
......@@ -575,6 +575,57 @@ namespace dlib
launch_kernel(_cuda_affine_transform1_0,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A);
}
// ----------------------------------------------------------------------------------------
// Kernel computing d[idx] = A*s1[idx] + B*s2[idx] + C*s3[idx] over a
// rectangular sub-region of a row-major matrix.
//   n         - number of elements inside the rectangle.
//   rect_nc   - width of the rectangle.
//   total_nc  - full row pitch of the underlying matrix.
//   start_idx - linear offset of the rectangle's top-left corner.
__global__ void _cuda_affine_transform_rect(
    float* d,
    const float* s1,
    const float* s2,
    const float* s3,
    float A,
    float B,
    float C,
    size_t start_idx,
    size_t n,
    size_t rect_nc,
    size_t total_nc
)
{
    for (auto i : grid_stride_range(0, n))
    {
        // Convert the flat in-rectangle index into (row, column)...
        size_t r = i/rect_nc;
        size_t c = i%rect_nc;
        // ...then into a linear index in the full matrix.
        size_t idx = r*total_nc + c + start_idx;
        d[idx] = A*s1[idx] + B*s2[idx] + C*s3[idx];
    }
}
// Computes dest = A*src1 + B*src2 + C*src3 over only the elements inside
// rect, viewing each tensor as a num_samples x (size()/num_samples())
// row-major matrix.  Elements outside rect are left untouched.
void affine_transform(
    const rectangle& rect,
    tensor& dest,
    const tensor& src1,
    const tensor& src2,
    const tensor& src3,
    float A,
    float B,
    float C
)
{
    DLIB_CASSERT(dest.size() == src1.size());
    DLIB_CASSERT(dest.size() == src2.size());
    DLIB_CASSERT(dest.size() == src3.size());
    DLIB_CASSERT(dest.num_samples() == src1.num_samples());
    DLIB_CASSERT(dest.num_samples() == src2.num_samples());
    DLIB_CASSERT(dest.num_samples() == src3.num_samples());
    DLIB_CASSERT(rectangle(0,0, dest.size()/dest.num_samples()-1, dest.num_samples()-1).contains(rect));
    // Kernel arguments: linear offset of rect's top-left corner, number of
    // elements to process, rect width, and the full row pitch so the kernel
    // can map a flat rect index back to a matrix position.
    launch_kernel(_cuda_affine_transform_rect,max_jobs(rect.area()),
        dest.device(), src1.device(), src2.device(), src3.device(), A, B, C,
        rect.left() + rect.top()*(dest.size()/dest.num_samples()),
        rect.area(),
        rect.width(),
        dest.size()/dest.num_samples());
}
// ----------------------------------------------------------------------------------------
__global__ void _cuda_affine_transform4(float* d, const float* s1, const float* s2, size_t n, float A, float B, float C)
......
......@@ -5,6 +5,7 @@
#include "tensor.h"
#include "../geometry/rectangle.h"
namespace dlib
{
......@@ -230,6 +231,17 @@ namespace dlib
const float C
);
void affine_transform(
const rectangle& rect,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
float A,
float B,
float C
);
// Note that this function isn't in the tt:: namespace because add_scaled() is
// called by cuda::add() so we don't need a tt:: version of add_scaled().
void add_scaled(
......
......@@ -52,6 +52,17 @@ namespace dlib
cublas_error(const std::string& message): cuda_error(message) {}
};
struct cusolver_error : public cuda_error
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            This is the exception thrown if any call to the NVIDIA cuSolver
            library returns an error.
    !*/
    cusolver_error(const std::string& message): cuda_error(message) {}
};
}
......
......@@ -11,6 +11,10 @@
#include <cuda_runtime.h>
#include <sstream>
#include <iostream>
#include <memory>
#include <vector>
#include <type_traits>
// Check the return value of a call to the CUDA runtime for an error condition.
......
......@@ -820,6 +820,16 @@ namespace dlib
filters.nc()));
CHECK_CUDNN(cudnnCreateConvolutionDescriptor((cudnnConvolutionDescriptor_t*)&conv_handle));
#if CUDNN_MAJOR >= 6
CHECK_CUDNN(cudnnSetConvolution2dDescriptor((cudnnConvolutionDescriptor_t)conv_handle,
padding_y, // vertical padding
padding_x, // horizontal padding
stride_y,
stride_x,
1, 1, // must be 1,1
CUDNN_CROSS_CORRELATION,
CUDNN_DATA_FLOAT)); // could also be CUDNN_CONVOLUTION
#else
CHECK_CUDNN(cudnnSetConvolution2dDescriptor((cudnnConvolutionDescriptor_t)conv_handle,
padding_y, // vertical padding
padding_x, // horizontal padding
......@@ -827,6 +837,7 @@ namespace dlib
stride_x,
1, 1, // must be 1,1
CUDNN_CROSS_CORRELATION)); // could also be CUDNN_CONVOLUTION
#endif
CHECK_CUDNN(cudnnGetConvolution2dForwardOutputDim(
(const cudnnConvolutionDescriptor_t)conv_handle,
......
// Copyright (C) 2017 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuSOLVER_CU_
#define DLIB_DNN_CuSOLVER_CU_
#ifdef DLIB_USE_CUDA
#include "cusolver_dlibapi.h"
#include <cublas_v2.h>
#include <cusolverDn.h>
#include "cuda_utils.h"
// ----------------------------------------------------------------------------------------
// Translates a cuSolver status code into a human readable message.
static const char* cusolver_get_error_string(cusolverStatus_t s)
{
    if (s == CUSOLVER_STATUS_NOT_INITIALIZED)
        return "CUDA Runtime API initialization failed.";
    if (s == CUSOLVER_STATUS_ALLOC_FAILED)
        return "CUDA Resources could not be allocated.";
    return "A call to cuSolver failed";
}
// Check the return value of a call to the cuSolver runtime for an error condition.
// Evaluates `call` exactly once; on any status other than
// CUSOLVER_STATUS_SUCCESS it throws dlib::cusolver_error carrying the call
// text, file, line, numeric code, and a human readable reason.
#define CHECK_CUSOLVER(call) \
do{ \
    const cusolverStatus_t error = call; \
    if (error != CUSOLVER_STATUS_SUCCESS) \
    { \
        std::ostringstream sout; \
        sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\
        sout << "code: " << error << ", reason: " << cusolver_get_error_string(error);\
        throw dlib::cusolver_error(sout.str()); \
    } \
}while(false)
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
namespace dlib
{
namespace cuda
{
// -----------------------------------------------------------------------------------
class cusolver_context
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            Lazily creates and caches one cusolverDnHandle_t per CUDA device,
            handing out the handle matching the currently selected device.
    !*/
public:
    // not copyable
    cusolver_context(const cusolver_context&) = delete;
    cusolver_context& operator=(const cusolver_context&) = delete;

    cusolver_context()
    {
        // Preallocate slots for 16 devices; get_handle() grows this on demand.
        handles.resize(16);
    }
    ~cusolver_context()
    {
        // Destroy only handles that were actually created.
        for (auto h : handles)
        {
            if (h)
                cusolverDnDestroy(h);
        }
    }

    // Returns the cuSolver handle for the currently selected CUDA device,
    // creating it on first use.
    cusolverDnHandle_t get_handle (
    )
    {
        int new_device_id;
        CHECK_CUDA(cudaGetDevice(&new_device_id));
        // make room for more devices if needed
        if (new_device_id >= (long)handles.size())
            handles.resize(new_device_id+16);

        // If we don't have a handle already for this device then make one
        if (!handles[new_device_id])
            CHECK_CUSOLVER(cusolverDnCreate(&handles[new_device_id]));

        // Finally, return the handle for the current device
        return handles[new_device_id];
    }

private:
    // handles[i] is the handle for CUDA device i, or null if not yet created.
    std::vector<cusolverDnHandle_t> handles;
};
// Returns a per-thread cuSolver handle for the currently selected device.
static cusolverDnHandle_t context()
{
    thread_local cusolver_context ctx;
    return ctx.get_handle();
}
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
// Kernel writing an nr x nr identity matrix into m (row-major layout).
// A linear index that is a multiple of nr+1 lies on the diagonal.
__global__ void _cuda_set_to_identity_matrix(float* m, size_t nr)
{
    for (auto j : grid_stride_range(0, nr*nr))
    {
        if (j%(nr+1) == 0)
            m[j] = 1;
        else
            m[j] = 0;
    }
}
void set_to_identity_matrix (
    tensor& m
)
/*!
    requires
        - m.size() == m.num_samples()*m.num_samples()
    ensures
        - m's contents become the identity matrix.
!*/
{
    DLIB_CASSERT(m.size() == m.num_samples()*m.num_samples());
    launch_kernel(_cuda_set_to_identity_matrix, max_jobs(m.size()), m.device(), m.num_samples());
}
// ------------------------------------------------------------------------------------
inv::~inv()
{
    // Wait for any outstanding kernels that reference this object's GPU
    // buffers before the members are destroyed.
    sync_if_needed();
}
// ------------------------------------------------------------------------------------
// Computes out = inv(mat(m_)) on the GPU by LU factorizing a copy of the
// input (cusolverDnSgetrf) and then solving A*X = I (cusolverDnSgetrs).
void inv::
operator() (
    const tensor& m_,
    resizable_tensor& out
)
{
    DLIB_CASSERT(m_.size() == m_.num_samples()*m_.num_samples(), "Input matrix must be square if you want to invert it.");

    // The factorization overwrites its input, so work on a copy.
    m = m_;

    // Start out as the identity; getrs solves in place, turning it into the inverse.
    out.copy_size(m);
    set_to_identity_matrix(out);

    const int nc = m.num_samples();
    int Lwork;
    CHECK_CUSOLVER(cusolverDnSgetrf_bufferSize(context(), nc , nc, m.device(), nc, &Lwork));
    // Grow the scratch buffers only when needed so that repeated inversions
    // of same-sized matrices reuse the existing GPU allocations.  We sync
    // before freeing a buffer a previous launch might still be using.
    if (Lwork > (int)workspace.size())
    {
        sync_if_needed();
        workspace = cuda_data_ptr<float>(Lwork);
    }
    if (nc > (int)Ipiv.size())
    {
        sync_if_needed();
        Ipiv = cuda_data_ptr<int>(nc);
    }
    if (info.size() != 1)
    {
        info = cuda_data_ptr<int>(1);
    }

    CHECK_CUSOLVER(cusolverDnSgetrf(context(), nc, nc, m.device(), nc, workspace, Ipiv, info));
    CHECK_CUSOLVER(cusolverDnSgetrs(context(), CUBLAS_OP_N, nc, nc, m.device(), nc, Ipiv, out.device(), nc, info));

    // Remember that GPU work is in flight so later buffer frees synchronize first.
    did_work_lately = true;
}
// ------------------------------------------------------------------------------------
// Pulls the cuSolver info flag back from the device.  Returns 0 on success
// (or if no inversion has populated the flag yet), non-zero otherwise.
int inv::
get_last_status(
)
{
    std::vector<int> linfo;
    memcpy(linfo, info);
    return linfo.empty() ? 0 : linfo[0];
}
// ------------------------------------------------------------------------------------
void inv::
sync_if_needed()
{
    // Nothing to do unless GPU work was queued since the last sync.
    if (!did_work_lately)
        return;
    did_work_lately = false;
    // make sure we wait until any previous kernel launches have finished
    // before we do something like deallocate the GPU memory.
    cudaDeviceSynchronize();
}
// ------------------------------------------------------------------------------------
}
}
#endif // DLIB_USE_CUDA
#endif // DLIB_DNN_CuSOLVER_CU_
// Copyright (C) 2017 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuSOLVER_H_
#define DLIB_DNN_CuSOLVER_H_
#ifdef DLIB_USE_CUDA
#include "tensor.h"
#include "cuda_errors.h"
#include "cuda_data_ptr.h"
#include "../noncopyable.h"
namespace dlib
{
namespace cuda
{
// -----------------------------------------------------------------------------------
class inv : noncopyable
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            This is a functor for doing matrix inversion on the GPU.  The only
            reason it's an object is to avoid the reallocation of some GPU memory
            blocks if you want to do a bunch of matrix inversions in a row.
    !*/
public:

    inv() = default;
    ~inv();

    void operator() (
        const tensor& m,
        resizable_tensor& out
    );
    /*!
        requires
            - m.size() == m.num_samples()*m.num_samples()
              (i.e. mat(m) must be a square matrix)
        ensures
            - out == inv(mat(m));
    !*/

    int get_last_status(
    );
    /*!
        ensures
            - returns 0 if the last matrix inversion was successful and != 0
              otherwise.
    !*/

private:

    void sync_if_needed();

    // true when GPU work has been queued since the last synchronization.
    bool did_work_lately = false;
    // scratch copy of the input matrix (the factorization overwrites it).
    resizable_tensor m;
    // cuSolver scratch space, pivot indices, and status flag, kept between
    // calls so repeated inversions can reuse the GPU allocations.
    cuda_data_ptr<float> workspace;
    cuda_data_ptr<int> Ipiv;
    cuda_data_ptr<int> info;
};
// ------------------------------------------------------------------------------------
}
}
#endif // DLIB_USE_CUDA
#endif // DLIB_DNN_CuSOLVER_H_
......@@ -353,6 +353,24 @@ namespace dlib { namespace tt
#endif
}
// Dispatches the rectangle-restricted affine_transform to the CUDA
// implementation when dlib is built with CUDA, otherwise to the CPU version.
void affine_transform(
    const rectangle& rect,
    tensor& dest,
    const tensor& src1,
    const tensor& src2,
    const tensor& src3,
    float A,
    float B,
    float C
)
{
#ifdef DLIB_USE_CUDA
    cuda::affine_transform(rect, dest,src1,src2,src3,A,B,C);
#else
    cpu::affine_transform(rect, dest,src1,src2,src3,A,B,C);
#endif
}
void affine_transform(
tensor& dest,
const tensor& src1,
......@@ -789,6 +807,21 @@ namespace dlib { namespace tt
#endif
}
// ----------------------------------------------------------------------------------------
// With CUDA, delegates to the cuda::inv functor so its GPU scratch buffers
// are reused across calls; otherwise inverts on the CPU with dlib::inv().
void inv::
operator() (
    const tensor& m,
    resizable_tensor& out
)
{
#ifdef DLIB_USE_CUDA
    finv(m,out);
#else
    out = dlib::inv(mat(m));
#endif
}
// ----------------------------------------------------------------------------------------
}}
......
......@@ -6,11 +6,13 @@
#include "tensor.h"
#include "cudnn_dlibapi.h"
#include "cublas_dlibapi.h"
#include "cusolver_dlibapi.h"
#include "curand_dlibapi.h"
#include "cpu_dlib.h"
#include "cuda_dlib.h"
#include "../rand.h"
#include <memory>
#include "../geometry/rectangle.h"
namespace dlib
{
......@@ -122,6 +124,36 @@ namespace dlib { namespace tt
- performs: dest = alpha*L*R + beta*mat(dest)
!*/
// ----------------------------------------------------------------------------------------
class inv
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            This is a functor for doing matrix inversion on the GPU.  The only
            reason it's an object is to avoid the reallocation of some GPU memory
            blocks if you want to do a bunch of matrix inversions in a row.
    !*/
public:

    void operator() (
        const tensor& m,
        resizable_tensor& out
    );
    /*!
        requires
            - m.size() == m.num_samples()*m.num_samples()
              (i.e. mat(m) must be a square matrix)
        ensures
            - out == inv(mat(m));
    !*/

private:
#ifdef DLIB_USE_CUDA
    // CUDA-backed implementation; holds reusable GPU scratch buffers.
    cuda::inv finv;
#endif
};
// ----------------------------------------------------------------------------------------
class tensor_rand
......@@ -356,6 +388,34 @@ namespace dlib { namespace tt
- #dest.host()[i] == A*src1.host()[i] + B*src2.host()[i] + C*src3.host()[i]
!*/
void affine_transform(
const rectangle& rect,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
float A,
float B,
float C
);
/*!
requires
- dest.size()==src1.size()
- dest.size()==src2.size()
- dest.size()==src3.size()
- dest.num_samples()==src1.num_samples()
- dest.num_samples()==src2.num_samples()
- dest.num_samples()==src3.num_samples()
- get_rect(mat(dest)).contains(rect) == true
(i.e. rect must be entirely contained within dest)
ensures
- This function operates much like
affine_transform(dest,src1,src2,src3,A,B,C,0), except that it runs over only
the sub-rectangle indicated by rect. In particular, this function is equivalent
to:
set_subm(dest,rect) = A*subm(mat(src1),rect) + B*subm(mat(src2),rect) + C*subm(mat(src3),rect)
!*/
// ----------------------------------------------------------------------------------------
void affine_transform(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment