Commit 22164e5d authored by Joachim's avatar Joachim
Browse files

Merge remote-tracking branch 'upstream/master'

parents 462d61ef 664ef398
......@@ -21,7 +21,9 @@ mkdir build; cd build; cmake .. -DUSE_AVX_INSTRUCTIONS=1; cmake --build .
Doing so will make some things run faster.
## Compiling your own C++ programs that use dlib
The examples folder has a [CMake tutorial](https://github.com/davisking/dlib/blob/master/examples/CMakeLists.txt) that tells you what to do. There are also additional instructions on the [dlib web site](http://dlib.net/compile.html).
## Compiling dlib Python API
......
......@@ -27,29 +27,27 @@ endif()
# is used.
cmake_policy(SET CMP0023 OLD)
include(cmake_utils/add_global_compiler_switch.cmake)
# Append -D<option_name> to the global list of active preprocessor switches.
# This list is later applied to dlib's targets via target_compile_options()
# and also appended to CUDA_NVCC_FLAGS.
macro (enable_preprocessor_switch option_name)
    list(APPEND active_preprocessor_switches "-D${option_name}")
endmacro()
if (DLIB_IN_PROJECT_BUILD)
# Make sure ENABLE_ASSERTS is defined for debug builds, but only for uses
# who are building an application. If they are just building dlib as a
# stand alone library then don't set this because it will conflict with the
# settings in config.h if we did.
if (NOT CMAKE_CXX_FLAGS_DEBUG MATCHES "-DENABLE_ASSERTS")
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DENABLE_ASSERTS"
CACHE STRING "Flags used by the compiler during C++ debug builds."
FORCE)
macro (disable_preprocessor_switch option_name)
if (active_preprocessor_switches)
list(REMOVE_ITEM active_preprocessor_switches "-D${option_name}")
endif()
endif()
endmacro()
macro (toggle_preprocessor_switch option_name)
if (${option_name})
add_global_define(${option_name})
enable_preprocessor_switch(${option_name})
else()
remove_global_define(${option_name})
disable_preprocessor_switch(${option_name})
endif()
endmacro()
# Suppress superfluous randlib warnings about libdlib.a having no symbols on MacOSX.
if (APPLE)
set(CMAKE_C_ARCHIVE_CREATE "<CMAKE_AR> Scr <TARGET> <LINK_FLAGS> <OBJECTS>")
......@@ -72,8 +70,6 @@ if (NOT TARGET dlib)
"Enable this if you don't want to compile any of the dlib GUI code" )
set (DLIB_ENABLE_STACK_TRACE_STR
"Enable this if you want to turn on the DLIB_STACK_TRACE macros" )
set (DLIB_ENABLE_ASSERTS_STR
"Enable this if you want to turn on the DLIB_ASSERT macro" )
set (DLIB_USE_BLAS_STR
"Disable this if you don't want to use a BLAS library" )
set (DLIB_USE_LAPACK_STR
......@@ -91,7 +87,11 @@ if (NOT TARGET dlib)
#set (DLIB_USE_FFTW_STR "Disable this if you don't want to link against fftw" )
# Fix typo in the user-visible option description ("is you" -> "if you").
set (DLIB_USE_MKL_FFT_STR
    "Disable this if you don't want to use the MKL DFTI FFT implementation" )
set (DLIB_ENABLE_ASSERTS_STR
"Enable this if you want to turn on the DLIB_ASSERT macro" )
option(DLIB_ENABLE_ASSERTS ${DLIB_ENABLE_ASSERTS_STR} OFF)
option(DLIB_ISO_CPP_ONLY ${DLIB_ISO_CPP_ONLY_STR} OFF)
toggle_preprocessor_switch(DLIB_ISO_CPP_ONLY)
option(DLIB_NO_GUI_SUPPORT ${DLIB_NO_GUI_SUPPORT_STR} OFF)
......@@ -99,27 +99,32 @@ if (NOT TARGET dlib)
option(DLIB_ENABLE_STACK_TRACE ${DLIB_ENABLE_STACK_TRACE_STR} OFF)
toggle_preprocessor_switch(DLIB_ENABLE_STACK_TRACE)
option(DLIB_ENABLE_ASSERTS ${DLIB_ENABLE_ASSERTS_STR} OFF)
if(DLIB_ENABLE_ASSERTS)
# Set these variables so they are set in the config.h.in file when dlib
# is installed.
set (DLIB_DISABLE_ASSERTS false)
set (ENABLE_ASSERTS true)
add_global_define(ENABLE_ASSERTS)
remove_global_define(DLIB_DISABLE_ASSERTS)
enable_preprocessor_switch(ENABLE_ASSERTS)
disable_preprocessor_switch(DLIB_DISABLE_ASSERTS)
else()
# Set these variables so they are set in the config.h.in file when dlib
# is installed.
set (DLIB_DISABLE_ASSERTS true)
set (ENABLE_ASSERTS false)
remove_global_define(ENABLE_ASSERTS)
# Never force the asserts off when doing an in project build. Instead,
# let the debug/release mode setting toggle asserts on or off (or the
# DLIB_ENABLE_ASSERTS option obviously). That is, even if the
# DLIB_ENABLE_ASSERTS option is off debug mode can still cause the
# asserts to turn on when using an in project build.
disable_preprocessor_switch(ENABLE_ASSERTS)
# Never force the asserts off when doing an in project build. The only
# time this matters is when using visual studio. The visual studio IDE
# has a drop down that lets the user select either release or debug
# builds. The DLIB_ASSERT macro is setup to enable/disable automatically
# based on this drop down (via preprocessor magic). However, if
# DLIB_DISABLE_ASSERTS is defined it permanently disables asserts no
# matter what, which would defeat the visual studio drop down. So here
# we make a point to not do that kind of severe disabling when in a
# project build. It should also be pointed out that DLIB_DISABLE_ASSERTS
# is only needed when building and installing dlib as a separately
# installed library. It doesn't matter when doing an in project build.
if (NOT DLIB_IN_PROJECT_BUILD)
add_global_define(DLIB_DISABLE_ASSERTS)
enable_preprocessor_switch(DLIB_DISABLE_ASSERTS)
endif()
endif()
......@@ -234,7 +239,7 @@ if (NOT TARGET dlib)
# If both X11 and anaconda are installed, it's possible for the
# anaconda path to appear before /opt/X11, so we remove anaconda.
foreach (ITR ${X11_INCLUDE_DIR})
if ("${ITR}" MATCHES "(.*)anaconda(.*)")
if ("${ITR}" MATCHES "(.*)(ana|mini)conda(.*)")
list (REMOVE_ITEM X11_INCLUDE_DIR ${ITR})
endif ()
endforeach(ITR)
......@@ -265,7 +270,7 @@ if (NOT TARGET dlib)
message(" *** You can download XQuartz from: http://xquartz.macosforge.org/landing/ ***")
message(" *****************************************************************************")
set(DLIB_NO_GUI_SUPPORT ON CACHE STRING ${DLIB_NO_GUI_SUPPORT_STR} FORCE )
add_global_define(DLIB_NO_GUI_SUPPORT)
enable_preprocessor_switch(DLIB_NO_GUI_SUPPORT)
endif()
endif()
......@@ -295,7 +300,7 @@ if (NOT TARGET dlib)
message(" *** On Ubuntu run: sudo apt-get install libx11-dev ***")
message(" *****************************************************************************")
set(DLIB_NO_GUI_SUPPORT ON CACHE STRING ${DLIB_NO_GUI_SUPPORT_STR} FORCE )
add_global_define(DLIB_NO_GUI_SUPPORT)
enable_preprocessor_switch(DLIB_NO_GUI_SUPPORT)
endif()
endif()
......@@ -524,6 +529,7 @@ if (NOT TARGET dlib)
# magic in the standard C++ header files (since nvcc uses gcc headers on
# linux).
list(APPEND CUDA_NVCC_FLAGS "-arch=sm_30;-D__STRICT_ANSI__;-D_MWAITXINTRIN_H_INCLUDED;-D_FORCE_INLINES;${FLAGS_FOR_NVCC}")
list(APPEND CUDA_NVCC_FLAGS ${active_preprocessor_switches})
if (NOT MSVC)
list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
endif()
......@@ -568,21 +574,44 @@ if (NOT TARGET dlib)
endif()
endif()
endif()
endif()
# Find where cuSOLVER is since the FindCUDA cmake package doesn't
# bother to look for it.
get_filename_component(cuda_blas_path "${CUDA_CUBLAS_LIBRARIES}" DIRECTORY)
find_library(cusolver cusolver HINTS ${cuda_blas_path})
mark_as_advanced(cusolver)
# Also find OpenMP since cuSOLVER needs it. Importantly, we only
# look for one to link to if our use of BLAS, specifically the
# Intel MKL, hasn't already decided what to use. This is because
# it makes the MKL bug out if you link to another openmp lib other
# than Intel's when you use the MKL.
if (NOT openmp_libarires AND NOT MSVC)
find_package(OpenMP)
if (OPENMP_FOUND)
set(openmp_libarires ${OpenMP_CXX_FLAGS})
else()
message(STATUS "*** Didn't find OpenMP, which is required to use CUDA. ***")
set(CUDA_FOUND 0)
endif()
endif()
endif()
if (CUDA_FOUND AND cudnn AND cudnn_include AND COMPILER_CAN_DO_CPP_11 AND cuda_test_compile_worked AND cudnn_test_compile_worked)
set(source_files ${source_files}
dnn/cuda_dlib.cu
dnn/cudnn_dlibapi.cpp
dnn/cublas_dlibapi.cpp
dnn/cusolver_dlibapi.cu
dnn/curand_dlibapi.cpp
dnn/cuda_data_ptr.cpp
dnn/gpu_data.cpp
)
set(dlib_needed_libraries ${dlib_needed_libraries}
${CUDA_CUBLAS_LIBRARIES}
${cudnn}
${CUDA_curand_LIBRARY}
${cusolver}
${openmp_libarires}
)
include_directories(${cudnn_include})
else()
......@@ -647,6 +676,11 @@ if (NOT TARGET dlib)
PUBLIC ${dlib_needed_includes}
)
target_link_libraries(dlib PRIVATE ${dlib_needed_libraries})
if (DLIB_IN_PROJECT_BUILD)
target_compile_options(dlib PUBLIC ${active_preprocessor_switches})
else()
target_compile_options(dlib PRIVATE ${active_preprocessor_switches})
endif()
if (UNIX AND NOT DLIB_IN_PROJECT_BUILD)
if (DLIB_USE_CUDA)
cuda_add_library(dlib_shared SHARED ${source_files} )
......@@ -661,6 +695,7 @@ if (NOT TARGET dlib)
PUBLIC ${dlib_needed_includes}
)
target_link_libraries(dlib_shared PRIVATE ${dlib_needed_libraries})
target_compile_options(dlib_shared PRIVATE ${active_preprocessor_switches})
endif()
endif () ##### end of if NOT DLIB_ISO_CPP_ONLY ##########################################################
......@@ -669,6 +704,7 @@ if (NOT TARGET dlib)
if (DLIB_TEST_COMPILE_ALL_SOURCE_CPP)
ADD_LIBRARY(dlib_all_source_cpp STATIC all/source.cpp)
target_link_libraries(dlib_all_source_cpp dlib)
target_compile_options(dlib_all_source_cpp PUBLIC ${active_preprocessor_switches})
endif()
# Install the library
......
......@@ -68,7 +68,7 @@ if (PYTHON3)
if (NOT Boost_FOUND)
FIND_PACKAGE(Boost 1.41.0 COMPONENTS python)
endif()
set(Python_ADDITIONAL_VERSIONS 3.5)
set(Python_ADDITIONAL_VERSIONS 3.5 3.6)
FIND_PACKAGE(PythonLibs 3.4 REQUIRED)
else()
FIND_PACKAGE(Boost 1.41.0 COMPONENTS python)
......
......@@ -16,6 +16,8 @@
# lapack_libraries - link against these to use LAPACK library
# mkl_libraries - link against these to use the MKL library
# mkl_include_dir - add to the include path to use the MKL library
# openmp_libarires - Set to Intel's OpenMP library if and only if we
# find the MKL.
# setting this makes CMake allow normal looking if else statements
SET(CMAKE_ALLOW_LOOSE_LOOP_CONSTRUCTS true)
......@@ -91,11 +93,13 @@ if (UNIX OR MINGW)
/opt/intel/include
)
find_path(mkl_include_dir mkl_version.h ${mkl_include_search_path})
mark_as_advanced(mkl_include_dir)
# Search for the needed libraries from the MKL. We will try to link against the mkl_rt
# file first since this way avoids linking bugs in some cases.
find_library(mkl_rt mkl_rt ${mkl_search_path})
mark_as_advanced( mkl_rt )
find_library(openmp_libarires iomp5 ${mkl_search_path})
mark_as_advanced( mkl_rt openmp_libarires )
# if we found the MKL
if ( mkl_rt)
set(mkl_libraries ${mkl_rt} )
......
......@@ -80,6 +80,12 @@ namespace dlib
fout << " ignore='" << b.ignore << "'";
if (b.angle != 0)
fout << " angle='" << b.angle << "'";
if (b.age != 0)
fout << " age='" << b.age << "'";
if (b.gender == FEMALE)
fout << " gender='female'";
else if (b.gender == MALE)
fout << " gender='male'";
if (b.pose != 0)
fout << " pose='" << b.pose << "'";
if (b.detection_score != 0)
......@@ -196,6 +202,18 @@ namespace dlib
if (atts.is_in_list("occluded")) temp_box.occluded = sa = atts["occluded"];
if (atts.is_in_list("ignore")) temp_box.ignore = sa = atts["ignore"];
if (atts.is_in_list("angle")) temp_box.angle = sa = atts["angle"];
if (atts.is_in_list("age")) temp_box.age = sa = atts["age"];
if (atts.is_in_list("gender"))
{
if (atts["gender"] == "male")
temp_box.gender = MALE;
else if (atts["gender"] == "female")
temp_box.gender = FEMALE;
else if (atts["gender"] == "unknown")
temp_box.gender = UNKNOWN;
else
throw dlib::error("Invalid gender string in box attribute.");
}
if (atts.is_in_list("pose")) temp_box.pose = sa = atts["pose"];
if (atts.is_in_list("detection_score")) temp_box.detection_score = sa = atts["detection_score"];
......
......@@ -14,6 +14,15 @@ namespace dlib
namespace image_dataset_metadata
{
// ------------------------------------------------------------------------------------
// Gender annotation for a labeled box in an image dataset.  UNKNOWN is the
// default used when the XML file has no gender attribute for a box.
enum gender_t
{
    UNKNOWN,
    MALE,
    FEMALE
};
// ------------------------------------------------------------------------------------
struct box
......@@ -36,7 +45,9 @@ namespace dlib
ignore(false),
pose(0),
detection_score(0),
angle(0)
angle(0),
gender(UNKNOWN),
age(0)
{}
box (
......@@ -49,7 +60,9 @@ namespace dlib
ignore(false),
pose(0),
detection_score(0),
angle(0)
angle(0),
gender(UNKNOWN),
age(0)
{}
rectangle rect;
......@@ -72,6 +85,9 @@ namespace dlib
// image counter-clockwise by angle radians.
double angle;
gender_t gender;
double age;
bool has_label() const { return label.size() != 0; }
/*!
ensures
......
......@@ -3396,7 +3396,7 @@ namespace dlib
visitor&& v
)
{
vl_loop<i+1, num>::visit(net,v);
vl_loop_backwards<i+1, num>::visit(net,v);
v(i, layer<i>(net));
}
};
......
......@@ -486,6 +486,46 @@ namespace dlib
}
}
// ----------------------------------------------------------------------------------------
void affine_transform(
    const rectangle& rect,
    tensor& dest,
    const tensor& src1,
    const tensor& src2,
    const tensor& src3,
    float A,
    float B,
    float C
)
{
    /*!
        Viewing each tensor as a num_samples x (size()/num_samples()) matrix,
        computes, for every element inside rect only:
            dest = A*src1 + B*src2 + C*src3
        Elements outside rect are left untouched.
    !*/
    DLIB_CASSERT(dest.size() == src1.size());
    DLIB_CASSERT(dest.size() == src2.size());
    DLIB_CASSERT(dest.size() == src3.size());
    DLIB_CASSERT(dest.num_samples() == src1.num_samples());
    DLIB_CASSERT(dest.num_samples() == src2.num_samples());
    DLIB_CASSERT(dest.num_samples() == src3.num_samples());
    DLIB_CASSERT(rectangle(0,0, dest.size()/dest.num_samples()-1, dest.num_samples()-1).contains(rect));

    float* out = dest.host();
    const float* in1 = src1.host();
    const float* in2 = src2.host();
    const float* in3 = src3.host();
    // Row pitch of the matrix view of the tensors.
    const long row_pitch = dest.size()/dest.num_samples();
    for (long row = rect.top(); row <= rect.bottom(); ++row)
    {
        const long row_begin = row*row_pitch;
        for (long col = rect.left(); col <= rect.right(); ++col)
        {
            const long i = row_begin + col;
            out[i] = A*in1[i] + B*in2[i] + C*in3[i];
        }
    }
}
// -----------------------------------------------------------------------------------
void compute_adam_update (
......
......@@ -7,6 +7,7 @@
// and cudnn_dlibapi.h
#include "tensor.h"
#include "../geometry/rectangle.h"
namespace dlib
{
......@@ -111,6 +112,19 @@ namespace dlib
const tensor& B
);
// -----------------------------------------------------------------------------------
void affine_transform(
const rectangle& rect,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
float A,
float B,
float C
);
// -----------------------------------------------------------------------------------
void compute_adam_update (
......
// Copyright (C) 2017 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuDA_DATA_PTR_CPP_
#define DLIB_DNN_CuDA_DATA_PTR_CPP_
#ifdef DLIB_USE_CUDA
#include "cuda_data_ptr.h"
#include "cuda_utils.h"
namespace dlib
{
namespace cuda
{
// -----------------------------------------------------------------------------------
// Allocates an n byte buffer on the current CUDA device.  A zero-sized
// request performs no allocation and leaves pdata empty.
cuda_data_void_ptr::
cuda_data_void_ptr(
    size_t n
) : num(n)
{
    if (n == 0)
        return;

    void* data = nullptr;
    CHECK_CUDA(cudaMalloc(&data, n));
    // Hand ownership to the shared_ptr; its deleter frees the device memory.
    // A cudaFree() failure is only logged (not thrown) since this runs from
    // the destructor path.
    pdata.reset(data, [](void* ptr){
        auto err = cudaFree(ptr);
        if(err!=cudaSuccess)
            std::cerr << "cudaFree() failed. Reason: " << cudaGetErrorString(err) << std::endl;
    });
}
// ------------------------------------------------------------------------------------
// Copies the entire device buffer src into host memory at dest.
// A zero-sized buffer is a no-op.
void memcpy(
    void* dest,
    const cuda_data_void_ptr& src
)
{
    if (src.size() == 0)
        return;
    CHECK_CUDA(cudaMemcpy(dest, src.data(), src.size(), cudaMemcpyDefault));
}
// ------------------------------------------------------------------------------------
// Copies dest.size() bytes from host memory at src into the device buffer dest.
// A zero-sized buffer is a no-op.
void memcpy(
    cuda_data_void_ptr& dest,
    const void* src
)
{
    if (dest.size() == 0)
        return;
    CHECK_CUDA(cudaMemcpy(dest.data(), src, dest.size(), cudaMemcpyDefault));
}
// ------------------------------------------------------------------------------------
}
}
#endif // DLIB_USE_CUDA
#endif // DLIB_DNN_CuDA_DATA_PTR_CPP_
// Copyright (C) 2017 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuDA_DATA_PTR_H_
#define DLIB_DNN_CuDA_DATA_PTR_H_
#ifdef DLIB_USE_CUDA
#include <memory>
#include <vector>
namespace dlib
{
namespace cuda
{
// ------------------------------------------------------------------------------------
class cuda_data_void_ptr
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            This is a block of memory on a CUDA device.
    !*/
public:

    cuda_data_void_ptr() = default;

    cuda_data_void_ptr(size_t n);
    /*!
        ensures
            - This object will allocate a device memory buffer of n bytes.
            - #size() == n
    !*/

    void* data() { return pdata.get(); }
    const void* data() const { return pdata.get(); }

    operator void*() { return pdata.get(); }
    operator const void*() const { return pdata.get(); }

    // Releases this object's reference to the device buffer.
    void reset() { pdata.reset(); }

    size_t size() const { return num; }
    /*!
        ensures
            - returns the length of this buffer, in bytes.
    !*/

private:
    // number of bytes in the buffer referenced by pdata.
    size_t num = 0;
    // shared ownership of the device allocation; the deleter calls cudaFree().
    std::shared_ptr<void> pdata;
};
// ------------------------------------------------------------------------------------
void memcpy(
    void* dest,
    const cuda_data_void_ptr& src
);
/*!
    requires
        - dest == a pointer to at least src.size() bytes on the host machine.
    ensures
        - copies the GPU data from src into dest.
!*/

// ------------------------------------------------------------------------------------

void memcpy(
    cuda_data_void_ptr& dest,
    const void* src
);
/*!
    requires
        - src == a pointer to at least dest.size() bytes on the host machine.
    ensures
        - copies the host data from src to the GPU memory buffer dest.
!*/
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
template <typename T>
class cuda_data_ptr
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            This is a block of memory on a CUDA device.  It is just a type safe
            version of cuda_data_void_ptr.
    !*/
public:

    static_assert(std::is_standard_layout<T>::value, "You can only create basic standard layout types on the GPU");

    cuda_data_ptr() = default;
    cuda_data_ptr(size_t n) : num(n)
    /*!
        ensures
            - This object will allocate a device memory buffer of n T objects.
            - #size() == n
    !*/
    {
        if (n == 0)
            return;

        pdata = cuda_data_void_ptr(n*sizeof(T));
    }

    T* data() { return (T*)pdata.data(); }
    const T* data() const { return (T*)pdata.data(); }

    operator T*() { return (T*)pdata.data(); }
    operator const T*() const { return (T*)pdata.data(); }

    void reset() { pdata.reset(); }

    // Returns the number of T objects in the buffer (not bytes).
    size_t size() const { return num; }

    // Copies the device buffer src into the host vector dest, resizing dest
    // to src.size() first.
    friend void memcpy(
        std::vector<T>& dest,
        const cuda_data_ptr& src
    )
    {
        dest.resize(src.size());
        if (src.size() != 0)
            memcpy(dest.data(), src.pdata);
    }

    // Copies the host vector src into the device buffer dest, reallocating
    // dest if its size doesn't match.  NOTE: previously the parameter names
    // were swapped and the code tried to assign to the const host vector,
    // which could never compile once this template was instantiated.
    friend void memcpy(
        cuda_data_ptr& dest,
        const std::vector<T>& src
    )
    {
        if (dest.size() != src.size())
            dest = cuda_data_ptr<T>(src.size());

        if (src.size() != 0)
            memcpy(dest.pdata, src.data());
    }

private:

    size_t num = 0;
    cuda_data_void_ptr pdata;
};
// ------------------------------------------------------------------------------------
}
}
#endif // DLIB_USE_CUDA
#endif // DLIB_DNN_CuDA_DATA_PTR_H_
......@@ -575,6 +575,57 @@ namespace dlib
launch_kernel(_cuda_affine_transform1_0,max_jobs(dest.size()),dest.device(), src.device(), src.size(), A);
}
// ----------------------------------------------------------------------------------------
// Kernel computing d[idx] = A*s1[idx] + B*s2[idx] + C*s3[idx] over a
// rectangular sub-region of a row-major matrix.
//   n         - number of elements inside the rectangle.
//   rect_nc   - width of the rectangle.
//   total_nc  - full row pitch of the underlying matrix.
//   start_idx - linear offset of the rectangle's top-left corner.
__global__ void _cuda_affine_transform_rect(
    float* d,
    const float* s1,
    const float* s2,
    const float* s3,
    float A,
    float B,
    float C,
    size_t start_idx,
    size_t n,
    size_t rect_nc,
    size_t total_nc
)
{
    for (auto i : grid_stride_range(0, n))
    {
        // Convert the flat in-rectangle index into (row, column)...
        size_t r = i/rect_nc;
        size_t c = i%rect_nc;
        // ...then into a linear index in the full matrix.
        size_t idx = r*total_nc + c + start_idx;
        d[idx] = A*s1[idx] + B*s2[idx] + C*s3[idx];
    }
}
// Computes dest = A*src1 + B*src2 + C*src3 over only the elements inside
// rect, viewing each tensor as a num_samples x (size()/num_samples())
// row-major matrix.  Elements outside rect are left untouched.
void affine_transform(
    const rectangle& rect,
    tensor& dest,
    const tensor& src1,
    const tensor& src2,
    const tensor& src3,
    float A,
    float B,
    float C
)
{
    DLIB_CASSERT(dest.size() == src1.size());
    DLIB_CASSERT(dest.size() == src2.size());
    DLIB_CASSERT(dest.size() == src3.size());
    DLIB_CASSERT(dest.num_samples() == src1.num_samples());
    DLIB_CASSERT(dest.num_samples() == src2.num_samples());
    DLIB_CASSERT(dest.num_samples() == src3.num_samples());
    DLIB_CASSERT(rectangle(0,0, dest.size()/dest.num_samples()-1, dest.num_samples()-1).contains(rect));
    // Kernel arguments: linear offset of rect's top-left corner, number of
    // elements to process, rect width, and the full row pitch so the kernel
    // can map a flat rect index back to a matrix position.
    launch_kernel(_cuda_affine_transform_rect,max_jobs(rect.area()),
        dest.device(), src1.device(), src2.device(), src3.device(), A, B, C,
        rect.left() + rect.top()*(dest.size()/dest.num_samples()),
        rect.area(),
        rect.width(),
        dest.size()/dest.num_samples());
}
// ----------------------------------------------------------------------------------------
__global__ void _cuda_affine_transform4(float* d, const float* s1, const float* s2, size_t n, float A, float B, float C)
......
......@@ -5,6 +5,7 @@
#include "tensor.h"
#include "../geometry/rectangle.h"
namespace dlib
{
......@@ -230,6 +231,17 @@ namespace dlib
const float C
);
void affine_transform(
const rectangle& rect,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
float A,
float B,
float C
);
// Note that this function isn't in the tt:: namespace because add_scaled() is
// called by cuda::add() so we don't need a tt:: version of add_scaled().
void add_scaled(
......
......@@ -52,6 +52,17 @@ namespace dlib
cublas_error(const std::string& message): cuda_error(message) {}
};
struct cusolver_error : public cuda_error
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            This is the exception thrown if any call to the NVIDIA cuSolver
            library returns an error.
    !*/
    cusolver_error(const std::string& message): cuda_error(message) {}
};
}
......
......@@ -11,6 +11,10 @@
#include <cuda_runtime.h>
#include <sstream>
#include <iostream>
#include <memory>
#include <vector>
#include <type_traits>
// Check the return value of a call to the CUDA runtime for an error condition.
......
......@@ -820,6 +820,16 @@ namespace dlib
filters.nc()));
CHECK_CUDNN(cudnnCreateConvolutionDescriptor((cudnnConvolutionDescriptor_t*)&conv_handle));
#if CUDNN_MAJOR >= 6
CHECK_CUDNN(cudnnSetConvolution2dDescriptor((cudnnConvolutionDescriptor_t)conv_handle,
padding_y, // vertical padding
padding_x, // horizontal padding
stride_y,
stride_x,
1, 1, // must be 1,1
CUDNN_CROSS_CORRELATION,
CUDNN_DATA_FLOAT)); // could also be CUDNN_CONVOLUTION
#else
CHECK_CUDNN(cudnnSetConvolution2dDescriptor((cudnnConvolutionDescriptor_t)conv_handle,
padding_y, // vertical padding
padding_x, // horizontal padding
......@@ -827,6 +837,7 @@ namespace dlib
stride_x,
1, 1, // must be 1,1
CUDNN_CROSS_CORRELATION)); // could also be CUDNN_CONVOLUTION
#endif
CHECK_CUDNN(cudnnGetConvolution2dForwardOutputDim(
(const cudnnConvolutionDescriptor_t)conv_handle,
......
// Copyright (C) 2017 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuSOLVER_CU_
#define DLIB_DNN_CuSOLVER_CU_
#ifdef DLIB_USE_CUDA
#include "cusolver_dlibapi.h"
#include <cublas_v2.h>
#include <cusolverDn.h>
#include "cuda_utils.h"
// ----------------------------------------------------------------------------------------
// Translates a cuSolver status code into a human readable message.
static const char* cusolver_get_error_string(cusolverStatus_t s)
{
    if (s == CUSOLVER_STATUS_NOT_INITIALIZED)
        return "CUDA Runtime API initialization failed.";
    if (s == CUSOLVER_STATUS_ALLOC_FAILED)
        return "CUDA Resources could not be allocated.";
    return "A call to cuSolver failed";
}
// Check the return value of a call to the cuSolver runtime for an error condition.
// Evaluates `call` exactly once; on any status other than
// CUSOLVER_STATUS_SUCCESS it throws dlib::cusolver_error carrying the call
// text, file, line, numeric code, and a human readable reason.
#define CHECK_CUSOLVER(call) \
do{ \
    const cusolverStatus_t error = call; \
    if (error != CUSOLVER_STATUS_SUCCESS) \
    { \
        std::ostringstream sout; \
        sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\
        sout << "code: " << error << ", reason: " << cusolver_get_error_string(error);\
        throw dlib::cusolver_error(sout.str()); \
    } \
}while(false)
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
namespace dlib
{
namespace cuda
{
// -----------------------------------------------------------------------------------
class cusolver_context
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            Lazily creates and caches one cusolverDnHandle_t per CUDA device,
            handing out the handle matching the currently selected device.
    !*/
public:
    // not copyable
    cusolver_context(const cusolver_context&) = delete;
    cusolver_context& operator=(const cusolver_context&) = delete;

    cusolver_context()
    {
        // Preallocate slots for 16 devices; get_handle() grows this on demand.
        handles.resize(16);
    }
    ~cusolver_context()
    {
        // Destroy only handles that were actually created.
        for (auto h : handles)
        {
            if (h)
                cusolverDnDestroy(h);
        }
    }

    // Returns the cuSolver handle for the currently selected CUDA device,
    // creating it on first use.
    cusolverDnHandle_t get_handle (
    )
    {
        int new_device_id;
        CHECK_CUDA(cudaGetDevice(&new_device_id));
        // make room for more devices if needed
        if (new_device_id >= (long)handles.size())
            handles.resize(new_device_id+16);

        // If we don't have a handle already for this device then make one
        if (!handles[new_device_id])
            CHECK_CUSOLVER(cusolverDnCreate(&handles[new_device_id]));

        // Finally, return the handle for the current device
        return handles[new_device_id];
    }

private:
    // handles[i] is the handle for CUDA device i, or null if not yet created.
    std::vector<cusolverDnHandle_t> handles;
};
// Returns a per-thread cuSolver handle for the currently selected device.
static cusolverDnHandle_t context()
{
    thread_local cusolver_context ctx;
    return ctx.get_handle();
}
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
// Kernel writing an nr x nr identity matrix into m (row-major layout).
// A linear index that is a multiple of nr+1 lies on the diagonal.
__global__ void _cuda_set_to_identity_matrix(float* m, size_t nr)
{
    for (auto j : grid_stride_range(0, nr*nr))
    {
        if (j%(nr+1) == 0)
            m[j] = 1;
        else
            m[j] = 0;
    }
}
void set_to_identity_matrix (
    tensor& m
)
/*!
    requires
        - m.size() == m.num_samples()*m.num_samples()
    ensures
        - m's contents become the identity matrix.
!*/
{
    DLIB_CASSERT(m.size() == m.num_samples()*m.num_samples());
    launch_kernel(_cuda_set_to_identity_matrix, max_jobs(m.size()), m.device(), m.num_samples());
}
// ------------------------------------------------------------------------------------
inv::~inv()
{
    // Wait for any outstanding kernels that reference this object's GPU
    // buffers before the members are destroyed.
    sync_if_needed();
}
// ------------------------------------------------------------------------------------
// Computes out = inv(mat(m_)) on the GPU by LU factorizing a copy of the
// input (cusolverDnSgetrf) and then solving A*X = I (cusolverDnSgetrs).
void inv::
operator() (
    const tensor& m_,
    resizable_tensor& out
)
{
    DLIB_CASSERT(m_.size() == m_.num_samples()*m_.num_samples(), "Input matrix must be square if you want to invert it.");

    // The factorization overwrites its input, so work on a copy.
    m = m_;

    // Start out as the identity; getrs solves in place, turning it into the inverse.
    out.copy_size(m);
    set_to_identity_matrix(out);

    const int nc = m.num_samples();
    int Lwork;
    CHECK_CUSOLVER(cusolverDnSgetrf_bufferSize(context(), nc , nc, m.device(), nc, &Lwork));
    // Grow the scratch buffers only when needed so that repeated inversions
    // of same-sized matrices reuse the existing GPU allocations.  We sync
    // before freeing a buffer a previous launch might still be using.
    if (Lwork > (int)workspace.size())
    {
        sync_if_needed();
        workspace = cuda_data_ptr<float>(Lwork);
    }
    if (nc > (int)Ipiv.size())
    {
        sync_if_needed();
        Ipiv = cuda_data_ptr<int>(nc);
    }
    if (info.size() != 1)
    {
        info = cuda_data_ptr<int>(1);
    }

    CHECK_CUSOLVER(cusolverDnSgetrf(context(), nc, nc, m.device(), nc, workspace, Ipiv, info));
    CHECK_CUSOLVER(cusolverDnSgetrs(context(), CUBLAS_OP_N, nc, nc, m.device(), nc, Ipiv, out.device(), nc, info));

    // Remember that GPU work is in flight so later buffer frees synchronize first.
    did_work_lately = true;
}
// ------------------------------------------------------------------------------------
// Pulls the cuSolver info flag back from the device.  Returns 0 on success
// (or if no inversion has populated the flag yet), non-zero otherwise.
int inv::
get_last_status(
)
{
    std::vector<int> linfo;
    memcpy(linfo, info);
    return linfo.empty() ? 0 : linfo[0];
}
// ------------------------------------------------------------------------------------
void inv::
sync_if_needed()
{
    // Nothing to do unless GPU work was queued since the last sync.
    if (!did_work_lately)
        return;
    did_work_lately = false;
    // make sure we wait until any previous kernel launches have finished
    // before we do something like deallocate the GPU memory.
    cudaDeviceSynchronize();
}
// ------------------------------------------------------------------------------------
}
}
#endif // DLIB_USE_CUDA
#endif // DLIB_DNN_CuSOLVER_CU_
// Copyright (C) 2017 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuSOLVER_H_
#define DLIB_DNN_CuSOLVER_H_
#ifdef DLIB_USE_CUDA
#include "tensor.h"
#include "cuda_errors.h"
#include "cuda_data_ptr.h"
#include "../noncopyable.h"
namespace dlib
{
namespace cuda
{
// -----------------------------------------------------------------------------------
class inv : noncopyable
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            This is a functor for doing matrix inversion on the GPU.  The only
            reason it's an object is to avoid the reallocation of some GPU memory
            blocks if you want to do a bunch of matrix inversions in a row.
    !*/
public:

    inv() = default;
    ~inv();

    void operator() (
        const tensor& m,
        resizable_tensor& out
    );
    /*!
        requires
            - m.size() == m.num_samples()*m.num_samples()
              (i.e. mat(m) must be a square matrix)
        ensures
            - out == inv(mat(m));
    !*/

    int get_last_status(
    );
    /*!
        ensures
            - returns 0 if the last matrix inversion was successful and != 0
              otherwise.
    !*/

private:

    void sync_if_needed();

    // true when GPU work has been queued since the last synchronization.
    bool did_work_lately = false;
    // scratch copy of the input matrix (the factorization overwrites it).
    resizable_tensor m;
    // cuSolver scratch space, pivot indices, and status flag, kept between
    // calls so repeated inversions can reuse the GPU allocations.
    cuda_data_ptr<float> workspace;
    cuda_data_ptr<int> Ipiv;
    cuda_data_ptr<int> info;
};
// ------------------------------------------------------------------------------------
}
}
#endif // DLIB_USE_CUDA
#endif // DLIB_DNN_CuSOLVER_H_
......@@ -353,6 +353,24 @@ namespace dlib { namespace tt
#endif
}
// Dispatches the rectangle-restricted affine_transform to the CUDA
// implementation when dlib is built with CUDA, otherwise to the CPU version.
void affine_transform(
    const rectangle& rect,
    tensor& dest,
    const tensor& src1,
    const tensor& src2,
    const tensor& src3,
    float A,
    float B,
    float C
)
{
#ifdef DLIB_USE_CUDA
    cuda::affine_transform(rect, dest,src1,src2,src3,A,B,C);
#else
    cpu::affine_transform(rect, dest,src1,src2,src3,A,B,C);
#endif
}
void affine_transform(
tensor& dest,
const tensor& src1,
......@@ -789,6 +807,21 @@ namespace dlib { namespace tt
#endif
}
// ----------------------------------------------------------------------------------------
// With CUDA, delegates to the cuda::inv functor so its GPU scratch buffers
// are reused across calls; otherwise inverts on the CPU with dlib::inv().
void inv::
operator() (
    const tensor& m,
    resizable_tensor& out
)
{
#ifdef DLIB_USE_CUDA
    finv(m,out);
#else
    out = dlib::inv(mat(m));
#endif
}
// ----------------------------------------------------------------------------------------
}}
......
......@@ -6,11 +6,13 @@
#include "tensor.h"
#include "cudnn_dlibapi.h"
#include "cublas_dlibapi.h"
#include "cusolver_dlibapi.h"
#include "curand_dlibapi.h"
#include "cpu_dlib.h"
#include "cuda_dlib.h"
#include "../rand.h"
#include <memory>
#include "../geometry/rectangle.h"
namespace dlib
{
......@@ -122,6 +124,36 @@ namespace dlib { namespace tt
- performs: dest = alpha*L*R + beta*mat(dest)
!*/
// ----------------------------------------------------------------------------------------
class inv
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            This is a functor for doing matrix inversion on the GPU.  The only
            reason it's an object is to avoid the reallocation of some GPU memory
            blocks if you want to do a bunch of matrix inversions in a row.
    !*/
public:

    void operator() (
        const tensor& m,
        resizable_tensor& out
    );
    /*!
        requires
            - m.size() == m.num_samples()*m.num_samples()
              (i.e. mat(m) must be a square matrix)
        ensures
            - out == inv(mat(m));
    !*/

private:
#ifdef DLIB_USE_CUDA
    // CUDA-backed implementation; holds reusable GPU scratch buffers.
    cuda::inv finv;
#endif
};
// ----------------------------------------------------------------------------------------
class tensor_rand
......@@ -356,6 +388,34 @@ namespace dlib { namespace tt
- #dest.host()[i] == A*src1.host()[i] + B*src2.host()[i] + C*src3.host()[i]
!*/
void affine_transform(
const rectangle& rect,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
float A,
float B,
float C
);
/*!
requires
- dest.size()==src1.size()
- dest.size()==src2.size()
- dest.size()==src3.size()
- dest.num_samples()==src1.num_samples()
- dest.num_samples()==src2.num_samples()
- dest.num_samples()==src3.num_samples()
- get_rect(mat(dest)).contains(rect) == true
(i.e. rect must be entirely contained within dest)
ensures
- This function operates much like
affine_transform(dest,src1,src2,src3,A,B,C,0), except that it runs over only
the sub-rectangle indicated by rect. In particular, this function is equivalent
to:
set_subm(dest,rect) = A*subm(mat(src1),rect) + B*subm(mat(src2),rect) + C*subm(mat(src3),rect)
!*/
// ----------------------------------------------------------------------------------------
void affine_transform(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment