"...git@developer.sourcefind.cn:OpenDAS/pytorch-encoding.git" did not exist on "1a8a08fbdaa4e5b9da348b1a548f59930b9c1e73"
Unverified commit 74123841, authored by Juha Reunanen, committed by GitHub

To avoid a GPU memory leak, allow passing thread pools to dnn_trainer from outside (#2027)

* Problem: The CUDA runtime allocates resources for each thread, and apparently those resources are not freed when the corresponding threads terminate. Therefore, each instantiation of dnn_trainer leaks a bit of GPU memory.

Solution: Add the possibility to pass thread pools in from outside. This way, subsequent dnn_trainer instances can use the same threads, and there's no memory leak (see the usage sketch below).

* Add helpful comments
parent 6fc503d2
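
For reference, a minimal usage sketch of the new constructor argument. It is not part of the commit: the toy net_type, the loop, and the placeholder training comment are hypothetical; only the threads typedef and the fourth constructor parameter come from this change.

#include <dlib/dnn.h>
#include <memory>

using namespace dlib;

// Hypothetical toy network; any dlib net_type would do here.
using net_type = loss_multiclass_log<fc<10, input<matrix<float>>>>;
using trainer_type = dnn_trainer<net_type>;

int main()
{
    // One shared vector of per-device thread pools, created once and kept
    // alive for the whole program instead of being re-created by every
    // trainer instance (which is what leaks CUDA per-thread state).
    auto pools = std::make_shared<trainer_type::threads>();

    for (int run = 0; run < 3; ++run)
    {
        net_type net;
        // Each trainer reuses the same worker threads via `pools`.
        trainer_type trainer(net, sgd(), {}, pools);
        // ... feed data with trainer.train_one_step(...) as usual ...
    }
}
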
@@ -75,6 +75,7 @@ namespace dlib
         typedef typename net_type::input_type input_type;
         const static size_t num_computational_layers = net_type::num_computational_layers;
         const static size_t num_layers = net_type::num_layers;
+        using threads = std::vector<std::shared_ptr<thread_pool>>;
     private:
         typedef impl::dnn_job_t<training_label_type> job_t;
     public:
@@ -104,8 +105,9 @@ namespace dlib
         dnn_trainer(
             net_type& net_,
             const solver_type& solver_,
-            const std::vector<int>& cuda_extra_devices
-        ) : job_pipe(0), net(net_)
+            const std::vector<int>& cuda_extra_devices,
+            std::shared_ptr<threads> thread_pools_ = std::shared_ptr<threads>()
+        ) : job_pipe(0), net(net_), thread_pools(thread_pools_)
         {
             devices.push_back(std::make_shared<device_data>(dlib::cuda::get_device(), net, solver_));
@@ -667,6 +669,14 @@ namespace dlib
             std::vector<tensor*> reference_params;
             visit_layer_parameters(devices[0]->net, [&](size_t, tensor& t) { reference_params.push_back(&t); });

+            // If no external thread pools vector was passed, then create one that will
+            // be automatically destructed as soon as the dnn_trainer object goes out of
+            // scope.
+            if (!thread_pools)
+                thread_pools = std::make_shared<threads>();
+
+            auto& tp = *thread_pools;
+
             // We make separate thread pools with just one thread in them because we want
             // to make sure each device is always executed on the same thread.  We care
             // about this because there are thread_local context variables for some cuda
@@ -674,8 +684,7 @@ namespace dlib
             // So if we make sure the same device always uses the same thread this will
             // reduce the number of contexts we allocate from num_devices*num_devices to
             // just num_devices.
-            std::vector<std::shared_ptr<thread_pool>> tp;
-            for (size_t i = 0; i < devices.size(); ++i)
+            while (tp.size() < devices.size())
                 tp.push_back(std::make_shared<thread_pool>(1));
@@ -1274,6 +1283,7 @@ namespace dlib
         std::vector<std::shared_ptr<device_data>> devices;
         dlib::pipe<job_t> job_pipe;
+        std::shared_ptr<threads> thread_pools;
         job_t job;
......
@@ -58,6 +58,8 @@ namespace dlib
         typedef typename net_type::input_type input_type;
         const static size_t num_computational_layers = net_type::num_computational_layers;

+        using threads = std::vector<std::shared_ptr<thread_pool>>;
+
         dnn_trainer() = delete;
         dnn_trainer(const dnn_trainer&) = delete;
         dnn_trainer& operator=(const dnn_trainer&) = delete;
@@ -65,7 +67,8 @@ namespace dlib
         dnn_trainer(
             net_type& net,
             const solver_type& solver = solver_type(),
-            const std::vector<int>& cuda_extra_devices = {}
+            const std::vector<int>& cuda_extra_devices = {},
+            std::shared_ptr<threads> thread_pools = std::shared_ptr<threads>()
         );
         /*!
             requires
@@ -96,6 +99,19 @@ namespace dlib
                 cudaGetDevice()).  In addition, you can ask to use additional
                 devices, which you do by putting their device numbers into
                 cuda_extra_devices.
+            - if (thread_pools.get() != nullptr) then
+                - Any new threads spun within the trainer will execute within the
+                  passed thread pools vector.  This means that the same threads can
+                  be re-used across different dnn_trainer instances.  Otherwise, the
+                  CUDA runtime may leak memory.  This, however, is relevant only if
+                  your program is going to instantiate a large number of trainers,
+                  and generally stay up and running for a very long time.  If not,
+                  then you need not worry about this.
+                  NB: Any particular thread pools vector should be passed to max
+                  one trainer instance at a time.
+                  NB: The mentioned leak isn't happening because dlib is or isn't
+                  doing something.  Instead, it is a limitation of the CUDA
+                  runtime that dlib has no control over.
        !*/

        net_type& get_net (
......
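
A side note on the implementation comment above ("separate thread pools with just one thread in them"): the change relies on the fact that a dlib::thread_pool constructed with a single worker runs every task on that same thread, so thread_local CUDA context variables are created once per device. The standalone sketch below only demonstrates that single-thread property; it is illustrative and not part of the commit.

#include <dlib/threads.h>
#include <iostream>
#include <thread>

int main()
{
    // A pool with exactly one worker: every task runs on that same thread,
    // so any thread_local state (e.g. a CUDA context) is set up only once.
    dlib::thread_pool pool(1);

    for (int i = 0; i < 3; ++i)
    {
        pool.add_task_by_value([]{
            std::cout << "task ran on thread " << std::this_thread::get_id() << std::endl;
        });
    }
    pool.wait_for_all_tasks();  // all three lines report the same thread id
}
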