"...git@developer.sourcefind.cn:OpenDAS/pytorch-encoding.git" did not exist on "1a8a08fbdaa4e5b9da348b1a548f59930b9c1e73"
Unverified commit 74123841, authored by Juha Reunanen, committed by GitHub

To avoid a GPU memory leak, allow passing thread pools to dnn_trainer from outside (#2027)

* Problem: The CUDA runtime allocates resources for each thread, and apparently those resources are not freed when the corresponding threads terminate. Therefore, each instantiation of dnn_trainer leaks a bit of GPU memory.

Solution: Add the possibility to pass thread pools in from outside. This way, subsequent dnn_trainer instances can use the same threads, and there's no memory leak (see the usage sketch below).

* Add helpful comments
parent 6fc503d2
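
For reference, a minimal usage sketch of the new constructor argument. It is not part of the commit: the toy net_type, the loop, and the placeholder training comment are hypothetical; only the threads typedef and the fourth constructor parameter come from this change.

#include <dlib/dnn.h>
#include <memory>

using namespace dlib;

// Hypothetical toy network; any dlib net_type would do here.
using net_type = loss_multiclass_log<fc<10, input<matrix<float>>>>;
using trainer_type = dnn_trainer<net_type>;

int main()
{
    // One shared vector of per-device thread pools, created once and kept
    // alive for the whole program instead of being re-created by every
    // trainer instance (which is what leaks CUDA per-thread state).
    auto pools = std::make_shared<trainer_type::threads>();

    for (int run = 0; run < 3; ++run)
    {
        net_type net;
        // Each trainer reuses the same worker threads via `pools`.
        trainer_type trainer(net, sgd(), {}, pools);
        // ... feed data with trainer.train_one_step(...) as usual ...
    }
}
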
@@ -75,6 +75,7 @@ namespace dlib
         typedef typename net_type::input_type input_type;
         const static size_t num_computational_layers = net_type::num_computational_layers;
         const static size_t num_layers = net_type::num_layers;
+        using threads = std::vector<std::shared_ptr<thread_pool>>;
     private:
         typedef impl::dnn_job_t<training_label_type> job_t;
     public:
@@ -104,8 +105,9 @@ namespace dlib
         dnn_trainer(
             net_type& net_,
             const solver_type& solver_,
-            const std::vector<int>& cuda_extra_devices
-        ) : job_pipe(0), net(net_)
+            const std::vector<int>& cuda_extra_devices,
+            std::shared_ptr<threads> thread_pools_ = std::shared_ptr<threads>()
+        ) : job_pipe(0), net(net_), thread_pools(thread_pools_)
         {
             devices.push_back(std::make_shared<device_data>(dlib::cuda::get_device(), net, solver_));
@@ -667,6 +669,14 @@ namespace dlib
             std::vector<tensor*> reference_params;
             visit_layer_parameters(devices[0]->net, [&](size_t, tensor& t) { reference_params.push_back(&t); });

+            // If no external thread pools vector was passed, then create one that will
+            // be automatically destructed as soon as the dnn_trainer object goes out of
+            // scope.
+            if (!thread_pools)
+                thread_pools = std::make_shared<threads>();
+
+            auto& tp = *thread_pools;
+
             // We make separate thread pools with just one thread in them because we want
             // to make sure each device is always executed on the same thread.  We care
             // about this because there are thread_local context variables for some cuda
@@ -674,8 +684,7 @@ namespace dlib
             // So if we make sure the same device always uses the same thread this will
             // reduce the number of contexts we allocate from num_devices*num_devices to
             // just num_devices.
-            std::vector<std::shared_ptr<thread_pool>> tp;
-            for (size_t i = 0; i < devices.size(); ++i)
+            while (tp.size() < devices.size())
                 tp.push_back(std::make_shared<thread_pool>(1));
@@ -1274,6 +1283,7 @@ namespace dlib
         std::vector<std::shared_ptr<device_data>> devices;
         dlib::pipe<job_t> job_pipe;
+        std::shared_ptr<threads> thread_pools;
         job_t job;
......
@@ -58,6 +58,8 @@ namespace dlib
         typedef typename net_type::input_type input_type;
         const static size_t num_computational_layers = net_type::num_computational_layers;

+        using threads = std::vector<std::shared_ptr<thread_pool>>;
+
         dnn_trainer() = delete;
         dnn_trainer(const dnn_trainer&) = delete;
         dnn_trainer& operator=(const dnn_trainer&) = delete;
@@ -65,7 +67,8 @@ namespace dlib
         dnn_trainer(
             net_type& net,
             const solver_type& solver = solver_type(),
-            const std::vector<int>& cuda_extra_devices = {}
+            const std::vector<int>& cuda_extra_devices = {},
+            std::shared_ptr<threads> thread_pools = std::shared_ptr<threads>()
         );
         /*!
             requires
@@ -96,6 +99,19 @@ namespace dlib
                 cudaGetDevice()).  In addition, you can ask to use additional
                 devices, which you do by putting their device numbers into
                 cuda_extra_devices.
+            - if (thread_pools.get() != nullptr) then
+                - Any new threads spun within the trainer will execute within the
+                  passed thread pools vector.  This means that the same threads can
+                  be re-used across different dnn_trainer instances.  Otherwise, the
+                  CUDA runtime may leak memory.  This, however, is relevant only if
+                  your program is going to instantiate a large number of trainers,
+                  and generally stay up and running for a very long time.  If not,
+                  then you need not worry about this.
+                  NB: Any particular thread pools vector should be passed to max
+                  one trainer instance at a time.
+                  NB: The mentioned leak isn't happening because dlib is or isn't
+                  doing something.  Instead, it is a limitation of the CUDA
+                  runtime that dlib has no control over.
        !*/

        net_type& get_net (
......
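
A side note on the implementation comment above ("separate thread pools with just one thread in them"): the change relies on the fact that a dlib::thread_pool constructed with a single worker runs every task on that same thread, so thread_local CUDA context variables are created once per device. The standalone sketch below only demonstrates that single-thread property; it is illustrative and not part of the commit.

#include <dlib/threads.h>
#include <iostream>
#include <thread>

int main()
{
    // A pool with exactly one worker: every task runs on that same thread,
    // so any thread_local state (e.g. a CUDA context) is set up only once.
    dlib::thread_pool pool(1);

    for (int i = 0; i < 3; ++i)
    {
        pool.add_task_by_value([]{
            std::cout << "task ran on thread " << std::this_thread::get_id() << std::endl;
        });
    }
    pool.wait_for_all_tasks();  // all three lines report the same thread id
}
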