"examples/pytorch/vscode:/vscode.git/clone" did not exist on "e01893972901e45f0cae3da6c90bb1fce6947568"
Unverified Commit bf427f56 authored by Juha Reunanen's avatar Juha Reunanen Committed by GitHub
Browse files

Problem: `Error while calling cudnnGetConvolutionForwardWorkspaceSize(...

Problem: `Error while calling cudnnGetConvolutionForwardWorkspaceSize( context(), descriptor(data), (const cudnnFilterDescriptor_t)filter_handle, (const cudnnConvolutionDescriptor_t)conv_handle, descriptor(dest_desc), (cudnnConvolutionFwdAlgo_t)forward_algo, &forward_workspace_size_in_bytes) in file C:\a\2\s\3rdparty\dlib\dlib\cuda\cudnn_dlibapi.cpp:1029. code: 9, reason: CUDNN_STATUS_NOT_SUPPORTED` (#2532)

Solution: when this happens, select the best algorithms again — but this time bypass the cache, so a fresh (non-cached) choice of algorithms is used.
parent 5f7e19b7
......@@ -787,10 +787,11 @@ namespace dlib
void tensor_conv::
select_best_algorithms (
const tensor& data,
const tensor_descriptor& dest_desc
const tensor_descriptor& dest_desc,
allow_cache_use allow_cache_use
)
{
// Calling the cuDNN "find the best algorithm" functions are really slow. So we keep a
// Calling the cuDNN "find the best algorithm" functions is really slow. So we keep a
// cache that tells us what method was best for a particular configuration.
thread_local std::map<std::tuple<int,int,int,int,long,long>,
std::tuple<int,int,int>> config_to_algo_cache;
......@@ -799,7 +800,7 @@ namespace dlib
// the cache.
const auto cache_key = std::make_tuple(stride_y, stride_x, padding_y, padding_x, filters_nr, filters_nc);
const auto iter = config_to_algo_cache.find(cache_key);
if (iter != config_to_algo_cache.end())
if (iter != config_to_algo_cache.end() && allow_cache_use == allow_cache_use::yes)
{
std::tie(forward_algo, backward_data_algo, backward_filters_algo) = iter->second;
return;
......@@ -933,6 +934,40 @@ namespace dlib
config_to_algo_cache[cache_key] = std::make_tuple(forward_algo, backward_data_algo, backward_filters_algo);
}
void tensor_conv::
update_convolution_data_workspace_sizes(
const tensor& data,
const tensor_descriptor& dest_desc
)
// Queries cuDNN for the scratch-workspace sizes required by the currently
// selected forward, backward-data, and backward-filter convolution algorithms,
// and stores the results in the three *_workspace_size_in_bytes member fields.
//
//   data      - tensor describing the convolution input layout.
//   dest_desc - descriptor for the convolution output layout.
//
// Each query is wrapped in CHECK_CUDNN, so a cuDNN failure (e.g.
// CUDNN_STATUS_NOT_SUPPORTED for a cached algorithm that does not apply to
// this configuration) surfaces as an exception; the caller (setup) catches
// dlib::cudnn_error and retries after re-selecting algorithms without the
// cache.  Assumes select_best_algorithms() has already populated the
// forward_algo / backward_data_algo / backward_filters_algo fields.
{
// Workspace needed to run the forward convolution with forward_algo.
CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(
context(),
descriptor(data),
(const cudnnFilterDescriptor_t)filter_handle,
(const cudnnConvolutionDescriptor_t)conv_handle,
descriptor(dest_desc),
(cudnnConvolutionFwdAlgo_t)forward_algo,
&forward_workspace_size_in_bytes));
// Workspace needed to back-propagate gradients to the input data.
CHECK_CUDNN(cudnnGetConvolutionBackwardDataWorkspaceSize(
context(),
(const cudnnFilterDescriptor_t)filter_handle,
descriptor(dest_desc),
(const cudnnConvolutionDescriptor_t)conv_handle,
descriptor(data),
(cudnnConvolutionBwdDataAlgo_t)backward_data_algo,
&backward_data_workspace_size_in_bytes));
// Workspace needed to back-propagate gradients to the filter weights.
CHECK_CUDNN(cudnnGetConvolutionBackwardFilterWorkspaceSize(
context(),
descriptor(data),
descriptor(dest_desc),
(const cudnnConvolutionDescriptor_t)conv_handle,
(const cudnnFilterDescriptor_t)filter_handle,
(cudnnConvolutionBwdFilterAlgo_t)backward_filters_algo,
&backward_filters_workspace_size_in_bytes));
}
void tensor_conv::
setup(
const tensor& data,
......@@ -1021,36 +1056,18 @@ namespace dlib
tensor_descriptor dest_desc;
dest_desc.set_size(out_num_samples,out_k,out_nr,out_nc);
select_best_algorithms(data, dest_desc);
CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(
context(),
descriptor(data),
(const cudnnFilterDescriptor_t)filter_handle,
(const cudnnConvolutionDescriptor_t)conv_handle,
descriptor(dest_desc),
(cudnnConvolutionFwdAlgo_t)forward_algo,
&forward_workspace_size_in_bytes));
CHECK_CUDNN(cudnnGetConvolutionBackwardDataWorkspaceSize(
context(),
(const cudnnFilterDescriptor_t)filter_handle,
descriptor(dest_desc),
(const cudnnConvolutionDescriptor_t)conv_handle,
descriptor(data),
(cudnnConvolutionBwdDataAlgo_t)backward_data_algo,
&backward_data_workspace_size_in_bytes));
CHECK_CUDNN(cudnnGetConvolutionBackwardFilterWorkspaceSize(
context(),
descriptor(data),
descriptor(dest_desc),
(const cudnnConvolutionDescriptor_t)conv_handle,
(const cudnnFilterDescriptor_t)filter_handle,
(cudnnConvolutionBwdFilterAlgo_t)backward_filters_algo,
&backward_filters_workspace_size_in_bytes));
try
{
select_best_algorithms(data, dest_desc, allow_cache_use::yes);
update_convolution_data_workspace_sizes(data, dest_desc);
}
catch (dlib::cudnn_error&)
{
// Sometimes the values stored in `config_to_algo_cache` do not quite work -
// so let's get a fresh estimate, instead of using a cached value.
select_best_algorithms(data, dest_desc, allow_cache_use::no);
update_convolution_data_workspace_sizes(data, dest_desc);
}
}
catch(...)
{
......
......@@ -254,15 +254,20 @@ namespace dlib
int out_nr;
int out_nc;
enum class allow_cache_use { no, yes };
// sets the three _algo fields.
void select_best_algorithms(const tensor& data, const tensor_descriptor& dest_desc);
void select_best_algorithms(const tensor& data, const tensor_descriptor& dest_desc, allow_cache_use allow_cache_use);
int forward_algo;
int backward_data_algo;
int backward_filters_algo;
// sets the three _workspace_size_in_bytes fields.
void update_convolution_data_workspace_sizes(const tensor& data, const tensor_descriptor& dest_desc);
size_t forward_workspace_size_in_bytes;
size_t backward_data_workspace_size_in_bytes;
size_t backward_filters_workspace_size_in_bytes;
cuda_data_void_ptr forward_workspace;
cuda_data_void_ptr backward_data_workspace;
cuda_data_void_ptr backward_filters_workspace;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment