ROCm 环境适配 (ROCm environment adaptation)

06efa79d · xiabo · bd635011 · 06efa79d · 06efa79d · 06efa79d
Commit 06efa79d authored May 04, 2023 by xiabo
4 changed files
--- a/mmcv/ops/conv2d_gradfix.py
+++ b/mmcv/ops/conv2d_gradfix.py
@@ -39,7 +39,7 @@ def conv2d(input: torch.Tensor,
           dilation: Union[int, Tuple[int, ...]] = 1,
           groups: int = 1):
    flag = True
-    if torch.__version__ >= '1.10.0':
+    if digit_version(torch.__version__) >= digit_version('1.10.0'):
        warnings.warn('Since '
                      'aten:cudnn_convolution_backward_weight is '
                      f'not supported in torch=={torch.__version__},'
@@ -283,15 +283,33 @@ def _conv2d_gradfix(
                    output_padding=output_padding,
                    output_mask=[0, 1, 0])[1]
            else:
+                is_rocm_pytorch = False
+                try:
+                    from torch.utils.cpp_extension import ROCM_HOME
+                    is_rocm_pytorch = True if ((torch.version.hip is not None) and
+                                       (ROCM_HOME is not None)) else False
+                except ImportError:
+                    pass
+                name=''
+                flags=[]
+                if is_rocm_pytorch:
+                    name = ('aten::miopen_convolution_transpose_backward_weight'
+                        if transpose else
+                        'aten::miopen_convolution_backward_weight')
+                    flags = [
+                        torch.backends.cudnn.benchmark,
+                        torch.backends.cudnn.deterministic
+                    ]
+                else:
                # General case => cuDNN.
-                name = ('aten::cudnn_convolution_transpose_backward_weight'
+                    name = ('aten::cudnn_convolution_transpose_backward_weight'
                        if transpose else
                        'aten::cudnn_convolution_backward_weight')
-                flags = [
+                    flags = [
-                    torch.backends.cudnn.benchmark,
+                        torch.backends.cudnn.benchmark,
-                    torch.backends.cudnn.deterministic,
+                        torch.backends.cudnn.deterministic,
-                    torch.backends.cudnn.allow_tf32
+                        torch.backends.cudnn.allow_tf32
-                ]
+                    ]
                return torch._C._jit_get_operation(name)(weight_shape,
                                                         grad_output, input,
                                                         padding, stride,

--- a/mmcv/ops/corner_pool.py
+++ b/mmcv/ops/corner_pool.py
 # Copyright (c) OpenMMLab. All rights reserved.
 import torch
 from torch import Tensor, nn
+from mmengine.utils import digit_version
 _mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3}
@@ -70,7 +70,7 @@ class CornerPool(nn.Module):
        self.mode = mode
    def forward(self, x: Tensor) -> Tensor:
-        if torch.__version__ != 'parrots' and torch.__version__ >= '1.5.0':
+        if torch.__version__ != 'parrots' and digit_version(torch.__version__) >= digit_version('1.5.0'):
            dim, flip = self.cummax_dim_flip[self.mode]
            if flip:
                x = x.flip(dim)

--- a/mmcv/ops/csrc/pytorch/cuda/filtered_lrelu.cu
+++ b/mmcv/ops/csrc/pytorch/cuda/filtered_lrelu.cu
@@ -1619,6 +1619,7 @@ filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(
 #define BUILD_FILTERED_LRELU_OP 1
+#ifndef MMCV_WITH_HIP
 #ifdef __GNUC__
 #if __GNUC__ < 6
 #undef BUILD_FILTERED_LRELU_OP
@@ -1626,10 +1627,12 @@ filtered_lrelu_kernel_spec choose_filtered_lrelu_kernel(
 #endif
 #endif
 #if CUDA_VERSION < 10020
 #undef BUILD_FILTERED_LRELU_OP
 #define BUILD_FILTERED_LRELU_OP 0
 #endif
+#endif
 #if BUILD_FILTERED_LRELU_OP == 1
 std::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu_op(
@@ -1670,9 +1673,10 @@ std::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu_op(
  // Figure out how much shared memory is available on the device.
  int maxSharedBytes = 0;
-  AT_CUDA_CHECK(cudaDeviceGetAttribute(&maxSharedBytes,
+  int result=cudaDeviceGetAttribute(&maxSharedBytes,
-                                       cudaDevAttrMaxSharedMemoryPerBlockOptin,
+                                      //  cudaDevAttrMaxSharedMemoryPerBlockOptin,
-                                       x.device().index()));
+                                      hipDeviceAttributeSharedMemPerBlockOptin,
+                                       x.device().index());
  int sharedKB = maxSharedBytes >> 10;
  // Populate enough launch parameters to check if a CUDA kernel exists.
@@ -1890,8 +1894,10 @@ std::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu_op(
  // Set cache and shared memory configurations for main kernel.
  AT_CUDA_CHECK(cudaFuncSetCacheConfig(spec.exec, cudaFuncCachePreferShared));
  if (spec.dynamicSharedKB)  // Need dynamically allocated shared memory?
-    AT_CUDA_CHECK(cudaFuncSetAttribute(
+    // AT_CUDA_CHECK(cudaFuncSetAttribute(
-        spec.exec, cudaFuncAttributeMaxDynamicSharedMemorySize,
+    AT_CUDA_CHECK(hipFuncSetAttribute(
+        // spec.exec, cudaFuncAttributeMaxDynamicSharedMemorySize,
+        spec.exec, hipFuncAttributeMaxDynamicSharedMemorySize,
        spec.dynamicSharedKB << 10));
  AT_CUDA_CHECK(
      cudaFuncSetSharedMemConfig(spec.exec, cudaSharedMemBankSizeFourByte));

--- a/tests/test_ops/test_filtered_lrelu.py
+++ b/tests/test_ops/test_filtered_lrelu.py
@@ -116,7 +116,8 @@ class TestFilteredLrelu:
    @pytest.mark.skipif(
        not torch.cuda.is_available()
-        or digit_version(torch.version.cuda) < digit_version('10.2'),
+        # or digit_version(torch.version.cuda) < digit_version('10.2'),
+        or False,
        reason='requires cuda>=10.2')
    def test_filtered_lrelu_cuda(self):
        out = filtered_lrelu(self.input_tensor.cuda(), bias=self.bias.cuda())