Unverified commit 43e94b39, authored by Ashish Farmer, committed by GitHub

[ROCm] Create torchvision as a HIP Extension (#1928)

* Added code to support creating extension on ROCm

* max -> fmaxf conversion for hipification

* added WITH_HIP flag for hipExtension

* added appropriate headers for HIP build

* use USE_ROCM in condition to build

* change fmaxf and fminf calls

* fminf -> min

* fix the check for ROCM_HOME

* more robust checking for rocm pytorch

* add check for pytorch version before using HIP extensions

* conditional reading of ROCM_HOME
parent cca0c77a
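In short, the patch treats a ROCm build of PyTorch as a first-class target: it detects HIP at setup time, runs hipify over the CUDA sources, and compiles them with WITH_HIP instead of WITH_CUDA. A minimal, self-contained sketch of the detection step the setup.py hunks below add (the wrapper function is illustrative, not part of the commit):

# Minimal sketch of the ROCm detection this commit adds to setup.py.
import torch

def detect_rocm_pytorch():
    # ROCM_HOME is only exported by torch.utils.cpp_extension on
    # PyTorch >= 1.5, hence the version guard before the import.
    is_rocm_pytorch = False
    if torch.__version__ >= '1.5':
        from torch.utils.cpp_extension import ROCM_HOME
        # torch.version.hip is None on CUDA builds of PyTorch and a
        # version string on ROCm builds.
        is_rocm_pytorch = (torch.version.hip is not None) and (ROCM_HOME is not None)
    return is_rocm_pytorch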
@@ -13,6 +13,7 @@ import shutil
import torch
from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension, CUDA_HOME
from torch.utils.hipify import hipify_python
def read(*names, **kwargs):
@@ -83,6 +84,26 @@ def get_extensions():
main_file = glob.glob(os.path.join(extensions_dir, '*.cpp'))
source_cpu = glob.glob(os.path.join(extensions_dir, 'cpu', '*.cpp'))
is_rocm_pytorch = False
if torch.__version__ >= '1.5':
from torch.utils.cpp_extension import ROCM_HOME
is_rocm_pytorch = True if ((torch.version.hip is not None) and (ROCM_HOME is not None)) else False
if is_rocm_pytorch:
hipify_python.hipify(
project_directory=this_dir,
output_directory=this_dir,
includes="torchvision/csrc/cuda/*",
show_detailed=True,
is_pytorch_extension=True,
)
source_cuda = glob.glob(os.path.join(extensions_dir, 'hip', '*.hip'))
## Copy over additional files
shutil.copy("torchvision/csrc/cuda/cuda_helpers.h", "torchvision/csrc/hip/cuda_helpers.h")
shutil.copy("torchvision/csrc/cuda/vision_cuda.h", "torchvision/csrc/hip/vision_cuda.h")
else:
source_cuda = glob.glob(os.path.join(extensions_dir, 'cuda', '*.cu'))
sources = main_file + source_cpu
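For reference, the hipify step in the hunk above rewrites the CUDA sources under torchvision/csrc/cuda/ into HIP equivalents under torchvision/csrc/hip/ with a .hip extension, which is what the glob and shutil.copy calls rely on. A hedged sketch of that source-collection flow (the helper name and the example filename are illustrative):

import glob
import os

def collect_gpu_sources(extensions_dir, is_rocm_pytorch):
    # Pick up HIP kernels when hipify has run, CUDA kernels otherwise.
    if is_rocm_pytorch:
        # hipify_python.hipify(...) emits e.g. cuda/nms_cuda.cu -> hip/nms_cuda.hip
        return glob.glob(os.path.join(extensions_dir, 'hip', '*.hip'))
    return glob.glob(os.path.join(extensions_dir, 'cuda', '*.cu'))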
@@ -103,15 +124,19 @@ def get_extensions():
define_macros = []
extra_compile_args = {}
-if (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv('FORCE_CUDA', '0') == '1':
+if (torch.cuda.is_available() and ((CUDA_HOME is not None) or is_rocm_pytorch)) or os.getenv('FORCE_CUDA', '0') == '1':
extension = CUDAExtension
sources += source_cuda
if not is_rocm_pytorch:
define_macros += [('WITH_CUDA', None)]
nvcc_flags = os.getenv('NVCC_FLAGS', '')
if nvcc_flags == '':
nvcc_flags = []
else:
nvcc_flags = nvcc_flags.split(' ')
else:
define_macros += [('WITH_HIP', None)]
nvcc_flags = []
extra_compile_args = {
'cxx': [],
'nvcc': nvcc_flags,
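One way to read the hunk above: whichever branch runs, the chosen macro travels through define_macros into the compiler command line, so the C++ sources in the hunks below can branch on WITH_CUDA versus WITH_HIP. A hedged sketch of the resulting extension object (module and source names are illustrative of torchvision's setup.py, not copied from this diff):

from torch.utils.cpp_extension import BuildExtension, CUDAExtension

ext = CUDAExtension(
    'torchvision._C',
    sources=['torchvision/csrc/vision.cpp'],     # plus the cpu/ and cuda|hip/ sources
    define_macros=[('WITH_HIP', None)],          # becomes -DWITH_HIP; ('WITH_CUDA', None) on CUDA
    extra_compile_args={'cxx': [], 'nvcc': []},  # the 'nvcc' key is reused for the HIP compiler
)
# setup(..., ext_modules=[ext], cmdclass={'build_ext': BuildExtension})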
...
@@ -5,6 +5,9 @@
#ifdef WITH_CUDA
#include "cuda/vision_cuda.h"
#endif
#ifdef WITH_HIP
#include "hip/vision_cuda.h"
#endif
at::Tensor DeformConv2d_forward(
const at::Tensor& input,
@@ -17,7 +20,7 @@ at::Tensor DeformConv2d_forward(
const int groups,
const int offset_groups) {
if (input.type().is_cuda()) {
-#ifdef WITH_CUDA
+#if defined(WITH_CUDA) || defined(WITH_HIP)
return DeformConv2d_forward_cuda(
input.contiguous(),
weight.contiguous(),
@@ -56,7 +59,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> DeformConv2d_backward
const int groups,
const int offset_groups) {
if (grad.type().is_cuda()) {
-#ifdef WITH_CUDA
+#if defined(WITH_CUDA) || defined(WITH_HIP)
return DeformConv2d_backward_cuda(
grad.contiguous(),
input.contiguous(),
...
@@ -5,6 +5,9 @@
#ifdef WITH_CUDA
#include "cuda/vision_cuda.h"
#endif
#ifdef WITH_HIP
#include "hip/vision_cuda.h"
#endif
#include <iostream>
@@ -16,7 +19,7 @@ std::tuple<at::Tensor, at::Tensor> PSROIAlign_forward(
const int pooled_width,
const int sampling_ratio) {
if (input.type().is_cuda()) {
-#ifdef WITH_CUDA
+#if defined(WITH_CUDA) || defined(WITH_HIP)
return PSROIAlign_forward_cuda(
input,
rois,
@@ -45,7 +48,7 @@ at::Tensor PSROIAlign_backward(
const int height,
const int width) {
if (grad.type().is_cuda()) {
-#ifdef WITH_CUDA
+#if defined(WITH_CUDA) || defined(WITH_HIP)
return PSROIAlign_backward_cuda(
grad,
rois,
...
@@ -5,6 +5,9 @@
#ifdef WITH_CUDA
#include "cuda/vision_cuda.h"
#endif
#ifdef WITH_HIP
#include "hip/vision_cuda.h"
#endif
std::tuple<at::Tensor, at::Tensor> PSROIPool_forward(
const at::Tensor& input,
@@ -13,7 +16,7 @@ std::tuple<at::Tensor, at::Tensor> PSROIPool_forward(
const int pooled_height,
const int pooled_width) {
if (input.type().is_cuda()) {
-#ifdef WITH_CUDA
+#if defined(WITH_CUDA) || defined(WITH_HIP)
return PSROIPool_forward_cuda(
input, rois, spatial_scale, pooled_height, pooled_width);
#else
@@ -36,7 +39,7 @@ at::Tensor PSROIPool_backward(
const int height,
const int width) {
if (grad.type().is_cuda()) {
-#ifdef WITH_CUDA
+#if defined(WITH_CUDA) || defined(WITH_HIP)
return PSROIPool_backward_cuda(
grad,
rois,
...
@@ -5,6 +5,9 @@
#ifdef WITH_CUDA
#include "cuda/vision_cuda.h"
#endif
#ifdef WITH_HIP
#include "hip/vision_cuda.h"
#endif
// Interface for Python
at::Tensor ROIAlign_forward(
@@ -19,7 +22,7 @@ at::Tensor ROIAlign_forward(
// along each axis.
{
if (input.type().is_cuda()) {
-#ifdef WITH_CUDA
+#if defined(WITH_CUDA) || defined(WITH_HIP)
return ROIAlign_forward_cuda(
input,
rois,
@@ -49,7 +52,7 @@ at::Tensor ROIAlign_backward(
const int sampling_ratio,
const bool aligned) {
if (grad.type().is_cuda()) {
-#ifdef WITH_CUDA
+#if defined(WITH_CUDA) || defined(WITH_HIP)
return ROIAlign_backward_cuda(
grad,
rois,
...
@@ -5,6 +5,9 @@
#ifdef WITH_CUDA
#include "cuda/vision_cuda.h"
#endif
#ifdef WITH_HIP
#include "hip/vision_cuda.h"
#endif
std::tuple<at::Tensor, at::Tensor> ROIPool_forward(
const at::Tensor& input,
@@ -13,7 +16,7 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward(
const int64_t pooled_height,
const int64_t pooled_width) {
if (input.type().is_cuda()) {
-#ifdef WITH_CUDA
+#if defined(WITH_CUDA) || defined(WITH_HIP)
return ROIPool_forward_cuda(
input, rois, spatial_scale, pooled_height, pooled_width);
#else
@@ -36,7 +39,7 @@ at::Tensor ROIPool_backward(
const int height,
const int width) {
if (grad.type().is_cuda()) {
-#ifdef WITH_CUDA
+#if defined(WITH_CUDA) || defined(WITH_HIP)
return ROIPool_backward_cuda(
grad,
rois,
...
#pragma once
#if defined(WITH_CUDA)
#include <c10/cuda/CUDAGuard.h>
#elif defined(WITH_HIP)
#include <c10/hip/HIPGuard.h>
#endif
#include <torch/extension.h>
at::Tensor ROIAlign_forward_cuda(
...
@@ -4,18 +4,27 @@
#ifdef WITH_CUDA
#include "cuda/vision_cuda.h"
#endif
#ifdef WITH_HIP
#include "hip/vision_cuda.h"
#endif
at::Tensor nms(
const at::Tensor& dets,
const at::Tensor& scores,
const double iou_threshold) {
if (dets.device().is_cuda()) {
-#ifdef WITH_CUDA
+#if defined(WITH_CUDA)
if (dets.numel() == 0) {
at::cuda::CUDAGuard device_guard(dets.device());
return at::empty({0}, dets.options().dtype(at::kLong));
}
return nms_cuda(dets, scores, iou_threshold);
#elif defined(WITH_HIP)
if (dets.numel() == 0) {
at::cuda::HIPGuard device_guard(dets.device());
return at::empty({0}, dets.options().dtype(at::kLong));
}
return nms_cuda(dets, scores, iou_threshold);
#else
AT_ERROR("Not compiled with GPU support");
#endif
...
@@ -4,6 +4,9 @@
#ifdef WITH_CUDA
#include <cuda.h>
#endif
#ifdef WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include "DeformConv.h"
#include "PSROIAlign.h"
...
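Once built, the hipified ops are exercised through the same Python surface as on CUDA, because ROCm builds of PyTorch expose HIP devices under the 'cuda' device type. A quick smoke test, assuming the extension built successfully on a ROCm machine:

import torch
from torchvision.ops import nms

boxes = torch.tensor([[0., 0., 10., 10.],
                      [1., 1., 11., 11.],
                      [50., 50., 60., 60.]], device='cuda')
scores = torch.tensor([0.9, 0.8, 0.7], device='cuda')
keep = nms(boxes, scores, iou_threshold=0.5)
print(keep.cpu())  # tensor([0, 2]) -- box 1 overlaps box 0 above the threshold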