Commit 5cf7d2db authored by atalman, committed by Facebook GitHub Bot

Revert "Enable ROCm RNN-T Loss (#2485)" (#3586)

Summary:
This reverts commit c5939616.

Unblocks the 2.1.0 release candidate.

Pull Request resolved: https://github.com/pytorch/audio/pull/3586

Reviewed By: osalpekar

Differential Revision: D48842032

Pulled By: atalman

fbshipit-source-id: bbdf9e45c9aa5fde00f315a2ff491ed050bc1707
parent bf77b2a0
[submodule "third_party/hipify_torch"]
	path = third_party/hipify_torch
	url = https://github.com/ROCmSoftwarePlatform/hipify_torch
@@ -76,11 +76,6 @@ if(USE_ROCM)
  if(NOT PYTORCH_FOUND_HIP)
    set(USE_ROCM OFF)
  endif()
  if(CMAKE_VERSION VERSION_LESS 3.21.0)
    message("Need at least CMake 3.21.0 to compile ROCm support.")
    set(USE_ROCM OFF)
  endif()
endif()
if(USE_CUDA)
@@ -95,11 +90,6 @@ if(USE_CUDA)
  )
endif()
if(USE_ROCM)
  enable_language(HIP)
endif()
find_package(Torch REQUIRED)
include(cmake/TorchAudioHelper.cmake)
# https://github.com/pytorch/pytorch/issues/54174
......
Subproject commit 083ff9b50c7ed861f7f6eddd983cdedb72e8b964
################################################################################
# libtorchaudio
################################################################################
if(USE_ROCM)
  list (APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
  FIND_PACKAGE(HIP REQUIRED)
  MESSAGE(STATUS "hip found ${ROCM_FOUND}")
  list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/third_party/hipify_torch/cmake")
  include(Hipify)
  set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
  set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
  set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
  list( APPEND CMAKE_INSTALL_RPATH "/opt/rocm/llvm/lib" )
  set(OPENMP_LIBRARIES "/opt/rocm/llvm/lib/")
  set(OpenMP_CXX "${CMAKE_CXX_COMPILER}")
  set(OpenMP_CXX_FLAGS "-fopenmp=libomp")
  #set(OpenMP_CXX_LIB_NAMES "omp")
  set(OpenMP_omp_LIBRARY omp)
  find_package(OpenMP REQUIRED)
endif()
set(
  sources
  lfilter.cpp
@@ -62,37 +39,6 @@ if(BUILD_RNNT)
      rnnt/gpu/compute.cu
    )
  endif()
  if (USE_ROCM)
    hipify(CUDA_SOURCE_DIR ${PROJECT_SOURCE_DIR}/torchaudio/csrc/rnnt/gpu HIP_SOURCE_DIR ${PROJECT_SOURCE_DIR}/torchaudio/csrc/rnnt/hip)
    if ( NOT HIP_ADD_LIBRARY_FOUND )
      list(APPEND CMAKE_MODULE_PATH /opt/rocm/hip/cmake)
      find_package(HIP REQUIRED)
    endif()
    list(
      APPEND
      sources
      rnnt/hip/compute_alphas.hip
      rnnt/hip/compute_betas.hip
      rnnt/hip/compute.hip
    )
  endif()
endif()
if(USE_ROCM)
  list(
    APPEND
    additional_libs
    hip::host
    hip::device
    /opt/rocm/llvm/lib/libomp.so
  )
  list(
    APPEND
    compile_definitions
    USE_ROCM
  )
endif()
if(BUILD_RIR)
@@ -141,6 +87,7 @@ endif()
#------------------------------------------------------------------------------#
# END OF CUSTOMIZATION LOGICS
#------------------------------------------------------------------------------#
torchaudio_library(
  libtorchaudio
  "${sources}"
......
#include <c10/cuda/CUDAStream.h>
#include <torch/types.h>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/gpu_transducer_hip.h>
#else
#include <torchaudio/csrc/rnnt/gpu/gpu_transducer.h>
#endif
namespace torchaudio {
namespace rnnt {
......
#include <c10/cuda/CUDAStream.h>
#include <torch/types.h>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/gpu_transducer_hip.h>
#else
#include <torchaudio/csrc/rnnt/gpu/gpu_transducer.h>
#endif
namespace torchaudio {
namespace rnnt {
......
#include <c10/cuda/CUDAStream.h>
#include <torch/types.h>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/gpu_transducer_hip.h>
#else
#include <torchaudio/csrc/rnnt/gpu/gpu_transducer.h>
#endif
namespace torchaudio {
namespace rnnt {
......
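The three translation units above (per the source lists in the CMake hunk, presumably compute_alphas, compute_betas and compute) share one pattern: when hipcc defines __HIP_PLATFORM_AMD__, they include the hipified header that the hipify() call above generates into torchaudio/csrc/rnnt/hip/ from the gpu/ sources; otherwise the original CUDA header is used. A condensed sketch of that pattern, restating only what the hunks show:

#ifdef __HIP_PLATFORM_AMD__
// hipified copy, produced by the hipify() step in the CMake hunk above
#include <torchaudio/csrc/rnnt/hip/gpu_transducer_hip.h>
#else
// original CUDA header
#include <torchaudio/csrc/rnnt/gpu/gpu_transducer.h>
#endif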
@@ -2,11 +2,7 @@
#ifdef USE_CUDA
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/math_hip.cuh>
#else
#include <torchaudio/csrc/rnnt/gpu/math.cuh>
#endif
namespace torchaudio {
namespace rnnt {
@@ -43,11 +39,7 @@ __global__ void ReduceMax2D(
  CAST_DTYPE shf;
  for (int stride = (WARP_SIZE >> 1); stride > 0; stride >>= 1) {
#ifdef __HIP_PLATFORM_AMD__
    shf = __shfl_down(val, stride);
#else
    shf = __shfl_down_sync(0xFFFFFFFF, val, stride);
#endif
    if (threadIdx.x < stride && threadIdx.x + stride < dim) {
      if (shf > val) {
        val = shf;
@@ -89,11 +81,7 @@ __global__ void ReduceLogSumExpGivenMax2D(
  CAST_DTYPE shf;
  for (int stride = (WARP_SIZE >> 1); stride > 0; stride >>= 1) {
#ifdef __HIP_PLATFORM_AMD__
    shf = __shfl_down(val, stride);
#else
    shf = __shfl_down_sync(0xFFFFFFFF, val, stride);
#endif
    if (threadIdx.x < stride && threadIdx.x + stride < dim) {
      val = val + shf;
    }
......
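A condensed sketch of the warp-level max reduction the two kernels above guard; WarpReduceMax is a hypothetical helper, not a function from the source, and it drops the partial-width check (threadIdx.x + stride < dim) that ReduceMax2D applies. The branch exists because HIP's __shfl_down takes no participation mask, while CUDA's __shfl_down_sync does.

template <typename CAST_DTYPE>
__device__ CAST_DTYPE WarpReduceMax(CAST_DTYPE val) {
  // Tree reduction over one warp: each step folds the upper half of the lanes
  // into the lower half.
  for (int stride = (WARP_SIZE >> 1); stride > 0; stride >>= 1) {
#ifdef __HIP_PLATFORM_AMD__
    CAST_DTYPE shf = __shfl_down(val, stride);
#else
    CAST_DTYPE shf = __shfl_down_sync(0xFFFFFFFF, val, stride);
#endif
    if (shf > val) {
      val = shf;
    }
  }
  return val; // lane 0 holds the warp maximum
}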
@@ -4,15 +4,9 @@
#include <cassert>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/kernel_utils.h>
#include <torchaudio/csrc/rnnt/hip/kernels.h>
#include <torchaudio/csrc/rnnt/hip/math_hip.cuh>
#else
#include <torchaudio/csrc/rnnt/gpu/kernel_utils.h>
#include <torchaudio/csrc/rnnt/gpu/kernels.h>
#include <torchaudio/csrc/rnnt/gpu/math.cuh>
#endif
namespace torchaudio {
namespace rnnt {
@@ -132,11 +126,7 @@ __device__ void ComputeAlphas(
#pragma unroll
  for (int i = 1; i < warpSize; i <<= 1) {
#ifdef __HIP_PLATFORM_AMD__
    val = __shfl_up(skip_prob, i);
#else
    val = __shfl_up_sync(0xffffffff, skip_prob, i);
#endif
    if (i <= threadIdx.x) {
      skip_prob = skip_prob + val;
    }
@@ -160,11 +150,7 @@ __device__ void ComputeAlphas(
  CAST_DTYPE out = val;
  for (int i = 1; i < warpSize; ++i) {
#ifdef __HIP_PLATFORM_AMD__
    val = __shfl_up(val, 1);
#else
    val = __shfl_up_sync(0xffffffff, val, 1);
#endif
    if (i == threadIdx.x) {
      val = math::lse(val + skip_prob, emit);
      out = val;
@@ -239,11 +225,7 @@ __device__ void ComputeBetasCosts(
#pragma unroll
  for (int i = 1; i < warpSize; i <<= 1) {
#ifdef __HIP_PLATFORM_AMD__
    val = __shfl_up(skip_prob, i);
#else
    val = __shfl_up_sync(0xffffffff, skip_prob, i);
#endif
    if (i <= threadIdx.x) {
      skip_prob = skip_prob + val;
    }
@@ -266,11 +248,7 @@ __device__ void ComputeBetasCosts(
  CAST_DTYPE out = val;
  for (int i = 1; i < warpSize; ++i) {
#ifdef __HIP_PLATFORM_AMD__
    val = __shfl_up(val, 1);
#else
    val = __shfl_up_sync(0xffffffff, val, 1);
#endif
    if (i == threadIdx.x) {
      val = math::lse(val + skip_prob, emit);
      out = val;
......
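A condensed sketch of the inclusive warp prefix sum that ComputeAlphas and ComputeBetasCosts run over skip_prob in the hunks above; WarpInclusiveSum is a hypothetical helper, not a function from the source. As with the reductions, the ROCm path uses the unmasked __shfl_up.

template <typename CAST_DTYPE>
__device__ CAST_DTYPE WarpInclusiveSum(CAST_DTYPE x) {
  // Hillis-Steele scan: after log2(warpSize) steps, lane t holds the sum of
  // lanes 0..t.
  for (int i = 1; i < warpSize; i <<= 1) {
#ifdef __HIP_PLATFORM_AMD__
    CAST_DTYPE val = __shfl_up(x, i);
#else
    CAST_DTYPE val = __shfl_up_sync(0xffffffff, x, i);
#endif
    if (i <= threadIdx.x) {
      x = x + val;
    }
  }
  return x;
}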
@@ -3,13 +3,8 @@
#ifdef USE_CUDA
#include <torchaudio/csrc/rnnt/workspace.h>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/gpu_kernel_utils_hip.cuh>
#include <torchaudio/csrc/rnnt/hip/gpu_kernels_hip.cuh>
#else
#include <torchaudio/csrc/rnnt/gpu/gpu_kernel_utils.cuh>
#include <torchaudio/csrc/rnnt/gpu/gpu_kernels.cuh>
#endif
namespace torchaudio {
namespace rnnt {
......
@@ -2,11 +2,7 @@
#include <cassert>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/math_hip.cuh>
#else
#include <torchaudio/csrc/rnnt/gpu/math.cuh>
#endif
namespace torchaudio {
namespace rnnt {
......
@@ -2,13 +2,8 @@
#include <cassert>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/kernel_utils.h>
#include <torchaudio/csrc/rnnt/hip/math_hip.cuh>
#else
#include <torchaudio/csrc/rnnt/gpu/kernel_utils.h>
#include <torchaudio/csrc/rnnt/gpu/math.cuh>
#endif
namespace torchaudio {
namespace rnnt {
......
@@ -8,14 +8,6 @@
#define FORCE_INLINE __forceinline__
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#elif USE_ROCM
#define WARP_SIZE 32
#define MAX_THREADS_PER_BLOCK 1024
#define REDUCE_THREADS 256
#define HOST_AND_DEVICE __host__ __device__
#define FORCE_INLINE __forceinline__
#include <hip/hip_fp16.h>
#include <hip/hip_runtime.h>
#else
#define HOST_AND_DEVICE
#define FORCE_INLINE inline
......
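The reverted branch of this header gives ROCm builds the same kernel-facing macros the CUDA branch defines (WARP_SIZE, MAX_THREADS_PER_BLOCK, REDUCE_THREADS, HOST_AND_DEVICE, FORCE_INLINE), backed by the hip/ fp16 and runtime headers. A minimal, hypothetical use of those macros (RoundUp is illustrative, not from the source):

// Compiles as a plain inline function in CPU-only builds and as a
// __host__ __device__ __forceinline__ function under either CUDA or ROCm.
HOST_AND_DEVICE FORCE_INLINE int RoundUp(int x, int multiple) {
  return ((x + multiple - 1) / multiple) * multiple;
}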
@@ -4,12 +4,7 @@
#ifdef USE_CUDA
#include <cuda_runtime.h>
typedef cudaStream_t gpuStream_t;
#endif // USE_CUDA
#ifdef USE_ROCM
#include <hip/hip_runtime.h>
typedef hipStream_t gpuStream_t;
#endif // USE_ROCM
#include <torchaudio/csrc/rnnt/macros.h>
#include <torchaudio/csrc/rnnt/types.h>
@@ -20,9 +15,9 @@ namespace rnnt {
typedef struct Options {
  // the device to compute transducer loss.
  device_t device_;
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
  // the stream to launch kernels in when using GPU.
  gpuStream_t stream_;
  cudaStream_t stream_;
#endif
  // The maximum number of threads that can be used.
  int numThreads_;
......
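With the reverted hunks, Options carried a backend-neutral gpuStream_t; the revert narrows the member back to cudaStream_t under USE_CUDA only. A hedged sketch of how the caller side typically fills it, assuming the c10 stream API pulled in via <c10/cuda/CUDAStream.h> in the compute files above (under a ROCm build of PyTorch that API is itself hipified, so the call shape would be unchanged):

#include <c10/cuda/CUDAStream.h>

Options options;
options.device_ = GPU; // the device_t enumerator checked in workspace.h
// c10::cuda::CUDAStream converts implicitly to the raw cudaStream_t handle.
options.stream_ = c10::cuda::getCurrentCUDAStream();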
@@ -131,22 +131,10 @@ class IntWorkspace {
          ComputeSizeForBetaCounters(options_) * sizeof(int));
    }
#endif // USE_CUDA
#ifdef USE_ROCM
    if (data_ != nullptr && options_.device_ == GPU) {
      hipMemset(
          GetPointerToAlphaCounters(),
          0,
          ComputeSizeForAlphaCounters(options_) * sizeof(int));
      hipMemset(
          GetPointerToBetaCounters(),
          0,
          ComputeSizeForBetaCounters(options_) * sizeof(int));
    }
#endif // USE_ROCM
  }
  static int ComputeSizeForAlphaCounters(const Options& options) { // B * U
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
    if (options.device_ == GPU) {
      return options.BU();
    } else {
@@ -157,7 +145,7 @@ class IntWorkspace {
#endif // USE_CUDA
  }
  static int ComputeSizeForBetaCounters(const Options& options) { // B * U
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
    if (options.device_ == GPU) {
      return options.BU();
    } else {
......
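A hedged sketch of the counter reset that the reverted workspace hunk duplicated for ROCm: the call shape is identical, with hipMemset standing in for cudaMemset. ResetCounters is a hypothetical helper, not a function from the source.

void ResetCounters(int* alpha_counters, int* beta_counters, int alpha_n, int beta_n) {
#ifdef USE_ROCM
  hipMemset(alpha_counters, 0, alpha_n * sizeof(int));
  hipMemset(beta_counters, 0, beta_n * sizeof(int));
#else
  cudaMemset(alpha_counters, 0, alpha_n * sizeof(int));
  cudaMemset(beta_counters, 0, beta_n * sizeof(int));
#endif
}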