Enable ROCm RNN-T Loss (#2485)

Summary: Added HIPIFY code and small changes for ROCm. Targeting RNN-T loss. Pull Request resolved: https://github.com/pytorch/audio/pull/2485 Reviewed By: huangruizhe Differential Revision: D43537864 Pulled By: mthrok fbshipit-source-id: 4bdb1f291dc51a12232ccd072b97ae94ae20cc0c

Enable ROCm RNN-T Loss (#2485)
Summary: Added HIPIFY code and small changes for ROCm. Targeting RNN-T loss. Pull Request resolved: https://github.com/pytorch/audio/pull/2485 Reviewed By: huangruizhe Differential Revision: D43537864 Pulled By: mthrok fbshipit-source-id: 4bdb1f291dc51a12232ccd072b97ae94ae20cc0c
c5939616 · Juan Villamizar · Facebook GitHub Bot · 1638efee · c5939616 · c5939616
Commit c5939616 authored Aug 18, 2023 by Juan Villamizar Committed by Facebook GitHub Bot Aug 18, 2023
15 changed files
--- a/.gitmodules
+++ b/.gitmodules
+[submodule "third_party/hipify_torch"]
+	path = third_party/hipify_torch
+	url = https://github.com/ROCmSoftwarePlatform/hipify_torch
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -76,6 +76,11 @@ if(USE_ROCM)
  if(NOT PYTORCH_FOUND_HIP)
    set(USE_ROCM OFF)
  endif()
+
+  if(CMAKE_VERSION VERSION_LESS 3.21.0)
+    message("Need at least CMake 3.21.0 to compile ROCm support.")
+    set(USE_ROCM OFF)
+  endif()
 endif()

 if(USE_CUDA)
@@ -90,6 +95,11 @@ if(USE_CUDA)
  )
 endif()

+if(USE_ROCM)
+  enable_language(HIP)
+endif()
+
+find_package(Torch REQUIRED)
 include(cmake/TorchAudioHelper.cmake)

 # https://github.com/pytorch/pytorch/issues/54174

--- a/hipify_torch @ 083ff9b5
+++ b/hipify_torch @ 083ff9b5
+Subproject commit 083ff9b50c7ed861f7f6eddd983cdedb72e8b964
--- a/torchaudio/csrc/CMakeLists.txt
+++ b/torchaudio/csrc/CMakeLists.txt
 ################################################################################
 # libtorchaudio
 ################################################################################
+
+if(USE_ROCM)
+  list (APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+  FIND_PACKAGE(HIP REQUIRED)
+  MESSAGE(STATUS "hip found ${ROCM_FOUND}")
+
+  list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/third_party/hipify_torch/cmake")
+  include(Hipify)
+
+  set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+  set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
+  set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
+  list( APPEND CMAKE_INSTALL_RPATH "/opt/rocm/llvm/lib" )
+  set(OPENMP_LIBRARIES "/opt/rocm/llvm/lib/")
+  set(OpenMP_CXX "${CMAKE_CXX_COMPILER}")
+  set(OpenMP_CXX_FLAGS "-fopenmp=libomp")
+  #set(OpenMP_CXX_LIB_NAMES "omp")
+  set(OpenMP_omp_LIBRARY omp)
+  find_package(OpenMP REQUIRED)
+
+endif()
+
+
 set(
  sources
  lfilter.cpp
@@ -39,6 +62,37 @@ if(BUILD_RNNT)
      rnnt/gpu/compute.cu
      )
  endif()
+
+  if (USE_ROCM)
+    hipify(CUDA_SOURCE_DIR ${PROJECT_SOURCE_DIR}/torchaudio/csrc/rnnt/gpu HIP_SOURCE_DIR ${PROJECT_SOURCE_DIR}/torchaudio/csrc/rnnt/hip)
+    if ( NOT HIP_ADD_LIBRARY_FOUND )
+      list(APPEND CMAKE_MODULE_PATH /opt/rocm/hip/cmake)
+      find_package(HIP REQUIRED)
+    endif()
+
+    list(
+      APPEND
+      sources 
+      rnnt/hip/compute_alphas.hip
+      rnnt/hip/compute_betas.hip
+      rnnt/hip/compute.hip
+      )
+  endif()
+endif()
+
+if(USE_ROCM)
+  list(
+    APPEND
+    additional_libs 
+    hip::host
+    hip::device
+    /opt/rocm/llvm/lib/libomp.so
+    )
+  list(
+    APPEND
+    compile_definitions 
+    USE_ROCM
+  )
 endif()

 if(BUILD_RIR)
@@ -87,7 +141,6 @@ endif()
 #------------------------------------------------------------------------------#
 # END OF CUSTOMIZATION LOGICS
 #------------------------------------------------------------------------------#
-
 torchaudio_library(
  libtorchaudio
  "${sources}"

--- a/torchaudio/csrc/rnnt/gpu/compute.cu
+++ b/torchaudio/csrc/rnnt/gpu/compute.cu
 #include <c10/cuda/CUDAStream.h>
 #include <torch/types.h>
+#ifdef __HIP_PLATFORM_AMD__
+#include <torchaudio/csrc/rnnt/hip/gpu_transducer_hip.h>
+#else
 #include <torchaudio/csrc/rnnt/gpu/gpu_transducer.h>
+#endif

 namespace torchaudio {
 namespace rnnt {

--- a/torchaudio/csrc/rnnt/gpu/compute_alphas.cu
+++ b/torchaudio/csrc/rnnt/gpu/compute_alphas.cu
 #include <c10/cuda/CUDAStream.h>
 #include <torch/types.h>
+#ifdef __HIP_PLATFORM_AMD__
+#include <torchaudio/csrc/rnnt/hip/gpu_transducer_hip.h>
+#else
 #include <torchaudio/csrc/rnnt/gpu/gpu_transducer.h>
+#endif

 namespace torchaudio {
 namespace rnnt {

--- a/torchaudio/csrc/rnnt/gpu/compute_betas.cu
+++ b/torchaudio/csrc/rnnt/gpu/compute_betas.cu
 #include <c10/cuda/CUDAStream.h>
 #include <torch/types.h>
+#ifdef __HIP_PLATFORM_AMD__
+#include <torchaudio/csrc/rnnt/hip/gpu_transducer_hip.h>
+#else
 #include <torchaudio/csrc/rnnt/gpu/gpu_transducer.h>
+#endif

 namespace torchaudio {
 namespace rnnt {

--- a/torchaudio/csrc/rnnt/gpu/gpu_kernel_utils.cuh
+++ b/torchaudio/csrc/rnnt/gpu/gpu_kernel_utils.cuh
@@ -2,7 +2,11 @@

 #ifdef USE_CUDA

+#ifdef __HIP_PLATFORM_AMD__
+#include <torchaudio/csrc/rnnt/hip/math_hip.cuh>
+#else
 #include <torchaudio/csrc/rnnt/gpu/math.cuh>
+#endif

 namespace torchaudio {
 namespace rnnt {
@@ -39,7 +43,11 @@ __global__ void ReduceMax2D(

  CAST_DTYPE shf;
  for (int stride = (WARP_SIZE >> 1); stride > 0; stride >>= 1) {
+#ifdef __HIP_PLATFORM_AMD__
+    shf = __shfl_down(val, stride);
+#else
    shf = __shfl_down_sync(0xFFFFFFFF, val, stride);
+#endif
    if (threadIdx.x < stride && threadIdx.x + stride < dim) {
      if (shf > val) {
        val = shf;
@@ -81,7 +89,11 @@ __global__ void ReduceLogSumExpGivenMax2D(

  CAST_DTYPE shf;
  for (int stride = (WARP_SIZE >> 1); stride > 0; stride >>= 1) {
+#ifdef __HIP_PLATFORM_AMD__
+    shf = __shfl_down(val, stride);
+#else
    shf = __shfl_down_sync(0xFFFFFFFF, val, stride);
+#endif
    if (threadIdx.x < stride && threadIdx.x + stride < dim) {
      val = val + shf;
    }

--- a/torchaudio/csrc/rnnt/gpu/gpu_kernels.cuh
+++ b/torchaudio/csrc/rnnt/gpu/gpu_kernels.cuh
@@ -4,9 +4,15 @@

 #include <cassert>

+#ifdef __HIP_PLATFORM_AMD__
+#include <torchaudio/csrc/rnnt/hip/kernel_utils.h>
+#include <torchaudio/csrc/rnnt/hip/kernels.h>
+#include <torchaudio/csrc/rnnt/hip/math_hip.cuh>
+#else
 #include <torchaudio/csrc/rnnt/gpu/kernel_utils.h>
 #include <torchaudio/csrc/rnnt/gpu/kernels.h>
 #include <torchaudio/csrc/rnnt/gpu/math.cuh>
+#endif

 namespace torchaudio {
 namespace rnnt {
@@ -126,7 +132,11 @@ __device__ void ComputeAlphas(

 #pragma unroll
    for (int i = 1; i < warpSize; i <<= 1) {
+#ifdef __HIP_PLATFORM_AMD__
+      val = __shfl_up(skip_prob, i);
+#else
      val = __shfl_up_sync(0xffffffff, skip_prob, i);
+#endif
      if (i <= threadIdx.x) {
        skip_prob = skip_prob + val;
      }
@@ -150,7 +160,11 @@ __device__ void ComputeAlphas(
    CAST_DTYPE out = val;

    for (int i = 1; i < warpSize; ++i) {
+#ifdef __HIP_PLATFORM_AMD__
+      val = __shfl_up(val, 1);
+#else
      val = __shfl_up_sync(0xffffffff, val, 1);
+#endif
      if (i == threadIdx.x) {
        val = math::lse(val + skip_prob, emit);
        out = val;
@@ -225,7 +239,11 @@ __device__ void ComputeBetasCosts(

 #pragma unroll
    for (int i = 1; i < warpSize; i <<= 1) {
+#ifdef __HIP_PLATFORM_AMD__
+      val = __shfl_up(skip_prob, i);
+#else
      val = __shfl_up_sync(0xffffffff, skip_prob, i);
+#endif
      if (i <= threadIdx.x) {
        skip_prob = skip_prob + val;
      }
@@ -248,7 +266,11 @@ __device__ void ComputeBetasCosts(
    CAST_DTYPE out = val;

    for (int i = 1; i < warpSize; ++i) {
+#ifdef __HIP_PLATFORM_AMD__
+      val = __shfl_up(val, 1);
+#else
      val = __shfl_up_sync(0xffffffff, val, 1);
+#endif
      if (i == threadIdx.x) {
        val = math::lse(val + skip_prob, emit);
        out = val;

--- a/torchaudio/csrc/rnnt/gpu/gpu_transducer.h
+++ b/torchaudio/csrc/rnnt/gpu/gpu_transducer.h
@@ -3,8 +3,13 @@
 #ifdef USE_CUDA

 #include <torchaudio/csrc/rnnt/workspace.h>
+#ifdef __HIP_PLATFORM_AMD__
+#include <torchaudio/csrc/rnnt/hip/gpu_kernel_utils_hip.cuh>
+#include <torchaudio/csrc/rnnt/hip/gpu_kernels_hip.cuh>
+#else
 #include <torchaudio/csrc/rnnt/gpu/gpu_kernel_utils.cuh>
 #include <torchaudio/csrc/rnnt/gpu/gpu_kernels.cuh>
+#endif

 namespace torchaudio {
 namespace rnnt {

--- a/torchaudio/csrc/rnnt/gpu/kernel_utils.h
+++ b/torchaudio/csrc/rnnt/gpu/kernel_utils.h
@@ -2,7 +2,11 @@

 #include <cassert>

+#ifdef __HIP_PLATFORM_AMD__
+#include <torchaudio/csrc/rnnt/hip/math_hip.cuh>
+#else
 #include <torchaudio/csrc/rnnt/gpu/math.cuh>
+#endif

 namespace torchaudio {
 namespace rnnt {

--- a/torchaudio/csrc/rnnt/gpu/kernels.h
+++ b/torchaudio/csrc/rnnt/gpu/kernels.h
@@ -2,8 +2,13 @@

 #include <cassert>

+#ifdef __HIP_PLATFORM_AMD__
+#include <torchaudio/csrc/rnnt/hip/kernel_utils.h>
+#include <torchaudio/csrc/rnnt/hip/math_hip.cuh>
+#else
 #include <torchaudio/csrc/rnnt/gpu/kernel_utils.h>
 #include <torchaudio/csrc/rnnt/gpu/math.cuh>
+#endif

 namespace torchaudio {
 namespace rnnt {

--- a/torchaudio/csrc/rnnt/macros.h
+++ b/torchaudio/csrc/rnnt/macros.h
@@ -8,6 +8,14 @@
 #define FORCE_INLINE __forceinline__
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
+#elif USE_ROCM
+#define WARP_SIZE 32
+#define MAX_THREADS_PER_BLOCK 1024
+#define REDUCE_THREADS 256
+#define HOST_AND_DEVICE __host__ __device__
+#define FORCE_INLINE __forceinline__
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
 #else
 #define HOST_AND_DEVICE
 #define FORCE_INLINE inline

--- a/torchaudio/csrc/rnnt/options.h
+++ b/torchaudio/csrc/rnnt/options.h
@@ -4,7 +4,12 @@

 #ifdef USE_CUDA
 #include <cuda_runtime.h>
+typedef cudaStream_t gpuStream_t;
 #endif // USE_CUDA
+#ifdef USE_ROCM
+#include <hip/hip_runtime.h>
+typedef hipStream_t gpuStream_t;
+#endif // USE_ROCM

 #include <torchaudio/csrc/rnnt/macros.h>
 #include <torchaudio/csrc/rnnt/types.h>
@@ -15,9 +20,9 @@ namespace rnnt {
 typedef struct Options {
  // the device to compute transducer loss.
  device_t device_;
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
  // the stream to launch kernels in when using GPU.
-  cudaStream_t stream_;
+  gpuStream_t stream_;
 #endif
  // The maximum number of threads that can be used.
  int numThreads_;

--- a/torchaudio/csrc/rnnt/workspace.h
+++ b/torchaudio/csrc/rnnt/workspace.h
@@ -131,10 +131,22 @@ class IntWorkspace {
          ComputeSizeForBetaCounters(options_) * sizeof(int));
    }
 #endif // USE_CUDA
+#ifdef USE_ROCM
+    if (data_ != nullptr && options_.device_ == GPU) {
+      hipMemset(
+          GetPointerToAlphaCounters(),
+          0,
+          ComputeSizeForAlphaCounters(options_) * sizeof(int));
+      hipMemset(
+          GetPointerToBetaCounters(),
+          0,
+          ComputeSizeForBetaCounters(options_) * sizeof(int));
+    }
+#endif // USE_ROCM
  }

  static int ComputeSizeForAlphaCounters(const Options& options) { // B * U
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
    if (options.device_ == GPU) {
      return options.BU();
    } else {
@@ -145,7 +157,7 @@ class IntWorkspace {
 #endif // USE_CUDA
  }
  static int ComputeSizeForBetaCounters(const Options& options) { // B * U
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
    if (options.device_ == GPU) {
      return options.BU();
    } else {