Commit 5cf7d2db authored by atalman, committed by Facebook GitHub Bot

Revert "Enable ROCm RNN-T Loss (#2485)" (#3586)

Summary:
This reverts commit c5939616.

Unblocks the 2.1.0 release candidate.

Pull Request resolved: https://github.com/pytorch/audio/pull/3586

Reviewed By: osalpekar

Differential Revision: D48842032

Pulled By: atalman

fbshipit-source-id: bbdf9e45c9aa5fde00f315a2ff491ed050bc1707
parent bf77b2a0
[submodule "third_party/hipify_torch"]
path = third_party/hipify_torch
url = https://github.com/ROCmSoftwarePlatform/hipify_torch
...@@ -76,11 +76,6 @@ if(USE_ROCM)
if(NOT PYTORCH_FOUND_HIP)
set(USE_ROCM OFF)
endif()
if(CMAKE_VERSION VERSION_LESS 3.21.0)
message("Need at least CMake 3.21.0 to compile ROCm support.")
set(USE_ROCM OFF)
endif()
endif()
if(USE_CUDA)
...@@ -95,11 +90,6 @@ if(USE_CUDA)
)
endif()
if(USE_ROCM)
enable_language(HIP)
endif()
find_package(Torch REQUIRED)
include(cmake/TorchAudioHelper.cmake)
# https://github.com/pytorch/pytorch/issues/54174
...
Subproject commit 083ff9b50c7ed861f7f6eddd983cdedb72e8b964
################################################################################
# libtorchaudio
################################################################################
if(USE_ROCM)
list (APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
FIND_PACKAGE(HIP REQUIRED)
MESSAGE(STATUS "hip found ${ROCM_FOUND}")
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/third_party/hipify_torch/cmake")
include(Hipify)
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
list( APPEND CMAKE_INSTALL_RPATH "/opt/rocm/llvm/lib" )
set(OPENMP_LIBRARIES "/opt/rocm/llvm/lib/")
set(OpenMP_CXX "${CMAKE_CXX_COMPILER}")
set(OpenMP_CXX_FLAGS "-fopenmp=libomp")
#set(OpenMP_CXX_LIB_NAMES "omp")
set(OpenMP_omp_LIBRARY omp)
find_package(OpenMP REQUIRED)
endif()
set(
sources
lfilter.cpp
...@@ -62,37 +39,6 @@ if(BUILD_RNNT)
rnnt/gpu/compute.cu
)
endif()
if (USE_ROCM)
hipify(CUDA_SOURCE_DIR ${PROJECT_SOURCE_DIR}/torchaudio/csrc/rnnt/gpu HIP_SOURCE_DIR ${PROJECT_SOURCE_DIR}/torchaudio/csrc/rnnt/hip)
if ( NOT HIP_ADD_LIBRARY_FOUND )
list(APPEND CMAKE_MODULE_PATH /opt/rocm/hip/cmake)
find_package(HIP REQUIRED)
endif()
list(
APPEND
sources
rnnt/hip/compute_alphas.hip
rnnt/hip/compute_betas.hip
rnnt/hip/compute.hip
)
endif()
endif()
if(USE_ROCM)
list(
APPEND
additional_libs
hip::host
hip::device
/opt/rocm/llvm/lib/libomp.so
)
list(
APPEND
compile_definitions
USE_ROCM
)
endif()
if(BUILD_RIR)
...@@ -141,6 +87,7 @@ endif()
#------------------------------------------------------------------------------#
# END OF CUSTOMIZATION LOGICS
#------------------------------------------------------------------------------#
torchaudio_library(
libtorchaudio
"${sources}"
...
#include <c10/cuda/CUDAStream.h>
#include <torch/types.h>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/gpu_transducer_hip.h>
#else
#include <torchaudio/csrc/rnnt/gpu/gpu_transducer.h>
#endif
namespace torchaudio {
namespace rnnt {
...
#include <c10/cuda/CUDAStream.h>
#include <torch/types.h>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/gpu_transducer_hip.h>
#else
#include <torchaudio/csrc/rnnt/gpu/gpu_transducer.h>
#endif
namespace torchaudio {
namespace rnnt {
...
#include <c10/cuda/CUDAStream.h>
#include <torch/types.h>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/gpu_transducer_hip.h>
#else
#include <torchaudio/csrc/rnnt/gpu/gpu_transducer.h>
#endif
namespace torchaudio {
namespace rnnt {
...
...@@ -2,11 +2,7 @@
#ifdef USE_CUDA
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/math_hip.cuh>
#else
#include <torchaudio/csrc/rnnt/gpu/math.cuh>
#endif
namespace torchaudio {
namespace rnnt {
...@@ -43,11 +39,7 @@ __global__ void ReduceMax2D(
CAST_DTYPE shf;
for (int stride = (WARP_SIZE >> 1); stride > 0; stride >>= 1) {
#ifdef __HIP_PLATFORM_AMD__
shf = __shfl_down(val, stride);
#else
shf = __shfl_down_sync(0xFFFFFFFF, val, stride);
#endif
if (threadIdx.x < stride && threadIdx.x + stride < dim) {
if (shf > val) {
val = shf;
...@@ -89,11 +81,7 @@ __global__ void ReduceLogSumExpGivenMax2D(
CAST_DTYPE shf;
for (int stride = (WARP_SIZE >> 1); stride > 0; stride >>= 1) {
#ifdef __HIP_PLATFORM_AMD__
shf = __shfl_down(val, stride);
#else
shf = __shfl_down_sync(0xFFFFFFFF, val, stride);
#endif
if (threadIdx.x < stride && threadIdx.x + stride < dim) {
val = val + shf;
}
...
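For orientation, the two hunks above drop the HIP-only shuffle branch so that only the CUDA __shfl_down_sync path remains. Below is a minimal, self-contained sketch of that warp-level max reduction, assuming a full 32-lane warp; the helper name warp_reduce_max is illustrative and not code from this repository.

```cuda
#include <cuda_runtime.h>

constexpr int kWarpSize = 32;

// Each lane contributes one value; after the loop, lane 0 holds the warp-wide maximum.
__device__ float warp_reduce_max(float val) {
  for (int stride = kWarpSize >> 1; stride > 0; stride >>= 1) {
#ifdef __HIP_PLATFORM_AMD__
    // HIP branch removed by this revert: the legacy intrinsic takes no lane mask.
    float shf = __shfl_down(val, stride);
#else
    // CUDA branch kept by this revert: an explicit full-warp mask is required on Volta and newer.
    float shf = __shfl_down_sync(0xFFFFFFFF, val, stride);
#endif
    if (shf > val) {
      val = shf;
    }
  }
  return val;
}
```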
...@@ -4,15 +4,9 @@
#include <cassert>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/kernel_utils.h>
#include <torchaudio/csrc/rnnt/hip/kernels.h>
#include <torchaudio/csrc/rnnt/hip/math_hip.cuh>
#else
#include <torchaudio/csrc/rnnt/gpu/kernel_utils.h>
#include <torchaudio/csrc/rnnt/gpu/kernels.h>
#include <torchaudio/csrc/rnnt/gpu/math.cuh>
#endif
namespace torchaudio {
namespace rnnt {
...@@ -132,11 +126,7 @@ __device__ void ComputeAlphas(
#pragma unroll
for (int i = 1; i < warpSize; i <<= 1) {
#ifdef __HIP_PLATFORM_AMD__
val = __shfl_up(skip_prob, i);
#else
val = __shfl_up_sync(0xffffffff, skip_prob, i);
#endif
if (i <= threadIdx.x) {
skip_prob = skip_prob + val;
}
...@@ -160,11 +150,7 @@ __device__ void ComputeAlphas(
CAST_DTYPE out = val;
for (int i = 1; i < warpSize; ++i) {
#ifdef __HIP_PLATFORM_AMD__
val = __shfl_up(val, 1);
#else
val = __shfl_up_sync(0xffffffff, val, 1);
#endif
if (i == threadIdx.x) {
val = math::lse(val + skip_prob, emit);
out = val;
...@@ -239,11 +225,7 @@ __device__ void ComputeBetasCosts(
#pragma unroll
for (int i = 1; i < warpSize; i <<= 1) {
#ifdef __HIP_PLATFORM_AMD__
val = __shfl_up(skip_prob, i);
#else
val = __shfl_up_sync(0xffffffff, skip_prob, i);
#endif
if (i <= threadIdx.x) {
skip_prob = skip_prob + val;
}
...@@ -266,11 +248,7 @@ __device__ void ComputeBetasCosts(
CAST_DTYPE out = val;
for (int i = 1; i < warpSize; ++i) {
#ifdef __HIP_PLATFORM_AMD__
val = __shfl_up(val, 1);
#else
val = __shfl_up_sync(0xffffffff, val, 1);
#endif
if (i == threadIdx.x) {
val = math::lse(val + skip_prob, emit);
out = val;
...
...@@ -3,13 +3,8 @@
#ifdef USE_CUDA
#include <torchaudio/csrc/rnnt/workspace.h>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/gpu_kernel_utils_hip.cuh>
#include <torchaudio/csrc/rnnt/hip/gpu_kernels_hip.cuh>
#else
#include <torchaudio/csrc/rnnt/gpu/gpu_kernel_utils.cuh>
#include <torchaudio/csrc/rnnt/gpu/gpu_kernels.cuh>
#endif
namespace torchaudio {
namespace rnnt {
...
...@@ -2,11 +2,7 @@
#include <cassert>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/math_hip.cuh>
#else
#include <torchaudio/csrc/rnnt/gpu/math.cuh>
#endif
namespace torchaudio {
namespace rnnt {
...
...@@ -2,13 +2,8 @@
#include <cassert>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/kernel_utils.h>
#include <torchaudio/csrc/rnnt/hip/math_hip.cuh>
#else
#include <torchaudio/csrc/rnnt/gpu/kernel_utils.h>
#include <torchaudio/csrc/rnnt/gpu/math.cuh>
#endif
namespace torchaudio {
namespace rnnt {
...
...@@ -8,14 +8,6 @@
#define FORCE_INLINE __forceinline__
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#elif USE_ROCM
#define WARP_SIZE 32
#define MAX_THREADS_PER_BLOCK 1024
#define REDUCE_THREADS 256
#define HOST_AND_DEVICE __host__ __device__
#define FORCE_INLINE __forceinline__
#include <hip/hip_fp16.h>
#include <hip/hip_runtime.h>
#else
#define HOST_AND_DEVICE
#define FORCE_INLINE inline
...
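The hunk above reverts the ROCm branch of these platform macros, leaving only the CUDA and CPU definitions. Purely as a hypothetical illustration of how HOST_AND_DEVICE and FORCE_INLINE are consumed (this helper is not taken from the repository; the macro definitions are repeated so the sketch stands alone):

```cuda
#include <cuda_runtime.h>
#include <math.h>

#define HOST_AND_DEVICE __host__ __device__
#define FORCE_INLINE __forceinline__

// The same inline body compiles for host code and for CUDA device code.
HOST_AND_DEVICE FORCE_INLINE float stable_lse(float a, float b) {
  // Numerically stable log(exp(a) + exp(b)).
  const float m = a > b ? a : b;
  return m + logf(expf(a - m) + expf(b - m));
}
```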
...@@ -4,12 +4,7 @@
#ifdef USE_CUDA
#include <cuda_runtime.h>
typedef cudaStream_t gpuStream_t;
#endif // USE_CUDA
#ifdef USE_ROCM
#include <hip/hip_runtime.h>
typedef hipStream_t gpuStream_t;
#endif // USE_ROCM
#include <torchaudio/csrc/rnnt/macros.h>
#include <torchaudio/csrc/rnnt/types.h>
...@@ -20,9 +15,9 @@ namespace rnnt {
typedef struct Options {
// the device to compute transducer loss.
device_t device_;
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
// the stream to launch kernels in when using GPU.
gpuStream_t stream_;
cudaStream_t stream_;
#endif
// The maximum number of threads that can be used.
int numThreads_;
...
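In the hunk above, the stream member goes back from the backend-neutral gpuStream_t alias to a plain cudaStream_t guarded by USE_CUDA. A hedged sketch of how a caller such as the compute.cu shown earlier can still populate it follows; OptionsLike and bind_current_stream are made-up names, not code from this repository, and the only assumed API is c10::cuda::getCurrentCUDAStream(), which returns a c10::cuda::CUDAStream that converts implicitly to cudaStream_t.

```cuda
#include <c10/cuda/CUDAStream.h>
#include <cuda_runtime.h>

// Stand-in for the Options struct in the hunk above.
struct OptionsLike {
  cudaStream_t stream_;
};

// Launch the RNN-T kernels on whatever stream PyTorch is currently using.
inline void bind_current_stream(OptionsLike& options) {
  options.stream_ = c10::cuda::getCurrentCUDAStream();
}
```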
...@@ -131,22 +131,10 @@ class IntWorkspace {
ComputeSizeForBetaCounters(options_) * sizeof(int));
}
#endif // USE_CUDA
#ifdef USE_ROCM
if (data_ != nullptr && options_.device_ == GPU) {
hipMemset(
GetPointerToAlphaCounters(),
0,
ComputeSizeForAlphaCounters(options_) * sizeof(int));
hipMemset(
GetPointerToBetaCounters(),
0,
ComputeSizeForBetaCounters(options_) * sizeof(int));
}
#endif // USE_ROCM
}
static int ComputeSizeForAlphaCounters(const Options& options) { // B * U
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
if (options.device_ == GPU) {
return options.BU();
} else {
...@@ -157,7 +145,7 @@ class IntWorkspace {
#endif // USE_CUDA
}
static int ComputeSizeForBetaCounters(const Options& options) { // B * U
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
if (options.device_ == GPU) {
return options.BU();
} else {
...