Commit 5cf7d2db authored by atalman, committed by Facebook GitHub Bot

Revert "Enable ROCm RNN-T Loss (#2485)" (#3586)

Summary:
This reverts commit c5939616.

Unblocks the 2.1.0 release candidate.

Pull Request resolved: https://github.com/pytorch/audio/pull/3586

Reviewed By: osalpekar

Differential Revision: D48842032

Pulled By: atalman

fbshipit-source-id: bbdf9e45c9aa5fde00f315a2ff491ed050bc1707
parent bf77b2a0
[submodule "third_party/hipify_torch"]
	path = third_party/hipify_torch
	url = https://github.com/ROCmSoftwarePlatform/hipify_torch
@@ -76,11 +76,6 @@ if(USE_ROCM)
  if(NOT PYTORCH_FOUND_HIP)
    set(USE_ROCM OFF)
  endif()
  if(CMAKE_VERSION VERSION_LESS 3.21.0)
    message("Need at least CMake 3.21.0 to compile ROCm support.")
    set(USE_ROCM OFF)
  endif()
endif()
if(USE_CUDA)
@@ -95,11 +90,6 @@ if(USE_CUDA)
  )
endif()
if(USE_ROCM)
  enable_language(HIP)
endif()
find_package(Torch REQUIRED)
include(cmake/TorchAudioHelper.cmake)
# https://github.com/pytorch/pytorch/issues/54174
......
Subproject commit 083ff9b50c7ed861f7f6eddd983cdedb72e8b964
################################################################################
# libtorchaudio
################################################################################
if(USE_ROCM)
  list (APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
  FIND_PACKAGE(HIP REQUIRED)
  MESSAGE(STATUS "hip found ${ROCM_FOUND}")
  list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/third_party/hipify_torch/cmake")
  include(Hipify)
  set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
  set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
  set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
  list( APPEND CMAKE_INSTALL_RPATH "/opt/rocm/llvm/lib" )
  set(OPENMP_LIBRARIES "/opt/rocm/llvm/lib/")
  set(OpenMP_CXX "${CMAKE_CXX_COMPILER}")
  set(OpenMP_CXX_FLAGS "-fopenmp=libomp")
  #set(OpenMP_CXX_LIB_NAMES "omp")
  set(OpenMP_omp_LIBRARY omp)
  find_package(OpenMP REQUIRED)
endif()
set(
  sources
  lfilter.cpp
@@ -62,37 +39,6 @@ if(BUILD_RNNT)
      rnnt/gpu/compute.cu
    )
  endif()
  if (USE_ROCM)
    hipify(CUDA_SOURCE_DIR ${PROJECT_SOURCE_DIR}/torchaudio/csrc/rnnt/gpu HIP_SOURCE_DIR ${PROJECT_SOURCE_DIR}/torchaudio/csrc/rnnt/hip)
    if ( NOT HIP_ADD_LIBRARY_FOUND )
      list(APPEND CMAKE_MODULE_PATH /opt/rocm/hip/cmake)
      find_package(HIP REQUIRED)
    endif()
    list(
      APPEND
      sources
      rnnt/hip/compute_alphas.hip
      rnnt/hip/compute_betas.hip
      rnnt/hip/compute.hip
    )
  endif()
endif()
if(USE_ROCM)
  list(
    APPEND
    additional_libs
    hip::host
    hip::device
    /opt/rocm/llvm/lib/libomp.so
  )
  list(
    APPEND
    compile_definitions
    USE_ROCM
  )
endif()
if(BUILD_RIR)
@@ -141,6 +87,7 @@ endif()
#------------------------------------------------------------------------------#
# END OF CUSTOMIZATION LOGICS
#------------------------------------------------------------------------------#
torchaudio_library(
  libtorchaudio
  "${sources}"
......
#include <c10/cuda/CUDAStream.h>
#include <torch/types.h>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/gpu_transducer_hip.h>
#else
#include <torchaudio/csrc/rnnt/gpu/gpu_transducer.h>
#endif
namespace torchaudio {
namespace rnnt {
......
#include <c10/cuda/CUDAStream.h>
#include <torch/types.h>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/gpu_transducer_hip.h>
#else
#include <torchaudio/csrc/rnnt/gpu/gpu_transducer.h>
#endif
namespace torchaudio {
namespace rnnt {
......
#include <c10/cuda/CUDAStream.h>
#include <torch/types.h>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/gpu_transducer_hip.h>
#else
#include <torchaudio/csrc/rnnt/gpu/gpu_transducer.h>
#endif
namespace torchaudio {
namespace rnnt {
......
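The three translation units above (per the source lists in the CMake hunk, presumably compute_alphas, compute_betas and compute) share one pattern: when hipcc defines __HIP_PLATFORM_AMD__, they include the hipified header that the hipify() call above generates into torchaudio/csrc/rnnt/hip/ from the gpu/ sources; otherwise the original CUDA header is used. A condensed sketch of that pattern, restating only what the hunks show:

#ifdef __HIP_PLATFORM_AMD__
// hipified copy, produced by the hipify() step in the CMake hunk above
#include <torchaudio/csrc/rnnt/hip/gpu_transducer_hip.h>
#else
// original CUDA header
#include <torchaudio/csrc/rnnt/gpu/gpu_transducer.h>
#endif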
@@ -2,11 +2,7 @@
#ifdef USE_CUDA
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/math_hip.cuh>
#else
#include <torchaudio/csrc/rnnt/gpu/math.cuh>
#endif
namespace torchaudio {
namespace rnnt {
@@ -43,11 +39,7 @@ __global__ void ReduceMax2D(
  CAST_DTYPE shf;
  for (int stride = (WARP_SIZE >> 1); stride > 0; stride >>= 1) {
#ifdef __HIP_PLATFORM_AMD__
    shf = __shfl_down(val, stride);
#else
    shf = __shfl_down_sync(0xFFFFFFFF, val, stride);
#endif
    if (threadIdx.x < stride && threadIdx.x + stride < dim) {
      if (shf > val) {
        val = shf;
@@ -89,11 +81,7 @@ __global__ void ReduceLogSumExpGivenMax2D(
  CAST_DTYPE shf;
  for (int stride = (WARP_SIZE >> 1); stride > 0; stride >>= 1) {
#ifdef __HIP_PLATFORM_AMD__
    shf = __shfl_down(val, stride);
#else
    shf = __shfl_down_sync(0xFFFFFFFF, val, stride);
#endif
    if (threadIdx.x < stride && threadIdx.x + stride < dim) {
      val = val + shf;
    }
......
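A condensed sketch of the warp-level max reduction the two kernels above guard; WarpReduceMax is a hypothetical helper, not a function from the source, and it drops the partial-width check (threadIdx.x + stride < dim) that ReduceMax2D applies. The branch exists because HIP's __shfl_down takes no participation mask, while CUDA's __shfl_down_sync does.

template <typename CAST_DTYPE>
__device__ CAST_DTYPE WarpReduceMax(CAST_DTYPE val) {
  // Tree reduction over one warp: each step folds the upper half of the lanes
  // into the lower half.
  for (int stride = (WARP_SIZE >> 1); stride > 0; stride >>= 1) {
#ifdef __HIP_PLATFORM_AMD__
    CAST_DTYPE shf = __shfl_down(val, stride);
#else
    CAST_DTYPE shf = __shfl_down_sync(0xFFFFFFFF, val, stride);
#endif
    if (shf > val) {
      val = shf;
    }
  }
  return val; // lane 0 holds the warp maximum
}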
@@ -4,15 +4,9 @@
#include <cassert>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/kernel_utils.h>
#include <torchaudio/csrc/rnnt/hip/kernels.h>
#include <torchaudio/csrc/rnnt/hip/math_hip.cuh>
#else
#include <torchaudio/csrc/rnnt/gpu/kernel_utils.h>
#include <torchaudio/csrc/rnnt/gpu/kernels.h>
#include <torchaudio/csrc/rnnt/gpu/math.cuh>
#endif
namespace torchaudio {
namespace rnnt {
@@ -132,11 +126,7 @@ __device__ void ComputeAlphas(
#pragma unroll
  for (int i = 1; i < warpSize; i <<= 1) {
#ifdef __HIP_PLATFORM_AMD__
    val = __shfl_up(skip_prob, i);
#else
    val = __shfl_up_sync(0xffffffff, skip_prob, i);
#endif
    if (i <= threadIdx.x) {
      skip_prob = skip_prob + val;
    }
@@ -160,11 +150,7 @@ __device__ void ComputeAlphas(
  CAST_DTYPE out = val;
  for (int i = 1; i < warpSize; ++i) {
#ifdef __HIP_PLATFORM_AMD__
    val = __shfl_up(val, 1);
#else
    val = __shfl_up_sync(0xffffffff, val, 1);
#endif
    if (i == threadIdx.x) {
      val = math::lse(val + skip_prob, emit);
      out = val;
@@ -239,11 +225,7 @@ __device__ void ComputeBetasCosts(
#pragma unroll
  for (int i = 1; i < warpSize; i <<= 1) {
#ifdef __HIP_PLATFORM_AMD__
    val = __shfl_up(skip_prob, i);
#else
    val = __shfl_up_sync(0xffffffff, skip_prob, i);
#endif
    if (i <= threadIdx.x) {
      skip_prob = skip_prob + val;
    }
@@ -266,11 +248,7 @@ __device__ void ComputeBetasCosts(
  CAST_DTYPE out = val;
  for (int i = 1; i < warpSize; ++i) {
#ifdef __HIP_PLATFORM_AMD__
    val = __shfl_up(val, 1);
#else
    val = __shfl_up_sync(0xffffffff, val, 1);
#endif
    if (i == threadIdx.x) {
      val = math::lse(val + skip_prob, emit);
      out = val;
......
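A condensed sketch of the inclusive warp prefix sum that ComputeAlphas and ComputeBetasCosts run over skip_prob in the hunks above; WarpInclusiveSum is a hypothetical helper, not a function from the source. As with the reductions, the ROCm path uses the unmasked __shfl_up.

template <typename CAST_DTYPE>
__device__ CAST_DTYPE WarpInclusiveSum(CAST_DTYPE x) {
  // Hillis-Steele scan: after log2(warpSize) steps, lane t holds the sum of
  // lanes 0..t.
  for (int i = 1; i < warpSize; i <<= 1) {
#ifdef __HIP_PLATFORM_AMD__
    CAST_DTYPE val = __shfl_up(x, i);
#else
    CAST_DTYPE val = __shfl_up_sync(0xffffffff, x, i);
#endif
    if (i <= threadIdx.x) {
      x = x + val;
    }
  }
  return x;
}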
@@ -3,13 +3,8 @@
#ifdef USE_CUDA
#include <torchaudio/csrc/rnnt/workspace.h>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/gpu_kernel_utils_hip.cuh>
#include <torchaudio/csrc/rnnt/hip/gpu_kernels_hip.cuh>
#else
#include <torchaudio/csrc/rnnt/gpu/gpu_kernel_utils.cuh>
#include <torchaudio/csrc/rnnt/gpu/gpu_kernels.cuh>
#endif
namespace torchaudio {
namespace rnnt {
......
@@ -2,11 +2,7 @@
#include <cassert>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/math_hip.cuh>
#else
#include <torchaudio/csrc/rnnt/gpu/math.cuh>
#endif
namespace torchaudio {
namespace rnnt {
......
@@ -2,13 +2,8 @@
#include <cassert>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/kernel_utils.h>
#include <torchaudio/csrc/rnnt/hip/math_hip.cuh>
#else
#include <torchaudio/csrc/rnnt/gpu/kernel_utils.h>
#include <torchaudio/csrc/rnnt/gpu/math.cuh>
#endif
namespace torchaudio {
namespace rnnt {
......
@@ -8,14 +8,6 @@
#define FORCE_INLINE __forceinline__
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#elif USE_ROCM
#define WARP_SIZE 32
#define MAX_THREADS_PER_BLOCK 1024
#define REDUCE_THREADS 256
#define HOST_AND_DEVICE __host__ __device__
#define FORCE_INLINE __forceinline__
#include <hip/hip_fp16.h>
#include <hip/hip_runtime.h>
#else
#define HOST_AND_DEVICE
#define FORCE_INLINE inline
......
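The reverted branch of this header gives ROCm builds the same kernel-facing macros the CUDA branch defines (WARP_SIZE, MAX_THREADS_PER_BLOCK, REDUCE_THREADS, HOST_AND_DEVICE, FORCE_INLINE), backed by the hip/ fp16 and runtime headers. A minimal, hypothetical use of those macros (RoundUp is illustrative, not from the source):

// Compiles as a plain inline function in CPU-only builds and as a
// __host__ __device__ __forceinline__ function under either CUDA or ROCm.
HOST_AND_DEVICE FORCE_INLINE int RoundUp(int x, int multiple) {
  return ((x + multiple - 1) / multiple) * multiple;
}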
@@ -4,12 +4,7 @@
#ifdef USE_CUDA
#include <cuda_runtime.h>
typedef cudaStream_t gpuStream_t;
#endif // USE_CUDA
#ifdef USE_ROCM
#include <hip/hip_runtime.h>
typedef hipStream_t gpuStream_t;
#endif // USE_ROCM
#include <torchaudio/csrc/rnnt/macros.h>
#include <torchaudio/csrc/rnnt/types.h>
@@ -20,9 +15,9 @@ namespace rnnt {
typedef struct Options {
  // the device to compute transducer loss.
  device_t device_;
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
  // the stream to launch kernels in when using GPU.
  gpuStream_t stream_;
  cudaStream_t stream_;
#endif
  // The maximum number of threads that can be used.
  int numThreads_;
......
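With the reverted hunks, Options carried a backend-neutral gpuStream_t; the revert narrows the member back to cudaStream_t under USE_CUDA only. A hedged sketch of how the caller side typically fills it, assuming the c10 stream API pulled in via <c10/cuda/CUDAStream.h> in the compute files above (under a ROCm build of PyTorch that API is itself hipified, so the call shape would be unchanged):

#include <c10/cuda/CUDAStream.h>

Options options;
options.device_ = GPU; // the device_t enumerator checked in workspace.h
// c10::cuda::CUDAStream converts implicitly to the raw cudaStream_t handle.
options.stream_ = c10::cuda::getCurrentCUDAStream();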
@@ -131,22 +131,10 @@ class IntWorkspace {
          ComputeSizeForBetaCounters(options_) * sizeof(int));
    }
#endif // USE_CUDA
#ifdef USE_ROCM
    if (data_ != nullptr && options_.device_ == GPU) {
      hipMemset(
          GetPointerToAlphaCounters(),
          0,
          ComputeSizeForAlphaCounters(options_) * sizeof(int));
      hipMemset(
          GetPointerToBetaCounters(),
          0,
          ComputeSizeForBetaCounters(options_) * sizeof(int));
    }
#endif // USE_ROCM
  }
  static int ComputeSizeForAlphaCounters(const Options& options) { // B * U
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
    if (options.device_ == GPU) {
      return options.BU();
    } else {
@@ -157,7 +145,7 @@ class IntWorkspace {
#endif // USE_CUDA
  }
  static int ComputeSizeForBetaCounters(const Options& options) { // B * U
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
    if (options.device_ == GPU) {
      return options.BU();
    } else {
......
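A hedged sketch of the counter reset that the reverted workspace hunk duplicated for ROCm: the call shape is identical, with hipMemset standing in for cudaMemset. ResetCounters is a hypothetical helper, not a function from the source.

void ResetCounters(int* alpha_counters, int* beta_counters, int alpha_n, int beta_n) {
#ifdef USE_ROCM
  hipMemset(alpha_counters, 0, alpha_n * sizeof(int));
  hipMemset(beta_counters, 0, beta_n * sizeof(int));
#else
  cudaMemset(alpha_counters, 0, alpha_n * sizeof(int));
  cudaMemset(beta_counters, 0, beta_n * sizeof(int));
#endif
}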