Commit c5939616 authored by Juan Villamizar's avatar Juan Villamizar Committed by Facebook GitHub Bot
Browse files

Enable ROCm RNN-T Loss (#2485)

Summary:
Added HIPIFY code and small changes for ROCm. Targeting RNN-T loss.

Pull Request resolved: https://github.com/pytorch/audio/pull/2485

Reviewed By: huangruizhe

Differential Revision: D43537864

Pulled By: mthrok

fbshipit-source-id: 4bdb1f291dc51a12232ccd072b97ae94ae20cc0c
parent 1638efee
[submodule "third_party/hipify_torch"]
path = third_party/hipify_torch
url = https://github.com/ROCmSoftwarePlatform/hipify_torch
......@@ -76,6 +76,11 @@ if(USE_ROCM)
if(NOT PYTORCH_FOUND_HIP)
set(USE_ROCM OFF)
endif()
if(CMAKE_VERSION VERSION_LESS 3.21.0)
message("Need at least CMake 3.21.0 to compile ROCm support.")
set(USE_ROCM OFF)
endif()
endif()
if(USE_CUDA)
......@@ -90,6 +95,11 @@ if(USE_CUDA)
)
endif()
if(USE_ROCM)
enable_language(HIP)
endif()
find_package(Torch REQUIRED)
include(cmake/TorchAudioHelper.cmake)
# https://github.com/pytorch/pytorch/issues/54174
......
Subproject commit 083ff9b50c7ed861f7f6eddd983cdedb72e8b964
################################################################################
# libtorchaudio
################################################################################
if(USE_ROCM)
list (APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
FIND_PACKAGE(HIP REQUIRED)
MESSAGE(STATUS "hip found ${ROCM_FOUND}")
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/third_party/hipify_torch/cmake")
include(Hipify)
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
list( APPEND CMAKE_INSTALL_RPATH "/opt/rocm/llvm/lib" )
set(OPENMP_LIBRARIES "/opt/rocm/llvm/lib/")
set(OpenMP_CXX "${CMAKE_CXX_COMPILER}")
set(OpenMP_CXX_FLAGS "-fopenmp=libomp")
#set(OpenMP_CXX_LIB_NAMES "omp")
set(OpenMP_omp_LIBRARY omp)
find_package(OpenMP REQUIRED)
endif()
set(
sources
lfilter.cpp
......@@ -39,6 +62,37 @@ if(BUILD_RNNT)
rnnt/gpu/compute.cu
)
endif()
if (USE_ROCM)
hipify(CUDA_SOURCE_DIR ${PROJECT_SOURCE_DIR}/torchaudio/csrc/rnnt/gpu HIP_SOURCE_DIR ${PROJECT_SOURCE_DIR}/torchaudio/csrc/rnnt/hip)
if ( NOT HIP_ADD_LIBRARY_FOUND )
list(APPEND CMAKE_MODULE_PATH /opt/rocm/hip/cmake)
find_package(HIP REQUIRED)
endif()
list(
APPEND
sources
rnnt/hip/compute_alphas.hip
rnnt/hip/compute_betas.hip
rnnt/hip/compute.hip
)
endif()
endif()
if(USE_ROCM)
list(
APPEND
additional_libs
hip::host
hip::device
/opt/rocm/llvm/lib/libomp.so
)
list(
APPEND
compile_definitions
USE_ROCM
)
endif()
if(BUILD_RIR)
......@@ -87,7 +141,6 @@ endif()
#------------------------------------------------------------------------------#
# END OF CUSTOMIZATION LOGICS
#------------------------------------------------------------------------------#
torchaudio_library(
libtorchaudio
"${sources}"
......
#include <c10/cuda/CUDAStream.h>
#include <torch/types.h>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/gpu_transducer_hip.h>
#else
#include <torchaudio/csrc/rnnt/gpu/gpu_transducer.h>
#endif
namespace torchaudio {
namespace rnnt {
......
#include <c10/cuda/CUDAStream.h>
#include <torch/types.h>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/gpu_transducer_hip.h>
#else
#include <torchaudio/csrc/rnnt/gpu/gpu_transducer.h>
#endif
namespace torchaudio {
namespace rnnt {
......
#include <c10/cuda/CUDAStream.h>
#include <torch/types.h>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/gpu_transducer_hip.h>
#else
#include <torchaudio/csrc/rnnt/gpu/gpu_transducer.h>
#endif
namespace torchaudio {
namespace rnnt {
......
......@@ -2,7 +2,11 @@
#ifdef USE_CUDA
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/math_hip.cuh>
#else
#include <torchaudio/csrc/rnnt/gpu/math.cuh>
#endif
namespace torchaudio {
namespace rnnt {
......@@ -39,7 +43,11 @@ __global__ void ReduceMax2D(
CAST_DTYPE shf;
for (int stride = (WARP_SIZE >> 1); stride > 0; stride >>= 1) {
#ifdef __HIP_PLATFORM_AMD__
shf = __shfl_down(val, stride);
#else
shf = __shfl_down_sync(0xFFFFFFFF, val, stride);
#endif
if (threadIdx.x < stride && threadIdx.x + stride < dim) {
if (shf > val) {
val = shf;
......@@ -81,7 +89,11 @@ __global__ void ReduceLogSumExpGivenMax2D(
CAST_DTYPE shf;
for (int stride = (WARP_SIZE >> 1); stride > 0; stride >>= 1) {
#ifdef __HIP_PLATFORM_AMD__
shf = __shfl_down(val, stride);
#else
shf = __shfl_down_sync(0xFFFFFFFF, val, stride);
#endif
if (threadIdx.x < stride && threadIdx.x + stride < dim) {
val = val + shf;
}
......
......@@ -4,9 +4,15 @@
#include <cassert>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/kernel_utils.h>
#include <torchaudio/csrc/rnnt/hip/kernels.h>
#include <torchaudio/csrc/rnnt/hip/math_hip.cuh>
#else
#include <torchaudio/csrc/rnnt/gpu/kernel_utils.h>
#include <torchaudio/csrc/rnnt/gpu/kernels.h>
#include <torchaudio/csrc/rnnt/gpu/math.cuh>
#endif
namespace torchaudio {
namespace rnnt {
......@@ -126,7 +132,11 @@ __device__ void ComputeAlphas(
#pragma unroll
for (int i = 1; i < warpSize; i <<= 1) {
#ifdef __HIP_PLATFORM_AMD__
val = __shfl_up(skip_prob, i);
#else
val = __shfl_up_sync(0xffffffff, skip_prob, i);
#endif
if (i <= threadIdx.x) {
skip_prob = skip_prob + val;
}
......@@ -150,7 +160,11 @@ __device__ void ComputeAlphas(
CAST_DTYPE out = val;
for (int i = 1; i < warpSize; ++i) {
#ifdef __HIP_PLATFORM_AMD__
val = __shfl_up(val, 1);
#else
val = __shfl_up_sync(0xffffffff, val, 1);
#endif
if (i == threadIdx.x) {
val = math::lse(val + skip_prob, emit);
out = val;
......@@ -225,7 +239,11 @@ __device__ void ComputeBetasCosts(
#pragma unroll
for (int i = 1; i < warpSize; i <<= 1) {
#ifdef __HIP_PLATFORM_AMD__
val = __shfl_up(skip_prob, i);
#else
val = __shfl_up_sync(0xffffffff, skip_prob, i);
#endif
if (i <= threadIdx.x) {
skip_prob = skip_prob + val;
}
......@@ -248,7 +266,11 @@ __device__ void ComputeBetasCosts(
CAST_DTYPE out = val;
for (int i = 1; i < warpSize; ++i) {
#ifdef __HIP_PLATFORM_AMD__
val = __shfl_up(val, 1);
#else
val = __shfl_up_sync(0xffffffff, val, 1);
#endif
if (i == threadIdx.x) {
val = math::lse(val + skip_prob, emit);
out = val;
......
......@@ -3,8 +3,13 @@
#ifdef USE_CUDA
#include <torchaudio/csrc/rnnt/workspace.h>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/gpu_kernel_utils_hip.cuh>
#include <torchaudio/csrc/rnnt/hip/gpu_kernels_hip.cuh>
#else
#include <torchaudio/csrc/rnnt/gpu/gpu_kernel_utils.cuh>
#include <torchaudio/csrc/rnnt/gpu/gpu_kernels.cuh>
#endif
namespace torchaudio {
namespace rnnt {
......
......@@ -2,7 +2,11 @@
#include <cassert>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/math_hip.cuh>
#else
#include <torchaudio/csrc/rnnt/gpu/math.cuh>
#endif
namespace torchaudio {
namespace rnnt {
......
......@@ -2,8 +2,13 @@
#include <cassert>
#ifdef __HIP_PLATFORM_AMD__
#include <torchaudio/csrc/rnnt/hip/kernel_utils.h>
#include <torchaudio/csrc/rnnt/hip/math_hip.cuh>
#else
#include <torchaudio/csrc/rnnt/gpu/kernel_utils.h>
#include <torchaudio/csrc/rnnt/gpu/math.cuh>
#endif
namespace torchaudio {
namespace rnnt {
......
......@@ -8,6 +8,14 @@
#define FORCE_INLINE __forceinline__
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#elif USE_ROCM
#define WARP_SIZE 32
#define MAX_THREADS_PER_BLOCK 1024
#define REDUCE_THREADS 256
#define HOST_AND_DEVICE __host__ __device__
#define FORCE_INLINE __forceinline__
#include <hip/hip_fp16.h>
#include <hip/hip_runtime.h>
#else
#define HOST_AND_DEVICE
#define FORCE_INLINE inline
......
......@@ -4,7 +4,12 @@
#ifdef USE_CUDA
#include <cuda_runtime.h>
typedef cudaStream_t gpuStream_t;
#endif // USE_CUDA
#ifdef USE_ROCM
#include <hip/hip_runtime.h>
typedef hipStream_t gpuStream_t;
#endif // USE_ROCM
#include <torchaudio/csrc/rnnt/macros.h>
#include <torchaudio/csrc/rnnt/types.h>
......@@ -15,9 +20,9 @@ namespace rnnt {
typedef struct Options {
// the device to compute transducer loss.
device_t device_;
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
// the stream to launch kernels in when using GPU.
cudaStream_t stream_;
gpuStream_t stream_;
#endif
// The maximum number of threads that can be used.
int numThreads_;
......
......@@ -131,10 +131,22 @@ class IntWorkspace {
ComputeSizeForBetaCounters(options_) * sizeof(int));
}
#endif // USE_CUDA
#ifdef USE_ROCM
if (data_ != nullptr && options_.device_ == GPU) {
hipMemset(
GetPointerToAlphaCounters(),
0,
ComputeSizeForAlphaCounters(options_) * sizeof(int));
hipMemset(
GetPointerToBetaCounters(),
0,
ComputeSizeForBetaCounters(options_) * sizeof(int));
}
#endif // USE_ROCM
}
static int ComputeSizeForAlphaCounters(const Options& options) { // B * U
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
if (options.device_ == GPU) {
return options.BU();
} else {
......@@ -145,7 +157,7 @@ class IntWorkspace {
#endif // USE_CUDA
}
static int ComputeSizeForBetaCounters(const Options& options) { // B * U
#ifdef USE_CUDA
#if defined(USE_CUDA) || defined(USE_ROCM)
if (options.device_ == GPU) {
return options.BU();
} else {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment