Commit ca9dbdb2 authored by yuguo-Jack

new features

parent bb99f03d
@@ -21,6 +21,7 @@
 #include "paddle/phi/kernels/funcs/math_function.h"
 PD_DECLARE_bool(enable_cublas_tensor_op_math);
+PD_DECLARE_bool(gemm_use_half_precision_compute_type);
 namespace phi {
 namespace funcs {
@@ -703,6 +704,13 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
   float h_alpha = static_cast<float>(alpha);
   float h_beta = static_cast<float>(beta);
+  rocblas_datatype compute_type = rocblas_datatype_f32_r;
+  if (FLAGS_gemm_use_half_precision_compute_type == true) {
+    compute_type = rocblas_datatype_f16_r;
+  }
+  VLOG(4) << "use_half_precision_compute_type: "
+          << FLAGS_gemm_use_half_precision_compute_type;
   auto &cuda_ctx = const_cast<phi::GPUContext &>(context_);
   CUBlas<phi::dtype::float16>::GEMM_EX(&cuda_ctx,
                                        cuTransB,
@@ -721,7 +729,7 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
                                        C,
                                        rocblas_datatype_f16_r,
                                        N,
-                                       rocblas_datatype_f32_r);
+                                       compute_type);
 }
 template <>
...
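The GEMM hunks above thread the new FLAGS_gemm_use_half_precision_compute_type flag into the compute-type argument of the existing rocBLAS GEMM_EX call: FP32 accumulation stays the default, and FP16 accumulation becomes opt-in. A minimal sketch of that selection logic outside of Paddle, assuming only the rocBLAS header (the helper name is illustrative):

#include <rocblas.h>  // provides rocblas_datatype; newer ROCm installs use <rocblas/rocblas.h>

// Pick the accumulation (compute) type for an FP16 GEMM.
// FP32 is the numerically safer default; FP16 trades accuracy for speed.
inline rocblas_datatype PickGemmComputeType(bool use_half_precision_compute) {
  return use_half_precision_compute ? rocblas_datatype_f16_r
                                    : rocblas_datatype_f32_r;
}

The chosen value is what replaces the hard-coded rocblas_datatype_f32_r in the hunk just above.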
@@ -661,14 +661,14 @@ void BatchNormGradFunctor(const Context &ctx,
   // ------------------- cudnn descriptors ---------------------
 #ifdef PADDLE_WITH_HIP
   // TODO(wangran16): wait for MIOpen to improve the performance of BN
-  // miopenTensorDescriptor_t data_desc_;
-  // miopenTensorDescriptor_t bn_param_desc_;
-  // miopenBatchNormMode_t mode_;
-  // PADDLE_ENFORCE_GPU_SUCCESS(
-  //     platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
-  // PADDLE_ENFORCE_GPU_SUCCESS(
-  //     platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
+  miopenTensorDescriptor_t data_desc_;
+  miopenTensorDescriptor_t bn_param_desc_;
+  miopenBatchNormMode_t mode_;
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenCreateTensorDescriptor(&data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
 #else
   cudnnTensorDescriptor_t data_desc_;
   cudnnTensorDescriptor_t bn_param_desc_;
@@ -687,7 +687,11 @@ void BatchNormGradFunctor(const Context &ctx,
   epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
 #ifdef PADDLE_WITH_HIP
   // TODO(wangran16): wait for MIOpen to improve the performance of BN
-  // mode_ = miopenBNSpatial;
+  if (H == 1 && W == 1) {
+    mode_ = miopenBNPerActivation;
+  } else {
+    mode_ = miopenBNSpatial;
+  }
 #elif CUDNN_VERSION_MIN(7, 0, 1)
   if (FLAGS_cudnn_batchnorm_spatial_persistent) {
     mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
@@ -706,13 +710,13 @@ void BatchNormGradFunctor(const Context &ctx,
 #ifdef PADDLE_WITH_HIP
   // TODO(wangran16): wait for MIOpen to improve the performance of BN
-  // PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
-  //     data_desc_, CudnnDataType<T>::type,
-  //     x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
-  //     const_cast<int *>(strides.data())));
-  // PADDLE_ENFORCE_GPU_SUCCESS(
-  //     platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_,
-  //                                                       data_desc_, mode_));
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor(
+      data_desc_, CudnnDataType<T>::type,
+      x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
+      const_cast<int *>(strides.data())));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_,
+                                                   data_desc_, mode_));
 #else
   PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor(
       data_desc_,
@@ -750,7 +754,23 @@ void BatchNormGradFunctor(const Context &ctx,
     if (d_x && d_scale && d_bias) {
 #ifdef PADDLE_WITH_HIP
       if (compute_format == DataLayout::kNCHW) {
-        BNBackward<T, block, DataLayout::kNCHW>
+        if (FLAGS_cudnn_batchnorm_spatial_persistent == true) {
+          PADDLE_ENFORCE_GPU_SUCCESS(
+              phi::dynload::miopenBatchNormalizationBackward(
+                  ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
+                  CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
+                  CudnnDataType<T>::kZero(), data_desc_,
+                  transformed_x.template data<T>(), data_desc_,
+                  transformed_d_y.template data<T>(), data_desc_,
+                  transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
+                  bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
+                  d_scale->template mutable_data<BatchNormParamType<T>>(
+                      ctx.GetPlace()),
+                  d_bias->template mutable_data<BatchNormParamType<T>>(
+                      ctx.GetPlace()),
+                  epsilon, saved_mean_data, saved_var_data));
+        } else {
+          BNBackward<T, block, DataLayout::kNCHW>
             <<<grid2, block, 0, ctx.stream()>>>(
                 transformed_d_y.template data<T>(),
                 transformed_x.template data<T>(),
@@ -764,38 +784,41 @@ void BatchNormGradFunctor(const Context &ctx,
                 transformed_d_x.template data<T>(),
                 ctx.template Alloc<BatchNormParamType<T>>(d_scale),
                 ctx.template Alloc<BatchNormParamType<T>>(d_bias));
+        }
       } else {
-        BNBackward<T, block, DataLayout::kNHWC>
-            <<<grid2, block, 0, ctx.stream()>>>(
-                transformed_d_y.template data<T>(),
-                transformed_x.template data<T>(),
-                new_scale.template data<BatchNormParamType<T>>(),
-                saved_mean_data,
-                saved_var_data,
-                C,
-                N,
-                H * W * D,
-                epsilon,
-                transformed_d_x.template data<T>(),
-                ctx.template Alloc<BatchNormParamType<T>>(d_scale),
-                ctx.template Alloc<BatchNormParamType<T>>(d_bias));
+        if (FLAGS_cudnn_batchnorm_spatial_persistent == true) {
+          PADDLE_ENFORCE_GPU_SUCCESS(
+              phi::dynload::miopenBatchNormalizationBackward(
+                  ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
+                  CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
+                  CudnnDataType<T>::kZero(), data_desc_,
+                  transformed_x.template data<T>(), data_desc_,
+                  transformed_d_y.template data<T>(), data_desc_,
+                  transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
+                  bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
+                  d_scale->template mutable_data<BatchNormParamType<T>>(
+                      ctx.GetPlace()),
+                  d_bias->template mutable_data<BatchNormParamType<T>>(
+                      ctx.GetPlace()),
+                  epsilon, saved_mean_data, saved_var_data));
+        } else {
+          BNBackward<T, block, DataLayout::kNHWC>
+              <<<grid2, block, 0, ctx.stream()>>>(
+                  transformed_d_y.template data<T>(),
+                  transformed_x.template data<T>(),
+                  new_scale.template data<BatchNormParamType<T>>(),
+                  saved_mean_data,
+                  saved_var_data,
+                  C,
+                  N,
+                  H * W * D,
+                  epsilon,
+                  transformed_d_x.template data<T>(),
+                  ctx.template Alloc<BatchNormParamType<T>>(d_scale),
+                  ctx.template Alloc<BatchNormParamType<T>>(d_bias));
+        }
       }
-      // TODO(wangran16): wait for MIOpen to improve the performance of BN
-      // PADDLE_ENFORCE_GPU_SUCCESS(
-      //     platform::dynload::miopenBatchNormalizationBackward(
-      //         dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
-      //         CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
-      //         CudnnDataType<T>::kZero(), data_desc_,
-      //         transformed_x.template data<T>(), data_desc_,
-      //         transformed_d_y.template data<T>(), data_desc_,
-      //         transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
-      //         bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
-      //         d_scale->template mutable_data<BatchNormParamType<T>>(
-      //             ctx.GetPlace()),
-      //         d_bias->template mutable_data<BatchNormParamType<T>>(
-      //             ctx.GetPlace()),
-      //         epsilon, saved_mean_data, saved_var_data));
 #else
     }
     // CUDNN only support small batch size
@@ -1129,10 +1152,10 @@ void BatchNormGradFunctor(const Context &ctx,
 #ifdef PADDLE_WITH_HIP
   // TODO(wangran16): wait for MIOpen to improve the performance of BN
   // clean when exit.
-  // PADDLE_ENFORCE_GPU_SUCCESS(
-  //     platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
-  // PADDLE_ENFORCE_GPU_SUCCESS(
-  //     platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenDestroyTensorDescriptor(data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
 #else
   // clean when exit.
   PADDLE_ENFORCE_GPU_SUCCESS(
...
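Both batch-norm kernels replace the commented-out MIOpen stubs with live descriptor handling: create the tensor and BN-parameter descriptors, choose the BN mode from the spatial extent (per-activation when H == 1 && W == 1, spatial otherwise), and destroy the descriptors on exit. A condensed sketch of that lifecycle against the raw MIOpen C API, with status checks omitted (Paddle routes the same calls through phi::dynload and PADDLE_ENFORCE_GPU_SUCCESS):

#include <miopen/miopen.h>
#include <vector>

void DescribeBatchNorm(int n, int c, int h, int w) {
  miopenTensorDescriptor_t data_desc, bn_param_desc;
  miopenCreateTensorDescriptor(&data_desc);
  miopenCreateTensorDescriptor(&bn_param_desc);

  // NCHW dims and strides for the input tensor.
  std::vector<int> dims = {n, c, h, w};
  std::vector<int> strides = {c * h * w, h * w, w, 1};
  miopenSetTensorDescriptor(data_desc, miopenFloat, 4, dims.data(),
                            strides.data());

  // Same heuristic as the patch: a 1x1 spatial extent means per-activation BN.
  miopenBatchNormMode_t mode =
      (h == 1 && w == 1) ? miopenBNPerActivation : miopenBNSpatial;
  miopenDeriveBNTensorDescriptor(bn_param_desc, data_desc, mode);

  // ... miopenBatchNormalizationForward*/Backward would run here ...

  miopenDestroyTensorDescriptor(bn_param_desc);
  miopenDestroyTensorDescriptor(data_desc);
}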
@@ -604,14 +604,14 @@ void BatchNormKernel(const Context &ctx,
   // ------------------- cudnn descriptors ---------------------
 #ifdef PADDLE_WITH_HIP
   // TODO(wangran16): wait for MIOpen to improve the performance of BN
-  // miopenTensorDescriptor_t data_desc_;
-  // miopenTensorDescriptor_t bn_param_desc_;
-  // miopenBatchNormMode_t mode_;
-  // PADDLE_ENFORCE_GPU_SUCCESS(
-  //     platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
-  // PADDLE_ENFORCE_GPU_SUCCESS(
-  //     platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
+  miopenTensorDescriptor_t data_desc_;
+  miopenTensorDescriptor_t bn_param_desc_;
+  miopenBatchNormMode_t mode_;
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenCreateTensorDescriptor(&data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
 #else
   cudnnTensorDescriptor_t data_desc_;
   cudnnTensorDescriptor_t bn_param_desc_;
@@ -632,7 +632,11 @@ void BatchNormKernel(const Context &ctx,
 #ifdef PADDLE_WITH_HIP
   // TODO(wangran16): wait for MIOpen to improve the performance of BN
-  // mode_ = miopenBNSpatial;
+  if (H == 1 && W == 1) {
+    mode_ = miopenBNPerActivation;
+  } else {
+    mode_ = miopenBNSpatial;
+  }
 #elif CUDNN_VERSION_MIN(7, 0, 1)
   if (FLAGS_cudnn_batchnorm_spatial_persistent) {
     mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
@@ -662,14 +666,14 @@ void BatchNormKernel(const Context &ctx,
 #ifdef PADDLE_WITH_HIP
   // TODO(wangran16): wait for MIOpen to improve the performance of BN
-  // PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
-  //     data_desc_, CudnnDataType<T>::type,
-  //     x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
-  //     const_cast<int *>(strides.data())));
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor(
+      data_desc_, CudnnDataType<T>::type,
+      x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
+      const_cast<int *>(strides.data())));
   // Note: PERSISTENT not implemented for inference
-  // PADDLE_ENFORCE_GPU_SUCCESS(
-  //     platform::dynload::miopenDeriveBNTensorDescriptor(
-  //         bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenDeriveBNTensorDescriptor(
+          bn_param_desc_, data_desc_, mode_));
 #else
   PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor(
       data_desc_,
@@ -736,7 +740,31 @@ void BatchNormKernel(const Context &ctx,
       const int block_size = 256;
       const int grid_size = (N * C * H * W * D + block_size - 1) / block_size;
       if (compute_format == DataLayout::kNCHW) {
-        BNForwardInference<T, DataLayout::kNCHW>
+        if (FLAGS_cudnn_batchnorm_spatial_persistent == true) {
+          PADDLE_ENFORCE_GPU_SUCCESS(
+              phi::dynload::miopenBatchNormalizationForwardInference(
+                  handle, mode_,
+                  const_cast<void *>(
+                      static_cast<const void *>(CudnnDataType<T>::kOne())),
+                  const_cast<void *>(
+                      static_cast<const void *>(CudnnDataType<T>::kZero())),
+                  data_desc_,
+                  static_cast<const void *>(transformed_x.template data<T>()),
+                  data_desc_,
+                  static_cast<void *>(
+                      transformed_y.template mutable_data<T>(ctx.GetPlace())),
+                  bn_param_desc_,
+                  const_cast<void *>(static_cast<const void *>(
+                      scale->template data<BatchNormParamType<T>>())),
+                  const_cast<void *>(static_cast<const void *>(
+                      bias->template data<BatchNormParamType<T>>())),
+                  const_cast<void *>(static_cast<const void *>(
+                      est_mean->template data<BatchNormParamType<T>>())),
+                  const_cast<void *>(static_cast<const void *>(
+                      est_var->template data<BatchNormParamType<T>>())),
+                  epsilon));
+        } else {
+          BNForwardInference<T, DataLayout::kNCHW>
             <<<grid_size, block_size, 0, ctx.stream()>>>(
                 transformed_x.template data<T>(),
                 est_mean->template data<BatchNormParamType<T>>(),
@@ -748,43 +776,47 @@ void BatchNormKernel(const Context &ctx,
                 H * W * D,
                 epsilon,
                 transformed_y.template data<T>());
+        }
       } else {
-        BNForwardInference<T, DataLayout::kNHWC>
-            <<<grid_size, block_size, 0, ctx.stream()>>>(
-                transformed_x.template data<T>(),
-                est_mean->template data<BatchNormParamType<T>>(),
-                est_var->template data<BatchNormParamType<T>>(),
-                new_scale.template data<BatchNormParamType<T>>(),
-                new_bias.template data<BatchNormParamType<T>>(),
-                C,
-                N,
-                H * W * D,
-                epsilon,
-                transformed_y.template data<T>());
+        if (FLAGS_cudnn_batchnorm_spatial_persistent == true) {
+          PADDLE_ENFORCE_GPU_SUCCESS(
+              phi::dynload::miopenBatchNormalizationForwardInference(
+                  handle, mode_,
+                  const_cast<void *>(
+                      static_cast<const void *>(CudnnDataType<T>::kOne())),
+                  const_cast<void *>(
+                      static_cast<const void *>(CudnnDataType<T>::kZero())),
+                  data_desc_,
+                  static_cast<const void *>(transformed_x.template data<T>()),
+                  data_desc_,
+                  static_cast<void *>(
+                      transformed_y.template mutable_data<T>(ctx.GetPlace())),
+                  bn_param_desc_,
+                  const_cast<void *>(static_cast<const void *>(
+                      scale->template data<BatchNormParamType<T>>())),
+                  const_cast<void *>(static_cast<const void *>(
+                      bias->template data<BatchNormParamType<T>>())),
+                  const_cast<void *>(static_cast<const void *>(
+                      est_mean->template data<BatchNormParamType<T>>())),
+                  const_cast<void *>(static_cast<const void *>(
+                      est_var->template data<BatchNormParamType<T>>())),
+                  epsilon));
+        } else {
+          BNForwardInference<T, DataLayout::kNHWC>
+              <<<grid_size, block_size, 0, ctx.stream()>>>(
+                  transformed_x.template data<T>(),
+                  est_mean->template data<BatchNormParamType<T>>(),
+                  est_var->template data<BatchNormParamType<T>>(),
+                  new_scale.template data<BatchNormParamType<T>>(),
+                  new_bias.template data<BatchNormParamType<T>>(),
+                  C,
+                  N,
+                  H * W * D,
+                  epsilon,
+                  transformed_y.template data<T>());
+        }
       }
-      // TODO(wangran16): wait for MIOpen to improve the performance of BN
-      // PADDLE_ENFORCE_GPU_SUCCESS(
-      //     platform::dynload::miopenBatchNormalizationForwardInference(
-      //         handle, miopenBNSpatial,
-      //         const_cast<void *>(
-      //             static_cast<const void *>(CudnnDataType<T>::kOne())),
-      //         const_cast<void *>(
-      //             static_cast<const void *>(CudnnDataType<T>::kZero())),
-      //         data_desc_,
-      //         static_cast<const void *>(transformed_x.template data<T>()),
-      //         data_desc_,
-      //         static_cast<void *>(
-      //             transformed_y.template mutable_data<T>(ctx.GetPlace())),
-      //         bn_param_desc_,
-      //         const_cast<void *>(static_cast<const void *>(
-      //             scale->template data<BatchNormParamType<T>>())),
-      //         const_cast<void *>(static_cast<const void *>(
-      //             bias->template data<BatchNormParamType<T>>())),
-      //         const_cast<void *>(static_cast<const void *>(
-      //             est_mean->template data<BatchNormParamType<T>>())),
-      //         const_cast<void *>(static_cast<const void *>(
-      //             est_var->template data<BatchNormParamType<T>>())),
-      //         epsilon));
 #else
       const bool use_native_kernel =
           (x_dims.size() == 2 ||
@@ -900,7 +932,37 @@ void BatchNormKernel(const Context &ctx,
       const int max_blocks = std::max(max_threads / block, 1);
       const int grid = std::min(C, max_blocks);
       if (compute_format == DataLayout::kNCHW) {
-        BNForwardTraining<T, block, DataLayout::kNCHW>
+        if (FLAGS_cudnn_batchnorm_spatial_persistent == true) {
+          PADDLE_ENFORCE_GPU_SUCCESS(
+              phi::dynload::miopenBatchNormalizationForwardTraining(
+                  handle, mode_, const_cast<void *>(static_cast<const void *>(
+                                     CudnnDataType<T>::kOne())),
+                  const_cast<void *>(
+                      static_cast<const void *>(CudnnDataType<T>::kZero())),
+                  data_desc_,
+                  static_cast<const void *>(transformed_x.template data<T>()),
+                  data_desc_,
+                  static_cast<void *>(
+                      transformed_y.template mutable_data<T>(ctx.GetPlace())),
+                  bn_param_desc_,
+                  const_cast<void *>(static_cast<const void *>(
+                      scale->template data<BatchNormParamType<T>>())),
+                  const_cast<void *>(static_cast<const void *>(
+                      bias->template data<BatchNormParamType<T>>())),
+                  this_factor,
+                  static_cast<void *>(
+                      mean_out->template mutable_data<BatchNormParamType<T>>(
+                          ctx.GetPlace())),
+                  static_cast<void *>(variance_out->template mutable_data<
+                                      BatchNormParamType<T>>(ctx.GetPlace())),
+                  epsilon,
+                  static_cast<void *>(
+                      saved_mean->template mutable_data<BatchNormParamType<T>>(
+                          ctx.GetPlace())),
+                  static_cast<void *>(saved_variance->template mutable_data<
+                                      BatchNormParamType<T>>(ctx.GetPlace()))));
+        } else {
+          BNForwardTraining<T, block, DataLayout::kNCHW>
             <<<grid, block, 0, ctx.stream()>>>(
                 transformed_x.template data<T>(),
                 new_scale.template data<BatchNormParamType<T>>(),
@@ -915,52 +977,56 @@ void BatchNormKernel(const Context &ctx,
                 variance_out->template data<BatchNormParamType<T>>(),
                 saved_mean->template data<BatchNormParamType<T>>(),
                 saved_variance->template data<BatchNormParamType<T>>());
+        }
       } else {
-        BNForwardTraining<T, block, DataLayout::kNHWC>
-            <<<grid, block, 0, ctx.stream()>>>(
-                transformed_x.template data<T>(),
-                new_scale.template data<BatchNormParamType<T>>(),
-                new_bias.template data<BatchNormParamType<T>>(),
-                C,
-                N,
-                H * W * D,
-                epsilon,
-                this_factor,
-                transformed_y.template data<T>(),
-                mean_out->template data<BatchNormParamType<T>>(),
-                variance_out->template data<BatchNormParamType<T>>(),
-                saved_mean->template data<BatchNormParamType<T>>(),
-                saved_variance->template data<BatchNormParamType<T>>());
+        if (FLAGS_cudnn_batchnorm_spatial_persistent == true) {
+          PADDLE_ENFORCE_GPU_SUCCESS(
+              phi::dynload::miopenBatchNormalizationForwardTraining(
+                  handle, mode_, const_cast<void *>(static_cast<const void *>(
+                                     CudnnDataType<T>::kOne())),
+                  const_cast<void *>(
+                      static_cast<const void *>(CudnnDataType<T>::kZero())),
+                  data_desc_,
+                  static_cast<const void *>(transformed_x.template data<T>()),
+                  data_desc_,
+                  static_cast<void *>(
+                      transformed_y.template mutable_data<T>(ctx.GetPlace())),
+                  bn_param_desc_,
+                  const_cast<void *>(static_cast<const void *>(
+                      scale->template data<BatchNormParamType<T>>())),
+                  const_cast<void *>(static_cast<const void *>(
+                      bias->template data<BatchNormParamType<T>>())),
+                  this_factor,
+                  static_cast<void *>(
+                      mean_out->template mutable_data<BatchNormParamType<T>>(
+                          ctx.GetPlace())),
+                  static_cast<void *>(variance_out->template mutable_data<
+                                      BatchNormParamType<T>>(ctx.GetPlace())),
+                  epsilon,
+                  static_cast<void *>(
+                      saved_mean->template mutable_data<BatchNormParamType<T>>(
+                          ctx.GetPlace())),
+                  static_cast<void *>(saved_variance->template mutable_data<
+                                      BatchNormParamType<T>>(ctx.GetPlace()))));
+        } else {
+          BNForwardTraining<T, block, DataLayout::kNHWC>
+              <<<grid, block, 0, ctx.stream()>>>(
+                  transformed_x.template data<T>(),
+                  new_scale.template data<BatchNormParamType<T>>(),
+                  new_bias.template data<BatchNormParamType<T>>(),
+                  C,
+                  N,
+                  H * W * D,
+                  epsilon,
+                  this_factor,
+                  transformed_y.template data<T>(),
+                  mean_out->template data<BatchNormParamType<T>>(),
+                  variance_out->template data<BatchNormParamType<T>>(),
+                  saved_mean->template data<BatchNormParamType<T>>(),
+                  saved_variance->template data<BatchNormParamType<T>>());
+        }
       }
-      // TODO(wangran16): wait for MIOpen to improve the performance of BN
-      // PADDLE_ENFORCE_GPU_SUCCESS(
-      //     platform::dynload::miopenBatchNormalizationForwardTraining(
-      //         handle, mode_, const_cast<void *>(static_cast<const void *>(
-      //                            CudnnDataType<T>::kOne())),
-      //         const_cast<void *>(
-      //             static_cast<const void *>(CudnnDataType<T>::kZero())),
-      //         data_desc_,
-      //         static_cast<const void *>(transformed_x.template data<T>()),
-      //         data_desc_,
-      //         static_cast<void *>(
-      //             transformed_y.template mutable_data<T>(ctx.GetPlace())),
-      //         bn_param_desc_,
-      //         const_cast<void *>(static_cast<const void *>(
-      //             scale->template data<BatchNormParamType<T>>())),
-      //         const_cast<void *>(static_cast<const void *>(
-      //             bias->template data<BatchNormParamType<T>>())),
-      //         this_factor,
-      //         static_cast<void *>(
-      //             mean_out->template mutable_data<BatchNormParamType<T>>(
-      //                 ctx.GetPlace())),
-      //         static_cast<void *>(variance_out->template mutable_data<
-      //                             BatchNormParamType<T>>(ctx.GetPlace())),
-      //         epsilon,
-      //         static_cast<void *>(
-      //             saved_mean->template mutable_data<BatchNormParamType<T>>(
-      //                 ctx.GetPlace())),
-      //         static_cast<void *>(saved_variance->template mutable_data<
-      //                             BatchNormParamType<T>>(ctx.GetPlace()))));
 #else
       // const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070;
       const bool use_native_kernel =
@@ -1221,10 +1287,10 @@ void BatchNormKernel(const Context &ctx,
 #ifdef PADDLE_WITH_HIP
   // TODO(wangran16): wait for MIOpen to improve the performance of BN
   // clean when exit.
-  // PADDLE_ENFORCE_GPU_SUCCESS(
-  //     platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
-  // PADDLE_ENFORCE_GPU_SUCCESS(
-  //     platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenDestroyTensorDescriptor(data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
 #else
   // clean when exit.
   PADDLE_ENFORCE_GPU_SUCCESS(
...
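In the forward kernel the same dispatch appears four times (inference and training, NCHW and NHWC): when FLAGS_cudnn_batchnorm_spatial_persistent is set, the MIOpen library call runs; otherwise the hand-written HIP kernels that were already in place run. A minimal sketch of that control flow with the branches factored into callables (names here are illustrative, not Paddle's):

#include <functional>

enum class Layout { kNCHW, kNHWC };

// use_miopen mirrors FLAGS_cudnn_batchnorm_spatial_persistent on the HIP path.
void RunBatchNorm(bool use_miopen,
                  Layout layout,
                  const std::function<void()>& miopen_call,    // miopenBatchNormalization*(...)
                  const std::function<void()>& nchw_kernel,    // BN*<T, kNCHW><<<...>>>
                  const std::function<void()>& nhwc_kernel) {  // BN*<T, kNHWC><<<...>>>
  if (use_miopen) {
    miopen_call();  // layout is encoded in the tensor descriptor
  } else if (layout == Layout::kNCHW) {
    nchw_kernel();
  } else {
    nhwc_kernel();
  }
}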
@@ -12,12 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#ifndef PADDLE_WITH_HIP
 #include "paddle/phi/kernels/multiclass_nms3_kernel.h"
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#else
 #include <cub/cub.cuh>
 #include "cuda.h"  // NOLINT
+#endif
+#ifdef PADDLE_WITH_HIP
+#define GPU(str) hip##str
+#else
+#define GPU(str) cuda##str
+#endif
 #include "paddle/phi/backends/context_pool.h"
 #include "paddle/phi/common/place.h"
@@ -234,7 +245,7 @@ __launch_bounds__(nthds_per_cta) __global__
 }
 template <typename T_SCORE>
-void SortScoresPerClassGPU(cudaStream_t stream,
+void SortScoresPerClassGPU(GPU(Stream_t) stream,
                            const int num,
                            const int num_classes,
                            const int num_preds_per_class,
@@ -298,7 +309,7 @@ void SortScoresPerClassGPU(cudaStream_t stream,
                           begin_bit,
                           end_bit,
                           stream);
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+  PADDLE_ENFORCE_GPU_SUCCESS(GPU(GetLastError)());
 }
 /* ===========
@@ -539,7 +550,7 @@ __global__ void AllClassNMSKernel(
 }
 template <typename T_SCORE, typename T_BBOX>
-void AllClassNMSGPU(cudaStream_t stream,
+void AllClassNMSGPU(GPU(Stream_t) stream,
                     const int num,
                     const int num_classes,
                     const int num_preds_per_class,
@@ -603,7 +614,7 @@ void AllClassNMSGPU(cudaStream_t stream,
                       score_shift,
                       caffe_semantics);
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+  PADDLE_ENFORCE_GPU_SUCCESS(GPU(GetLastError)());
 }
 /* ==================
@@ -618,11 +629,15 @@ __launch_bounds__(nthds_per_cta) __global__
   if (idx <= num_segments) d_offsets[idx] = idx * offset;
 }
-void SetUniformOffsets(cudaStream_t stream,
+void SetUniformOffsets(GPU(Stream_t) stream,
                        const int num_segments,
                        const int offset,
                        int* d_offsets) {
+#ifdef PADDLE_WITH_HIP
+  const int BS = 256;
+#else
   const int BS = 32;
+#endif
   const int GS = (num_segments + 1 + BS - 1) / BS;
   SetUniformOffsetsKernel<BS>
       <<<GS, BS, 0, stream>>>(num_segments, offset, d_offsets);
@@ -706,7 +721,7 @@ __launch_bounds__(nthds_per_cta) __global__
 }
 template <typename T_BBOX, typename T_SCORE>
-void GatherNMSOutputsGPU(cudaStream_t stream,
+void GatherNMSOutputsGPU(GPU(Stream_t) stream,
                          const bool share_location,
                          const int num_images,
                          const int num_preds_per_class,
@@ -725,8 +740,12 @@ void GatherNMSOutputsGPU(cudaStream_t stream,
                          bool clip_boxes,
                          const float score_shift) {
   PADDLE_ENFORCE_GPU_SUCCESS(
-      cudaMemsetAsync(num_detections, 0, num_images * sizeof(int), stream));
+      GPU(MemsetAsync)(num_detections, 0, num_images * sizeof(int), stream));
+#ifdef PADDLE_WITH_HIP
+  const int BS = 256;
+#else
   const int BS = 32;
+#endif
   const int GS = 32;
   GatherNMSOutputsKernel<T_BBOX, T_SCORE, BS>
       <<<GS, BS, 0, stream>>>(share_location,
@@ -747,11 +766,11 @@ void GatherNMSOutputsGPU(cudaStream_t stream,
                               clip_boxes,
                               T_SCORE(score_shift));
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+  PADDLE_ENFORCE_GPU_SUCCESS(GPU(GetLastError)());
 }
 template <typename T_SCORE>
-void SortScoresPerImageGPU(cudaStream_t stream,
+void SortScoresPerImageGPU(GPU(Stream_t) stream,
                            const int num_images,
                            const int num_items_per_image,
                            void* unsorted_scores,
@@ -792,11 +811,11 @@ void SortScoresPerImageGPU(cudaStream_t stream,
                           begin_bit,
                           end_bit,
                           stream);
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+  PADDLE_ENFORCE_GPU_SUCCESS(GPU(GetLastError)());
 }
 template <typename T>
-void InferNMS(cudaStream_t stream,
+void InferNMS(GPU(Stream_t) stream,
               const int N,
               const int per_batch_boxes_size,
               const int per_batch_scores_size,
@@ -831,10 +850,10 @@ void InferNMS(cudaStream_t stream,
   size_t bbox_data_size =
       CalcDetectionForwardBBoxDataSize<T>(N, per_batch_boxes_size);
   void* bbox_data_raw = workspace;
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(bbox_data_raw,
-                                             loc_data,
-                                             bbox_data_size,
-                                             cudaMemcpyDeviceToDevice,
-                                             stream));
+  PADDLE_ENFORCE_GPU_SUCCESS(GPU(MemcpyAsync)(bbox_data_raw,
+                                              loc_data,
+                                              bbox_data_size,
+                                              GPU(MemcpyDeviceToDevice),
+                                              stream));
   void* bbox_data = bbox_data_raw;
@@ -843,8 +862,8 @@ void InferNMS(cudaStream_t stream,
       CalcDetectionForwardPreNMSSize<T>(N, per_batch_scores_size);
   void* scores =
       GetNextWorkspacePtr(reinterpret_cast<int8_t*>(bbox_data), bbox_data_size);
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(
-      scores, conf_data, total_scores_size, cudaMemcpyDeviceToDevice, stream));
+  PADDLE_ENFORCE_GPU_SUCCESS(GPU(MemcpyAsync)(
+      scores, conf_data, total_scores_size, GPU(MemcpyDeviceToDevice), stream));
   size_t indices_size =
       CalcDetectionForwardPreNMSSize<int>(N, per_batch_scores_size);
@@ -1145,4 +1164,3 @@ PD_REGISTER_KERNEL(multiclass_nms3,  // cuda_only
   kernel->OutputAt(2).SetDataType(phi::DataType::INT32);
 }
-#endif
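The NMS file now builds under both runtimes through the GPU(str) token-pasting macro defined at the top, which maps each runtime identifier to its hip* or cuda* spelling. A self-contained sketch of how the expansion works (guarded by PADDLE_WITH_HIP to mirror the diff; error handling trimmed to a status check):

#include <cstddef>
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#define GPU(str) hip##str
#else
#include <cuda_runtime.h>
#define GPU(str) cuda##str
#endif

// Copies `bytes` device-to-device on a freshly created stream.
int CopyOnStream(void* dst, const void* src, size_t bytes) {
  GPU(Stream_t) stream;  // hipStream_t or cudaStream_t
  if (GPU(StreamCreate)(&stream) != GPU(Success)) return -1;
  GPU(MemcpyAsync)(dst, src, bytes, GPU(MemcpyDeviceToDevice), stream);
  GPU(StreamSynchronize)(stream);
  GPU(StreamDestroy)(stream);
  return GPU(GetLastError)() == GPU(Success) ? 0 : -1;
}

On a HIP build GPU(MemcpyAsync)(...) expands to hipMemcpyAsync(...), which is why only the spelling at the call sites changes in the hunks above; the separate #ifdef blocks only adjust the kernel block size (BS) for the HIP build.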