Commit ca9dbdb2 authored by yuguo-Jack

New features: enable the MIOpen batch-norm descriptors and forward/backward paths behind FLAGS_cudnn_batchnorm_spatial_persistent, select a per-activation BN mode for 1x1 spatial inputs, honor FLAGS_gemm_use_half_precision_compute_type in the rocBLAS fp16 GEMM, and build the multiclass_nms3 kernel on HIP.

parent bb99f03d
......@@ -21,6 +21,7 @@
#include "paddle/phi/kernels/funcs/math_function.h"
PD_DECLARE_bool(enable_cublas_tensor_op_math);
PD_DECLARE_bool(gemm_use_half_precision_compute_type);
namespace phi {
namespace funcs {
......@@ -703,6 +704,13 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
rocblas_datatype compute_type = rocblas_datatype_f32_r;
if (FLAGS_gemm_use_half_precision_compute_type == true) {
compute_type = rocblas_datatype_f16_r;
}
VLOG(4) << "use_half_precision_compute_type: "
<< FLAGS_gemm_use_half_precision_compute_type;
auto &cuda_ctx = const_cast<phi::GPUContext &>(context_);
CUBlas<phi::dtype::float16>::GEMM_EX(&cuda_ctx,
cuTransB,
......@@ -721,7 +729,7 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
C,
rocblas_datatype_f16_r,
N,
rocblas_datatype_f32_r);
compute_type);
}
template <>
......
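For readers skimming the GEMM hunk above, the flag-driven compute-type choice reduces to the following standalone sketch; the helper name is illustrative and not part of the commit:

// Accumulate in fp32 by default; drop to fp16 accumulation only when the user
// opts in via FLAGS_gemm_use_half_precision_compute_type (faster, less accurate).
inline rocblas_datatype ChooseGemmComputeType(bool use_half_precision) {
  return use_half_precision ? rocblas_datatype_f16_r : rocblas_datatype_f32_r;
}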
......@@ -661,14 +661,14 @@ void BatchNormGradFunctor(const Context &ctx,
// ------------------- cudnn descriptors ---------------------
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// miopenTensorDescriptor_t data_desc_;
// miopenTensorDescriptor_t bn_param_desc_;
// miopenBatchNormMode_t mode_;
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
miopenTensorDescriptor_t data_desc_;
miopenTensorDescriptor_t bn_param_desc_;
miopenBatchNormMode_t mode_;
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::miopenCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t bn_param_desc_;
......@@ -687,7 +687,11 @@ void BatchNormGradFunctor(const Context &ctx,
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// mode_ = miopenBNSpatial;
if (H == 1 && W == 1) {
mode_ = miopenBNPerActivation;
} else {
mode_ = miopenBNSpatial;
}
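// Editorial note (not part of the commit): spatial batch norm averages over
// N*H*W per channel, so with H == 1 and W == 1 there is no spatial extent left
// and the statistics degenerate to per-activation ones; hence
// miopenBNPerActivation. The same selection is repeated in BatchNormKernel
// below. An illustrative helper capturing the branch above:
//   inline miopenBatchNormMode_t ChooseMiopenBNMode(int H, int W) {
//     return (H == 1 && W == 1) ? miopenBNPerActivation : miopenBNSpatial;
//   }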
#elif CUDNN_VERSION_MIN(7, 0, 1)
if (FLAGS_cudnn_batchnorm_spatial_persistent) {
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
......@@ -706,13 +710,13 @@ void BatchNormGradFunctor(const Context &ctx,
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
// data_desc_, CudnnDataType<T>::type,
// x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
// const_cast<int *>(strides.data())));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_,
// data_desc_, mode_));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
const_cast<int *>(strides.data())));
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_,
data_desc_, mode_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor(
data_desc_,
......@@ -750,6 +754,22 @@ void BatchNormGradFunctor(const Context &ctx,
if (d_x && d_scale && d_bias) {
#ifdef PADDLE_WITH_HIP
if (compute_format == DataLayout::kNCHW) {
if (FLAGS_cudnn_batchnorm_spatial_persistent == true) {
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::miopenBatchNormalizationBackward(
ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), data_desc_,
transformed_x.template data<T>(), data_desc_,
transformed_d_y.template data<T>(), data_desc_,
transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon, saved_mean_data, saved_var_data));
} else {
BNBackward<T, block, DataLayout::kNCHW>
<<<grid2, block, 0, ctx.stream()>>>(
transformed_d_y.template data<T>(),
......@@ -764,6 +784,23 @@ void BatchNormGradFunctor(const Context &ctx,
transformed_d_x.template data<T>(),
ctx.template Alloc<BatchNormParamType<T>>(d_scale),
ctx.template Alloc<BatchNormParamType<T>>(d_bias));
}
} else {
if (FLAGS_cudnn_batchnorm_spatial_persistent == true) {
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::miopenBatchNormalizationBackward(
ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), data_desc_,
transformed_x.template data<T>(), data_desc_,
transformed_d_y.template data<T>(), data_desc_,
transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon, saved_mean_data, saved_var_data));
} else {
BNBackward<T, block, DataLayout::kNHWC>
<<<grid2, block, 0, ctx.stream()>>>(
......@@ -780,22 +817,8 @@ void BatchNormGradFunctor(const Context &ctx,
ctx.template Alloc<BatchNormParamType<T>>(d_scale),
ctx.template Alloc<BatchNormParamType<T>>(d_bias));
}
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationBackward(
// dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
// CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
// CudnnDataType<T>::kZero(), data_desc_,
// transformed_x.template data<T>(), data_desc_,
// transformed_d_y.template data<T>(), data_desc_,
// transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
// bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
// d_scale->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace()),
// d_bias->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace()),
// epsilon, saved_mean_data, saved_var_data));
#else
}
// CUDNN only support small batch size
......@@ -1129,10 +1152,10 @@ void BatchNormGradFunctor(const Context &ctx,
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// clean when exit.
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::miopenDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
#else
// clean when exit.
PADDLE_ENFORCE_GPU_SUCCESS(
......
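One dispatch pattern recurs in the backward hunks above and in the forward hunks below: on ROCm, the existing FLAGS_cudnn_batchnorm_spatial_persistent flag now opts in to the MIOpen batch-norm routines, with Paddle's native kernels as the fallback. A condensed sketch under that assumption (the helper and its parameter names are illustrative only):

// Shape of the dispatch added around each MIOpen call site in this commit.
template <typename MiopenCall, typename NativeLaunch>
void DispatchMiopenBatchNorm(bool spatial_persistent_flag,
                             MiopenCall call_miopen,
                             NativeLaunch launch_native) {
  if (spatial_persistent_flag) {
    call_miopen();    // miopenBatchNormalization{Backward,ForwardTraining,ForwardInference}
  } else {
    launch_native();  // BNBackward / BNForwardTraining / BNForwardInference <<<grid, block>>>
  }
}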
......@@ -604,14 +604,14 @@ void BatchNormKernel(const Context &ctx,
// ------------------- cudnn descriptors ---------------------
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// miopenTensorDescriptor_t data_desc_;
// miopenTensorDescriptor_t bn_param_desc_;
// miopenBatchNormMode_t mode_;
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
miopenTensorDescriptor_t data_desc_;
miopenTensorDescriptor_t bn_param_desc_;
miopenBatchNormMode_t mode_;
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::miopenCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t bn_param_desc_;
......@@ -632,7 +632,11 @@ void BatchNormKernel(const Context &ctx,
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// mode_ = miopenBNSpatial;
if (H == 1 && W == 1) {
mode_ = miopenBNPerActivation;
} else {
mode_ = miopenBNSpatial;
}
#elif CUDNN_VERSION_MIN(7, 0, 1)
if (FLAGS_cudnn_batchnorm_spatial_persistent) {
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
......@@ -662,14 +666,14 @@ void BatchNormKernel(const Context &ctx,
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
// data_desc_, CudnnDataType<T>::type,
// x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
// const_cast<int *>(strides.data())));
// Note: PERSISTENT not implemented for inference
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDeriveBNTensorDescriptor(
// bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
const_cast<int *>(strides.data())));
// Note: PERSISTENT not implemented for inference
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::miopenDeriveBNTensorDescriptor(
bn_param_desc_, data_desc_, mode_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor(
data_desc_,
......@@ -736,6 +740,30 @@ void BatchNormKernel(const Context &ctx,
const int block_size = 256;
const int grid_size = (N * C * H * W * D + block_size - 1) / block_size;
if (compute_format == DataLayout::kNCHW) {
if (FLAGS_cudnn_batchnorm_spatial_persistent == true) {
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::miopenBatchNormalizationForwardInference(
handle, mode_,
const_cast<void *>(
static_cast<const void *>(CudnnDataType<T>::kOne())),
const_cast<void *>(
static_cast<const void *>(CudnnDataType<T>::kZero())),
data_desc_,
static_cast<const void *>(transformed_x.template data<T>()),
data_desc_,
static_cast<void *>(
transformed_y.template mutable_data<T>(ctx.GetPlace())),
bn_param_desc_,
const_cast<void *>(static_cast<const void *>(
scale->template data<BatchNormParamType<T>>())),
const_cast<void *>(static_cast<const void *>(
bias->template data<BatchNormParamType<T>>())),
const_cast<void *>(static_cast<const void *>(
est_mean->template data<BatchNormParamType<T>>())),
const_cast<void *>(static_cast<const void *>(
est_var->template data<BatchNormParamType<T>>())),
epsilon));
} else {
BNForwardInference<T, DataLayout::kNCHW>
<<<grid_size, block_size, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
......@@ -748,6 +776,31 @@ void BatchNormKernel(const Context &ctx,
H * W * D,
epsilon,
transformed_y.template data<T>());
}
} else {
if (FLAGS_cudnn_batchnorm_spatial_persistent == true) {
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::miopenBatchNormalizationForwardInference(
handle, mode_,
const_cast<void *>(
static_cast<const void *>(CudnnDataType<T>::kOne())),
const_cast<void *>(
static_cast<const void *>(CudnnDataType<T>::kZero())),
data_desc_,
static_cast<const void *>(transformed_x.template data<T>()),
data_desc_,
static_cast<void *>(
transformed_y.template mutable_data<T>(ctx.GetPlace())),
bn_param_desc_,
const_cast<void *>(static_cast<const void *>(
scale->template data<BatchNormParamType<T>>())),
const_cast<void *>(static_cast<const void *>(
bias->template data<BatchNormParamType<T>>())),
const_cast<void *>(static_cast<const void *>(
est_mean->template data<BatchNormParamType<T>>())),
const_cast<void *>(static_cast<const void *>(
est_var->template data<BatchNormParamType<T>>())),
epsilon));
} else {
BNForwardInference<T, DataLayout::kNHWC>
<<<grid_size, block_size, 0, ctx.stream()>>>(
......@@ -762,29 +815,8 @@ void BatchNormKernel(const Context &ctx,
epsilon,
transformed_y.template data<T>());
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationForwardInference(
// handle, miopenBNSpatial,
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kOne())),
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kZero())),
// data_desc_,
// static_cast<const void *>(transformed_x.template data<T>()),
// data_desc_,
// static_cast<void *>(
// transformed_y.template mutable_data<T>(ctx.GetPlace())),
// bn_param_desc_,
// const_cast<void *>(static_cast<const void *>(
// scale->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// bias->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// est_mean->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// est_var->template data<BatchNormParamType<T>>())),
// epsilon));
}
#else
const bool use_native_kernel =
(x_dims.size() == 2 ||
......@@ -900,6 +932,36 @@ void BatchNormKernel(const Context &ctx,
const int max_blocks = std::max(max_threads / block, 1);
const int grid = std::min(C, max_blocks);
if (compute_format == DataLayout::kNCHW) {
if (FLAGS_cudnn_batchnorm_spatial_persistent == true) {
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::miopenBatchNormalizationForwardTraining(
handle, mode_, const_cast<void *>(static_cast<const void *>(
CudnnDataType<T>::kOne())),
const_cast<void *>(
static_cast<const void *>(CudnnDataType<T>::kZero())),
data_desc_,
static_cast<const void *>(transformed_x.template data<T>()),
data_desc_,
static_cast<void *>(
transformed_y.template mutable_data<T>(ctx.GetPlace())),
bn_param_desc_,
const_cast<void *>(static_cast<const void *>(
scale->template data<BatchNormParamType<T>>())),
const_cast<void *>(static_cast<const void *>(
bias->template data<BatchNormParamType<T>>())),
this_factor,
static_cast<void *>(
mean_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace())),
static_cast<void *>(variance_out->template mutable_data<
BatchNormParamType<T>>(ctx.GetPlace())),
epsilon,
static_cast<void *>(
saved_mean->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace())),
static_cast<void *>(saved_variance->template mutable_data<
BatchNormParamType<T>>(ctx.GetPlace()))));
} else {
BNForwardTraining<T, block, DataLayout::kNCHW>
<<<grid, block, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
......@@ -915,6 +977,37 @@ void BatchNormKernel(const Context &ctx,
variance_out->template data<BatchNormParamType<T>>(),
saved_mean->template data<BatchNormParamType<T>>(),
saved_variance->template data<BatchNormParamType<T>>());
}
} else {
if (FLAGS_cudnn_batchnorm_spatial_persistent == true) {
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::miopenBatchNormalizationForwardTraining(
handle, mode_, const_cast<void *>(static_cast<const void *>(
CudnnDataType<T>::kOne())),
const_cast<void *>(
static_cast<const void *>(CudnnDataType<T>::kZero())),
data_desc_,
static_cast<const void *>(transformed_x.template data<T>()),
data_desc_,
static_cast<void *>(
transformed_y.template mutable_data<T>(ctx.GetPlace())),
bn_param_desc_,
const_cast<void *>(static_cast<const void *>(
scale->template data<BatchNormParamType<T>>())),
const_cast<void *>(static_cast<const void *>(
bias->template data<BatchNormParamType<T>>())),
this_factor,
static_cast<void *>(
mean_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace())),
static_cast<void *>(variance_out->template mutable_data<
BatchNormParamType<T>>(ctx.GetPlace())),
epsilon,
static_cast<void *>(
saved_mean->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace())),
static_cast<void *>(saved_variance->template mutable_data<
BatchNormParamType<T>>(ctx.GetPlace()))));
} else {
BNForwardTraining<T, block, DataLayout::kNHWC>
<<<grid, block, 0, ctx.stream()>>>(
......@@ -932,35 +1025,8 @@ void BatchNormKernel(const Context &ctx,
saved_mean->template data<BatchNormParamType<T>>(),
saved_variance->template data<BatchNormParamType<T>>());
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationForwardTraining(
// handle, mode_, const_cast<void *>(static_cast<const void *>(
// CudnnDataType<T>::kOne())),
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kZero())),
// data_desc_,
// static_cast<const void *>(transformed_x.template data<T>()),
// data_desc_,
// static_cast<void *>(
// transformed_y.template mutable_data<T>(ctx.GetPlace())),
// bn_param_desc_,
// const_cast<void *>(static_cast<const void *>(
// scale->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// bias->template data<BatchNormParamType<T>>())),
// this_factor,
// static_cast<void *>(
// mean_out->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace())),
// static_cast<void *>(variance_out->template mutable_data<
// BatchNormParamType<T>>(ctx.GetPlace())),
// epsilon,
// static_cast<void *>(
// saved_mean->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace())),
// static_cast<void *>(saved_variance->template mutable_data<
// BatchNormParamType<T>>(ctx.GetPlace()))));
}
#else
// const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070;
const bool use_native_kernel =
......@@ -1221,10 +1287,10 @@ void BatchNormKernel(const Context &ctx,
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// clean when exit.
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::miopenDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
#else
// clean when exit.
PADDLE_ENFORCE_GPU_SUCCESS(
......
......@@ -12,12 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef PADDLE_WITH_HIP
#include "paddle/phi/kernels/multiclass_nms3_kernel.h"
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#else
#include <cub/cub.cuh>
#include "cuda.h" // NOLINT
#endif
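// Editorial note (not part of the commit): aliasing `namespace cub = hipcub`
// above lets the device radix-sort calls later in this file, e.g.
//   cub::DeviceSegmentedRadixSort::SortPairsDescending(...)
// resolve to their hipCUB counterparts on ROCm builds with no further source
// changes (exact sort API assumed from the begin_bit/end_bit wrappers below).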
#ifdef PADDLE_WITH_HIP
#define GPU(str) hip##str
#else
#define GPU(str) cuda##str
#endif
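// How the GPU(str) token-pasting macro resolves in the calls below
// (illustrative list; the symbols themselves come from the HIP/CUDA runtimes):
//   GPU(Stream_t)             -> hipStream_t             | cudaStream_t
//   GPU(GetLastError)()       -> hipGetLastError()       | cudaGetLastError()
//   GPU(MemsetAsync)(...)     -> hipMemsetAsync(...)     | cudaMemsetAsync(...)
//   GPU(MemcpyAsync)(...)     -> hipMemcpyAsync(...)     | cudaMemcpyAsync(...)
//   GPU(MemcpyDeviceToDevice) -> hipMemcpyDeviceToDevice | cudaMemcpyDeviceToDevice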
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/common/place.h"
......@@ -234,7 +245,7 @@ __launch_bounds__(nthds_per_cta) __global__
}
template <typename T_SCORE>
void SortScoresPerClassGPU(cudaStream_t stream,
void SortScoresPerClassGPU(GPU(Stream_t) stream,
const int num,
const int num_classes,
const int num_preds_per_class,
......@@ -298,7 +309,7 @@ void SortScoresPerClassGPU(cudaStream_t stream,
begin_bit,
end_bit,
stream);
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
PADDLE_ENFORCE_GPU_SUCCESS(GPU(GetLastError)());
}
/* ===========
......@@ -539,7 +550,7 @@ __global__ void AllClassNMSKernel(
}
template <typename T_SCORE, typename T_BBOX>
void AllClassNMSGPU(cudaStream_t stream,
void AllClassNMSGPU(GPU(Stream_t) stream,
const int num,
const int num_classes,
const int num_preds_per_class,
......@@ -603,7 +614,7 @@ void AllClassNMSGPU(cudaStream_t stream,
score_shift,
caffe_semantics);
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
PADDLE_ENFORCE_GPU_SUCCESS(GPU(GetLastError)());
}
/* ==================
......@@ -618,11 +629,15 @@ __launch_bounds__(nthds_per_cta) __global__
if (idx <= num_segments) d_offsets[idx] = idx * offset;
}
void SetUniformOffsets(cudaStream_t stream,
void SetUniformOffsets(GPU(Stream_t) stream,
const int num_segments,
const int offset,
int* d_offsets) {
#ifdef PADDLE_WITH_HIP
const int BS = 256;
#else
const int BS = 32;
#endif
const int GS = (num_segments + 1 + BS - 1) / BS;
SetUniformOffsetsKernel<BS>
<<<GS, BS, 0, stream>>>(num_segments, offset, d_offsets);
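// Editorial note (not part of the commit): HIP builds pick a larger launch
// block here and in GatherNMSOutputsGPU (256 vs. the original CUDA tuning of
// 32), plausibly because AMD compute GPUs schedule 64-lane wavefronts and
// benefit from fuller blocks. Equivalent sketch with an illustrative name:
//   #ifdef PADDLE_WITH_HIP
//   constexpr int kNmsBlockSize = 256;
//   #else
//   constexpr int kNmsBlockSize = 32;
//   #endif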
......@@ -706,7 +721,7 @@ __launch_bounds__(nthds_per_cta) __global__
}
template <typename T_BBOX, typename T_SCORE>
void GatherNMSOutputsGPU(cudaStream_t stream,
void GatherNMSOutputsGPU(GPU(Stream_t) stream,
const bool share_location,
const int num_images,
const int num_preds_per_class,
......@@ -725,8 +740,12 @@ void GatherNMSOutputsGPU(cudaStream_t stream,
bool clip_boxes,
const float score_shift) {
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemsetAsync(num_detections, 0, num_images * sizeof(int), stream));
GPU(MemsetAsync)(num_detections, 0, num_images * sizeof(int), stream));
#ifdef PADDLE_WITH_HIP
const int BS = 256;
#else
const int BS = 32;
#endif
const int GS = 32;
GatherNMSOutputsKernel<T_BBOX, T_SCORE, BS>
<<<GS, BS, 0, stream>>>(share_location,
......@@ -747,11 +766,11 @@ void GatherNMSOutputsGPU(cudaStream_t stream,
clip_boxes,
T_SCORE(score_shift));
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
PADDLE_ENFORCE_GPU_SUCCESS(GPU(GetLastError)());
}
template <typename T_SCORE>
void SortScoresPerImageGPU(cudaStream_t stream,
void SortScoresPerImageGPU(GPU(Stream_t) stream,
const int num_images,
const int num_items_per_image,
void* unsorted_scores,
......@@ -792,11 +811,11 @@ void SortScoresPerImageGPU(cudaStream_t stream,
begin_bit,
end_bit,
stream);
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
PADDLE_ENFORCE_GPU_SUCCESS(GPU(GetLastError)());
}
template <typename T>
void InferNMS(cudaStream_t stream,
void InferNMS(GPU(Stream_t) stream,
const int N,
const int per_batch_boxes_size,
const int per_batch_scores_size,
......@@ -831,10 +850,10 @@ void InferNMS(cudaStream_t stream,
size_t bbox_data_size =
CalcDetectionForwardBBoxDataSize<T>(N, per_batch_boxes_size);
void* bbox_data_raw = workspace;
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(bbox_data_raw,
PADDLE_ENFORCE_GPU_SUCCESS(GPU(MemcpyAsync)(bbox_data_raw,
loc_data,
bbox_data_size,
cudaMemcpyDeviceToDevice,
GPU(MemcpyDeviceToDevice),
stream));
void* bbox_data = bbox_data_raw;
......@@ -843,8 +862,8 @@ void InferNMS(cudaStream_t stream,
CalcDetectionForwardPreNMSSize<T>(N, per_batch_scores_size);
void* scores =
GetNextWorkspacePtr(reinterpret_cast<int8_t*>(bbox_data), bbox_data_size);
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(
scores, conf_data, total_scores_size, cudaMemcpyDeviceToDevice, stream));
PADDLE_ENFORCE_GPU_SUCCESS(GPU(MemcpyAsync)(
scores, conf_data, total_scores_size, GPU(MemcpyDeviceToDevice), stream));
size_t indices_size =
CalcDetectionForwardPreNMSSize<int>(N, per_batch_scores_size);
......@@ -1145,4 +1164,3 @@ PD_REGISTER_KERNEL(multiclass_nms3, // cuda_only
kernel->OutputAt(2).SetDataType(phi::DataType::INT32);
}
#endif