OpenDAS / Paddle / Commits / ca9dbdb2

Commit ca9dbdb2, authored Apr 21, 2024 by yuguo-Jack
Commit message: new features
Parent: bb99f03d

Showing 4 changed files with 284 additions and 169 deletions:

    paddle/phi/kernels/funcs/blas/blas_impl.hip.h       +9    -1
    paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu    +73   -50
    paddle/phi/kernels/gpu/batch_norm_kernel.cu         +167  -101
    paddle/phi/kernels/gpu/multiclass_nms3_kernel.cu    +35   -17
paddle/phi/kernels/funcs/blas/blas_impl.hip.h

@@ -21,6 +21,7 @@
 #include "paddle/phi/kernels/funcs/math_function.h"

 PD_DECLARE_bool(enable_cublas_tensor_op_math);
+PD_DECLARE_bool(gemm_use_half_precision_compute_type);

 namespace phi {
 namespace funcs {
@@ -703,6 +704,13 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
   float h_alpha = static_cast<float>(alpha);
   float h_beta = static_cast<float>(beta);

+  rocblas_datatype compute_type = rocblas_datatype_f32_r;
+  if (FLAGS_gemm_use_half_precision_compute_type == true) {
+    compute_type = rocblas_datatype_f16_r;
+  }
+  VLOG(4) << "use_half_precision_compute_type: "
+          << FLAGS_gemm_use_half_precision_compute_type;
+
   auto &cuda_ctx = const_cast<phi::GPUContext &>(context_);
   CUBlas<phi::dtype::float16>::GEMM_EX(&cuda_ctx,
                                        cuTransB,
@@ -721,7 +729,7 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
       C,
       rocblas_datatype_f16_r,
       N,
-      rocblas_datatype_f32_r);
+      compute_type);
 }

 template <>
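Taken together, the two hunks above let the fp16 GEMM on ROCm honor FLAGS_gemm_use_half_precision_compute_type: the accumulation (compute) type handed to rocBLAS is no longer hard-coded to fp32 but switches to fp16 when the flag is set, while the A/B/C matrix types stay rocblas_datatype_f16_r either way. A minimal sketch of the selection logic, with a hypothetical helper name that is not part of the commit:

    #include <rocblas/rocblas.h>  // header path varies by ROCm version

    // Hypothetical helper: choose the rocBLAS accumulation type for an fp16 GEMM.
    // fp32 accumulation is the numerically safer default; fp16 accumulation can be
    // faster but loses precision, so it is only used when the flag asks for it.
    inline rocblas_datatype SelectGemmComputeType(bool use_half_precision) {
      return use_half_precision ? rocblas_datatype_f16_r : rocblas_datatype_f32_r;
    }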
paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu

@@ -661,14 +661,14 @@ void BatchNormGradFunctor(const Context &ctx,
   // ------------------- cudnn descriptors ---------------------
 #ifdef PADDLE_WITH_HIP
   // TODO(wangran16): wait for MIOpen to improve the performance of BN
-  // miopenTensorDescriptor_t data_desc_;
-  // miopenTensorDescriptor_t bn_param_desc_;
-  // miopenBatchNormMode_t mode_;
-  // PADDLE_ENFORCE_GPU_SUCCESS(
-  //     platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
-  // PADDLE_ENFORCE_GPU_SUCCESS(
-  //     platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
+  miopenTensorDescriptor_t data_desc_;
+  miopenTensorDescriptor_t bn_param_desc_;
+  miopenBatchNormMode_t mode_;
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenCreateTensorDescriptor(&data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
 #else
   cudnnTensorDescriptor_t data_desc_;
   cudnnTensorDescriptor_t bn_param_desc_;
@@ -687,7 +687,11 @@ void BatchNormGradFunctor(const Context &ctx,
   epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
 #ifdef PADDLE_WITH_HIP
   // TODO(wangran16): wait for MIOpen to improve the performance of BN
-  // mode_ = miopenBNSpatial;
+  if (H == 1 && W == 1) {
+    mode_ = miopenBNPerActivation;
+  } else {
+    mode_ = miopenBNSpatial;
+  }
 #elif CUDNN_VERSION_MIN(7, 0, 1)
   if (FLAGS_cudnn_batchnorm_spatial_persistent) {
     mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
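The new HIP branch derives the MIOpen batch-norm mode from the spatial shape instead of leaving it commented out: when the feature map is 1x1 there is no spatial extent to average over, so per-activation statistics are used; otherwise the usual spatial mode applies. A hedged sketch of that decision as a standalone helper (the name is illustrative, not from the commit):

    #include <miopen/miopen.h>

    // Illustrative helper mirroring the logic added above: with H == W == 1 the
    // "spatial" mean/variance would be computed over a single position per channel,
    // so per-activation batch norm is the sensible choice.
    inline miopenBatchNormMode_t ChooseBNMode(int H, int W) {
      return (H == 1 && W == 1) ? miopenBNPerActivation : miopenBNSpatial;
    }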
@@ -706,13 +710,13 @@ void BatchNormGradFunctor(const Context &ctx,
 #ifdef PADDLE_WITH_HIP
   // TODO(wangran16): wait for MIOpen to improve the performance of BN
-  // PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
-  //     data_desc_, CudnnDataType<T>::type,
-  //     x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
-  //     const_cast<int *>(strides.data())));
-  // PADDLE_ENFORCE_GPU_SUCCESS(
-  //     platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_,
-  //     data_desc_, mode_));
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor(
+      data_desc_,
+      CudnnDataType<T>::type,
+      x_dims.size() > 3 ? x_dims.size() : 4,
+      const_cast<int *>(dims.data()),
+      const_cast<int *>(strides.data())));
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDeriveBNTensorDescriptor(
+      bn_param_desc_, data_desc_, mode_));
 #else
   PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor(
       data_desc_,
@@ -750,6 +754,22 @@ void BatchNormGradFunctor(const Context &ctx,
     if (d_x && d_scale && d_bias) {
 #ifdef PADDLE_WITH_HIP
       if (compute_format == DataLayout::kNCHW) {
+        if (FLAGS_cudnn_batchnorm_spatial_persistent == true) {
+          PADDLE_ENFORCE_GPU_SUCCESS(
+              phi::dynload::miopenBatchNormalizationBackward(
+                  ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
+                  CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
+                  CudnnDataType<T>::kZero(), data_desc_,
+                  transformed_x.template data<T>(), data_desc_,
+                  transformed_d_y.template data<T>(), data_desc_,
+                  transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
+                  bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
+                  d_scale->template mutable_data<BatchNormParamType<T>>(
+                      ctx.GetPlace()),
+                  d_bias->template mutable_data<BatchNormParamType<T>>(
+                      ctx.GetPlace()),
+                  epsilon, saved_mean_data, saved_var_data));
+        } else {
           BNBackward<T, block, DataLayout::kNCHW>
               <<<grid2, block, 0, ctx.stream()>>>(
                   transformed_d_y.template data<T>(),
@@ -764,6 +784,23 @@ void BatchNormGradFunctor(const Context &ctx,
                   transformed_d_x.template data<T>(),
                   ctx.template Alloc<BatchNormParamType<T>>(d_scale),
                   ctx.template Alloc<BatchNormParamType<T>>(d_bias));
+        }
       } else {
+        if (FLAGS_cudnn_batchnorm_spatial_persistent == true) {
+          PADDLE_ENFORCE_GPU_SUCCESS(
+              phi::dynload::miopenBatchNormalizationBackward(
+                  ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
+                  CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
+                  CudnnDataType<T>::kZero(), data_desc_,
+                  transformed_x.template data<T>(), data_desc_,
+                  transformed_d_y.template data<T>(), data_desc_,
+                  transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
+                  bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
+                  d_scale->template mutable_data<BatchNormParamType<T>>(
+                      ctx.GetPlace()),
+                  d_bias->template mutable_data<BatchNormParamType<T>>(
+                      ctx.GetPlace()),
+                  epsilon, saved_mean_data, saved_var_data));
+        } else {
           BNBackward<T, block, DataLayout::kNHWC>
               <<<grid2, block, 0, ctx.stream()>>>(
@@ -780,22 +817,8 @@ void BatchNormGradFunctor(const Context &ctx,
                   ctx.template Alloc<BatchNormParamType<T>>(d_scale),
                   ctx.template Alloc<BatchNormParamType<T>>(d_bias));
+        }
       }
-      // TODO(wangran16): wait for MIOpen to improve the performance of BN
-      // PADDLE_ENFORCE_GPU_SUCCESS(
-      //     platform::dynload::miopenBatchNormalizationBackward(
-      //         dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
-      //         CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
-      //         CudnnDataType<T>::kZero(), data_desc_,
-      //         transformed_x.template data<T>(), data_desc_,
-      //         transformed_d_y.template data<T>(), data_desc_,
-      //         transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
-      //         bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
-      //         d_scale->template mutable_data<BatchNormParamType<T>>(
-      //             ctx.GetPlace()),
-      //         d_bias->template mutable_data<BatchNormParamType<T>>(
-      //             ctx.GetPlace()),
-      //         epsilon, saved_mean_data, saved_var_data));
 #else
     }
     // CUDNN only support small batch size
@@ -1129,10 +1152,10 @@ void BatchNormGradFunctor(const Context &ctx,
 #ifdef PADDLE_WITH_HIP
   // TODO(wangran16): wait for MIOpen to improve the performance of BN
   // clean when exit.
-  // PADDLE_ENFORCE_GPU_SUCCESS(
-  //     platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
-  // PADDLE_ENFORCE_GPU_SUCCESS(
-  //     platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenDestroyTensorDescriptor(data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
 #else
   // clean when exit.
   PADDLE_ENFORCE_GPU_SUCCESS(
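With the commented-out code re-enabled, the HIP path of BatchNormGradFunctor now follows the same descriptor lifecycle as the cuDNN path: create the tensor and BN-parameter descriptors, describe the input layout with miopenSetTensorDescriptor, derive the BN parameter descriptor from it, run the MIOpen call, and destroy both descriptors on exit. A small RAII-style sketch of that lifecycle (an illustration only; the real code keeps raw descriptors and PADDLE_ENFORCE_GPU_SUCCESS):

    #include <cstdlib>
    #include <vector>
    #include <miopen/miopen.h>

    // Illustrative only: abort on any MIOpen error instead of Paddle's enforce macro.
    static void CheckMiopen(miopenStatus_t s) {
      if (s != miopenStatusSuccess) std::abort();
    }

    class BNDescriptors {
     public:
      BNDescriptors(miopenDataType_t dtype, std::vector<int> dims,
                    std::vector<int> strides, miopenBatchNormMode_t mode) {
        CheckMiopen(miopenCreateTensorDescriptor(&data_desc_));
        CheckMiopen(miopenCreateTensorDescriptor(&bn_param_desc_));
        CheckMiopen(miopenSetTensorDescriptor(data_desc_, dtype,
                                              static_cast<int>(dims.size()),
                                              dims.data(), strides.data()));
        CheckMiopen(
            miopenDeriveBNTensorDescriptor(bn_param_desc_, data_desc_, mode));
      }
      ~BNDescriptors() {
        miopenDestroyTensorDescriptor(data_desc_);
        miopenDestroyTensorDescriptor(bn_param_desc_);
      }
      miopenTensorDescriptor_t data_desc() const { return data_desc_; }
      miopenTensorDescriptor_t bn_param_desc() const { return bn_param_desc_; }

     private:
      miopenTensorDescriptor_t data_desc_{nullptr};
      miopenTensorDescriptor_t bn_param_desc_{nullptr};
    };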
paddle/phi/kernels/gpu/batch_norm_kernel.cu

@@ -604,14 +604,14 @@ void BatchNormKernel(const Context &ctx,
   // ------------------- cudnn descriptors ---------------------
 #ifdef PADDLE_WITH_HIP
   // TODO(wangran16): wait for MIOpen to improve the performance of BN
-  // miopenTensorDescriptor_t data_desc_;
-  // miopenTensorDescriptor_t bn_param_desc_;
-  // miopenBatchNormMode_t mode_;
-  // PADDLE_ENFORCE_GPU_SUCCESS(
-  //     platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
-  // PADDLE_ENFORCE_GPU_SUCCESS(
-  //     platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
+  miopenTensorDescriptor_t data_desc_;
+  miopenTensorDescriptor_t bn_param_desc_;
+  miopenBatchNormMode_t mode_;
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenCreateTensorDescriptor(&data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
 #else
   cudnnTensorDescriptor_t data_desc_;
   cudnnTensorDescriptor_t bn_param_desc_;
@@ -632,7 +632,11 @@ void BatchNormKernel(const Context &ctx,
 #ifdef PADDLE_WITH_HIP
   // TODO(wangran16): wait for MIOpen to improve the performance of BN
-  // mode_ = miopenBNSpatial;
+  if (H == 1 && W == 1) {
+    mode_ = miopenBNPerActivation;
+  } else {
+    mode_ = miopenBNSpatial;
+  }
 #elif CUDNN_VERSION_MIN(7, 0, 1)
   if (FLAGS_cudnn_batchnorm_spatial_persistent) {
     mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
@@ -662,14 +666,14 @@ void BatchNormKernel(const Context &ctx,
 #ifdef PADDLE_WITH_HIP
   // TODO(wangran16): wait for MIOpen to improve the performance of BN
-  // PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
-  //     data_desc_, CudnnDataType<T>::type,
-  //     x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
-  //     const_cast<int *>(strides.data())));
-  // Note: PERSISTENT not implemented for inference
-  // PADDLE_ENFORCE_GPU_SUCCESS(
-  //     platform::dynload::miopenDeriveBNTensorDescriptor(
-  //     bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_));
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor(
+      data_desc_,
+      CudnnDataType<T>::type,
+      x_dims.size() > 3 ? x_dims.size() : 4,
+      const_cast<int *>(dims.data()),
+      const_cast<int *>(strides.data())));
+  // Note: PERSISTENT not implemented for inference
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDeriveBNTensorDescriptor(
+      bn_param_desc_, data_desc_, mode_));
 #else
   PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor(
       data_desc_,
@@ -736,6 +740,30 @@ void BatchNormKernel(const Context &ctx,
       const int block_size = 256;
       const int grid_size = (N * C * H * W * D + block_size - 1) / block_size;
       if (compute_format == DataLayout::kNCHW) {
+        if (FLAGS_cudnn_batchnorm_spatial_persistent == true) {
+          PADDLE_ENFORCE_GPU_SUCCESS(
+              phi::dynload::miopenBatchNormalizationForwardInference(
+                  handle,
+                  mode_,
+                  const_cast<void *>(
+                      static_cast<const void *>(CudnnDataType<T>::kOne())),
+                  const_cast<void *>(
+                      static_cast<const void *>(CudnnDataType<T>::kZero())),
+                  data_desc_,
+                  static_cast<const void *>(transformed_x.template data<T>()),
+                  data_desc_,
+                  static_cast<void *>(
+                      transformed_y.template mutable_data<T>(ctx.GetPlace())),
+                  bn_param_desc_,
+                  const_cast<void *>(static_cast<const void *>(
+                      scale->template data<BatchNormParamType<T>>())),
+                  const_cast<void *>(static_cast<const void *>(
+                      bias->template data<BatchNormParamType<T>>())),
+                  const_cast<void *>(static_cast<const void *>(
+                      est_mean->template data<BatchNormParamType<T>>())),
+                  const_cast<void *>(static_cast<const void *>(
+                      est_var->template data<BatchNormParamType<T>>())),
+                  epsilon));
+        } else {
           BNForwardInference<T, DataLayout::kNCHW>
               <<<grid_size, block_size, 0, ctx.stream()>>>(
                   transformed_x.template data<T>(),
@@ -748,6 +776,31 @@ void BatchNormKernel(const Context &ctx,
                   H * W * D,
                   epsilon,
                   transformed_y.template data<T>());
+        }
       } else {
+        if (FLAGS_cudnn_batchnorm_spatial_persistent == true) {
+          PADDLE_ENFORCE_GPU_SUCCESS(
+              phi::dynload::miopenBatchNormalizationForwardInference(
+                  handle,
+                  mode_,
+                  const_cast<void *>(
+                      static_cast<const void *>(CudnnDataType<T>::kOne())),
+                  const_cast<void *>(
+                      static_cast<const void *>(CudnnDataType<T>::kZero())),
+                  data_desc_,
+                  static_cast<const void *>(transformed_x.template data<T>()),
+                  data_desc_,
+                  static_cast<void *>(
+                      transformed_y.template mutable_data<T>(ctx.GetPlace())),
+                  bn_param_desc_,
+                  const_cast<void *>(static_cast<const void *>(
+                      scale->template data<BatchNormParamType<T>>())),
+                  const_cast<void *>(static_cast<const void *>(
+                      bias->template data<BatchNormParamType<T>>())),
+                  const_cast<void *>(static_cast<const void *>(
+                      est_mean->template data<BatchNormParamType<T>>())),
+                  const_cast<void *>(static_cast<const void *>(
+                      est_var->template data<BatchNormParamType<T>>())),
+                  epsilon));
+        } else {
           BNForwardInference<T, DataLayout::kNHWC>
               <<<grid_size, block_size, 0, ctx.stream()>>>(
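Both inference branches repeat the same const_cast<void *>(static_cast<const void *>(...)) pattern because miopenBatchNormalizationForwardInference takes plain void * for the alpha/beta scalars and the scale/bias/statistics pointers, while the Paddle tensors hand back const T *. A hedged sketch of a tiny helper that names this pattern (illustrative only; the commit writes the casts out inline):

    // Illustrative helper for the cast pattern above: MIOpen wants a mutable
    // void * even for inputs it never writes, so we first erase the type with
    // static_cast and then drop constness with const_cast.
    template <typename T>
    void *AsMutableVoidPtr(const T *p) {
      return const_cast<void *>(static_cast<const void *>(p));
    }

    // Usage (hypothetical call site):
    //   AsMutableVoidPtr(scale->template data<BatchNormParamType<T>>())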
@@ -762,29 +815,8 @@ void BatchNormKernel(const Context &ctx,
                   epsilon,
                   transformed_y.template data<T>());
+        }
       }
-      // TODO(wangran16): wait for MIOpen to improve the performance of BN
-      // PADDLE_ENFORCE_GPU_SUCCESS(
-      //     platform::dynload::miopenBatchNormalizationForwardInference(
-      //         handle, miopenBNSpatial,
-      //         const_cast<void *>(
-      //             static_cast<const void *>(CudnnDataType<T>::kOne())),
-      //         const_cast<void *>(
-      //             static_cast<const void *>(CudnnDataType<T>::kZero())),
-      //         data_desc_,
-      //         static_cast<const void *>(transformed_x.template data<T>()),
-      //         data_desc_,
-      //         static_cast<void *>(
-      //             transformed_y.template mutable_data<T>(ctx.GetPlace())),
-      //         bn_param_desc_,
-      //         const_cast<void *>(static_cast<const void *>(
-      //             scale->template data<BatchNormParamType<T>>())),
-      //         const_cast<void *>(static_cast<const void *>(
-      //             bias->template data<BatchNormParamType<T>>())),
-      //         const_cast<void *>(static_cast<const void *>(
-      //             est_mean->template data<BatchNormParamType<T>>())),
-      //         const_cast<void *>(static_cast<const void *>(
-      //             est_var->template data<BatchNormParamType<T>>())),
-      //         epsilon));
     }
 #else
     const bool use_native_kernel =
         (x_dims.size() == 2 ||
@@ -900,6 +932,36 @@ void BatchNormKernel(const Context &ctx,
         const int max_blocks = std::max(max_threads / block, 1);
         const int grid = std::min(C, max_blocks);
         if (compute_format == DataLayout::kNCHW) {
+          if (FLAGS_cudnn_batchnorm_spatial_persistent == true) {
+            PADDLE_ENFORCE_GPU_SUCCESS(
+                phi::dynload::miopenBatchNormalizationForwardTraining(
+                    handle,
+                    mode_,
+                    const_cast<void *>(
+                        static_cast<const void *>(CudnnDataType<T>::kOne())),
+                    const_cast<void *>(
+                        static_cast<const void *>(CudnnDataType<T>::kZero())),
+                    data_desc_,
+                    static_cast<const void *>(transformed_x.template data<T>()),
+                    data_desc_,
+                    static_cast<void *>(
+                        transformed_y.template mutable_data<T>(ctx.GetPlace())),
+                    bn_param_desc_,
+                    const_cast<void *>(static_cast<const void *>(
+                        scale->template data<BatchNormParamType<T>>())),
+                    const_cast<void *>(static_cast<const void *>(
+                        bias->template data<BatchNormParamType<T>>())),
+                    this_factor,
+                    static_cast<void *>(
+                        mean_out->template mutable_data<BatchNormParamType<T>>(
+                            ctx.GetPlace())),
+                    static_cast<void *>(
+                        variance_out->template mutable_data<BatchNormParamType<T>>(
+                            ctx.GetPlace())),
+                    epsilon,
+                    static_cast<void *>(
+                        saved_mean->template mutable_data<BatchNormParamType<T>>(
+                            ctx.GetPlace())),
+                    static_cast<void *>(
+                        saved_variance->template mutable_data<BatchNormParamType<T>>(
+                            ctx.GetPlace()))));
+          } else {
            BNForwardTraining<T, block, DataLayout::kNCHW>
                <<<grid, block, 0, ctx.stream()>>>(
                    transformed_x.template data<T>(),
@@ -915,6 +977,37 @@ void BatchNormKernel(const Context &ctx,
                    variance_out->template data<BatchNormParamType<T>>(),
                    saved_mean->template data<BatchNormParamType<T>>(),
                    saved_variance->template data<BatchNormParamType<T>>());
+          }
         } else {
+          if (FLAGS_cudnn_batchnorm_spatial_persistent == true) {
+            PADDLE_ENFORCE_GPU_SUCCESS(
+                phi::dynload::miopenBatchNormalizationForwardTraining(
+                    handle,
+                    mode_,
+                    const_cast<void *>(
+                        static_cast<const void *>(CudnnDataType<T>::kOne())),
+                    const_cast<void *>(
+                        static_cast<const void *>(CudnnDataType<T>::kZero())),
+                    data_desc_,
+                    static_cast<const void *>(transformed_x.template data<T>()),
+                    data_desc_,
+                    static_cast<void *>(
+                        transformed_y.template mutable_data<T>(ctx.GetPlace())),
+                    bn_param_desc_,
+                    const_cast<void *>(static_cast<const void *>(
+                        scale->template data<BatchNormParamType<T>>())),
+                    const_cast<void *>(static_cast<const void *>(
+                        bias->template data<BatchNormParamType<T>>())),
+                    this_factor,
+                    static_cast<void *>(
+                        mean_out->template mutable_data<BatchNormParamType<T>>(
+                            ctx.GetPlace())),
+                    static_cast<void *>(
+                        variance_out->template mutable_data<BatchNormParamType<T>>(
+                            ctx.GetPlace())),
+                    epsilon,
+                    static_cast<void *>(
+                        saved_mean->template mutable_data<BatchNormParamType<T>>(
+                            ctx.GetPlace())),
+                    static_cast<void *>(
+                        saved_variance->template mutable_data<BatchNormParamType<T>>(
+                            ctx.GetPlace()))));
+          } else {
            BNForwardTraining<T, block, DataLayout::kNHWC>
                <<<grid, block, 0, ctx.stream()>>>(
@@ -932,35 +1025,8 @@ void BatchNormKernel(const Context &ctx,
                    saved_mean->template data<BatchNormParamType<T>>(),
                    saved_variance->template data<BatchNormParamType<T>>());
+          }
         }
-        // TODO(wangran16): wait for MIOpen to improve the performance of BN
-        // PADDLE_ENFORCE_GPU_SUCCESS(
-        //     platform::dynload::miopenBatchNormalizationForwardTraining(
-        //         handle, mode_, const_cast<void *>(static_cast<const void *>(
-        //                            CudnnDataType<T>::kOne())),
-        //         const_cast<void *>(
-        //             static_cast<const void *>(CudnnDataType<T>::kZero())),
-        //         data_desc_,
-        //         static_cast<const void *>(transformed_x.template data<T>()),
-        //         data_desc_,
-        //         static_cast<void *>(
-        //             transformed_y.template mutable_data<T>(ctx.GetPlace())),
-        //         bn_param_desc_,
-        //         const_cast<void *>(static_cast<const void *>(
-        //             scale->template data<BatchNormParamType<T>>())),
-        //         const_cast<void *>(static_cast<const void *>(
-        //             bias->template data<BatchNormParamType<T>>())),
-        //         this_factor,
-        //         static_cast<void *>(
-        //             mean_out->template mutable_data<BatchNormParamType<T>>(
-        //                 ctx.GetPlace())),
-        //         static_cast<void *>(variance_out->template mutable_data<
-        //                             BatchNormParamType<T>>(ctx.GetPlace())),
-        //         epsilon,
-        //         static_cast<void *>(
-        //             saved_mean->template mutable_data<BatchNormParamType<T>>(
-        //                 ctx.GetPlace())),
-        //         static_cast<void *>(saved_variance->template mutable_data<
-        //                             BatchNormParamType<T>>(ctx.GetPlace()))));
       }
 #else
       // const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070;
       const bool use_native_kernel =
@@ -1221,10 +1287,10 @@ void BatchNormKernel(const Context &ctx,
 #ifdef PADDLE_WITH_HIP
   // TODO(wangran16): wait for MIOpen to improve the performance of BN
   // clean when exit.
-  // PADDLE_ENFORCE_GPU_SUCCESS(
-  //     platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
-  // PADDLE_ENFORCE_GPU_SUCCESS(
-  //     platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenDestroyTensorDescriptor(data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
 #else
   // clean when exit.
   PADDLE_ENFORCE_GPU_SUCCESS(
paddle/phi/kernels/gpu/multiclass_nms3_kernel.cu

@@ -12,12 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#ifndef PADDLE_WITH_HIP
 #include "paddle/phi/kernels/multiclass_nms3_kernel.h"

+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#else
 #include <cub/cub.cuh>
 #include "cuda.h"  // NOLINT
+#endif
+
+#ifdef PADDLE_WITH_HIP
+#define GPU(str) hip##str
+#else
+#define GPU(str) cuda##str
+#endif

 #include "paddle/phi/backends/context_pool.h"
 #include "paddle/phi/common/place.h"
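The GPU(str) token-pasting macro is what lets the rest of this file compile unchanged on both runtimes: every cuda-prefixed type or function is spelled GPU(...), which expands to the hip-prefixed equivalent under PADDLE_WITH_HIP and to the original CUDA name otherwise. A minimal, self-contained sketch of how the expansion behaves (a hypothetical demo, not part of the commit):

    #include <cstddef>
    #ifdef PADDLE_WITH_HIP
    #include <hip/hip_runtime.h>
    #define GPU(str) hip##str
    #else
    #include <cuda_runtime.h>
    #define GPU(str) cuda##str
    #endif

    // GPU(Stream_t)     -> hipStream_t     or cudaStream_t
    // GPU(MemcpyAsync)  -> hipMemcpyAsync  or cudaMemcpyAsync
    // GPU(GetLastError) -> hipGetLastError or cudaGetLastError
    void DemoCopy(void *dst, const void *src, std::size_t bytes,
                  GPU(Stream_t) stream) {
      // Both runtimes share these signatures, so one spelling covers both.
      GPU(MemcpyAsync)(dst, src, bytes, GPU(MemcpyDeviceToDevice), stream);
      GPU(Error_t) err = GPU(GetLastError)();
      (void)err;  // the real file wraps such calls in PADDLE_ENFORCE_GPU_SUCCESS
    }

This only works because the HIP runtime mirrors the CUDA runtime's naming and signatures for these entry points; anything without a one-to-one counterpart still needs an explicit #ifdef, as the cub/hipcub include block above shows.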
@@ -234,7 +245,7 @@ __launch_bounds__(nthds_per_cta) __global__
 }

 template <typename T_SCORE>
-void SortScoresPerClassGPU(cudaStream_t stream,
+void SortScoresPerClassGPU(GPU(Stream_t) stream,
                            const int num,
                            const int num_classes,
                            const int num_preds_per_class,
@@ -298,7 +309,7 @@ void SortScoresPerClassGPU(cudaStream_t stream,
                           begin_bit,
                           end_bit,
                           stream);
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+  PADDLE_ENFORCE_GPU_SUCCESS(GPU(GetLastError)());
 }

 /* ===========
@@ -539,7 +550,7 @@ __global__ void AllClassNMSKernel(
 }

 template <typename T_SCORE, typename T_BBOX>
-void AllClassNMSGPU(cudaStream_t stream,
+void AllClassNMSGPU(GPU(Stream_t) stream,
                     const int num,
                     const int num_classes,
                     const int num_preds_per_class,
@@ -603,7 +614,7 @@ void AllClassNMSGPU(cudaStream_t stream,
                     score_shift,
                     caffe_semantics);
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+  PADDLE_ENFORCE_GPU_SUCCESS(GPU(GetLastError)());
 }

 /* ==================
@@ -618,11 +629,15 @@ __launch_bounds__(nthds_per_cta) __global__
   if (idx <= num_segments) d_offsets[idx] = idx * offset;
 }

-void SetUniformOffsets(cudaStream_t stream,
+void SetUniformOffsets(GPU(Stream_t) stream,
                        const int num_segments,
                        const int offset,
                        int *d_offsets) {
+#ifdef PADDLE_WITH_HIP
+  const int BS = 256;
+#else
   const int BS = 32;
+#endif
   const int GS = (num_segments + 1 + BS - 1) / BS;
   SetUniformOffsetsKernel<BS>
       <<<GS, BS, 0, stream>>>(num_segments, offset, d_offsets);
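SetUniformOffsets (and GatherNMSOutputsGPU below) now pick a larger thread-block size on ROCm, presumably so the CUDA-tuned value of 32 does not underfill AMD's 64-wide wavefronts; the grid size is the usual ceiling division over the num_segments + 1 offsets being written. A hedged sketch of that launch-configuration arithmetic (names are illustrative):

    // Illustrative launch-configuration helpers matching the pattern above.
    #ifdef PADDLE_WITH_HIP
    constexpr int kBlockSize = 256;  // AMD compute GPUs execute 64-thread wavefronts
    #else
    constexpr int kBlockSize = 32;   // NVIDIA warps are 32 threads wide
    #endif

    constexpr int CeilDiv(int n, int block) { return (n + block - 1) / block; }

    // For num_segments segments the kernel writes num_segments + 1 offsets,
    // so the grid must cover that extra element:
    //   const int GS = CeilDiv(num_segments + 1, kBlockSize);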
@@ -706,7 +721,7 @@ __launch_bounds__(nthds_per_cta) __global__
 }

 template <typename T_BBOX, typename T_SCORE>
-void GatherNMSOutputsGPU(cudaStream_t stream,
+void GatherNMSOutputsGPU(GPU(Stream_t) stream,
                          const bool share_location,
                          const int num_images,
                          const int num_preds_per_class,
@@ -725,8 +740,12 @@ void GatherNMSOutputsGPU(cudaStream_t stream,
                          bool clip_boxes,
                          const float score_shift) {
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
-      num_detections, 0, num_images * sizeof(int), stream));
+  PADDLE_ENFORCE_GPU_SUCCESS(GPU(MemsetAsync)(
+      num_detections, 0, num_images * sizeof(int), stream));
+#ifdef PADDLE_WITH_HIP
+  const int BS = 256;
+#else
   const int BS = 32;
+#endif
   const int GS = 32;
   GatherNMSOutputsKernel<T_BBOX, T_SCORE, BS>
       <<<GS, BS, 0, stream>>>(share_location,
@@ -747,11 +766,11 @@ void GatherNMSOutputsGPU(cudaStream_t stream,
                          clip_boxes,
                          T_SCORE(score_shift));
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+  PADDLE_ENFORCE_GPU_SUCCESS(GPU(GetLastError)());
 }

 template <typename T_SCORE>
-void SortScoresPerImageGPU(cudaStream_t stream,
+void SortScoresPerImageGPU(GPU(Stream_t) stream,
                            const int num_images,
                            const int num_items_per_image,
                            void *unsorted_scores,
@@ -792,11 +811,11 @@ void SortScoresPerImageGPU(cudaStream_t stream,
                        begin_bit,
                        end_bit,
                        stream);
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
+  PADDLE_ENFORCE_GPU_SUCCESS(GPU(GetLastError)());
 }

 template <typename T>
-void InferNMS(cudaStream_t stream,
+void InferNMS(GPU(Stream_t) stream,
               const int N,
               const int per_batch_boxes_size,
               const int per_batch_scores_size,
@@ -831,10 +850,10 @@ void InferNMS(cudaStream_t stream,
   size_t bbox_data_size =
       CalcDetectionForwardBBoxDataSize<T>(N, per_batch_boxes_size);
   void *bbox_data_raw = workspace;
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(bbox_data_raw,
-                                             loc_data,
-                                             bbox_data_size,
-                                             cudaMemcpyDeviceToDevice,
-                                             stream));
+  PADDLE_ENFORCE_GPU_SUCCESS(GPU(MemcpyAsync)(bbox_data_raw,
+                                              loc_data,
+                                              bbox_data_size,
+                                              GPU(MemcpyDeviceToDevice),
+                                              stream));
   void *bbox_data = bbox_data_raw;
@@ -843,8 +862,8 @@ void InferNMS(cudaStream_t stream,
       CalcDetectionForwardPreNMSSize<T>(N, per_batch_scores_size);
   void *scores = GetNextWorkspacePtr(
       reinterpret_cast<int8_t *>(bbox_data), bbox_data_size);
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(
-      scores, conf_data, total_scores_size, cudaMemcpyDeviceToDevice, stream));
+  PADDLE_ENFORCE_GPU_SUCCESS(GPU(MemcpyAsync)(
+      scores, conf_data, total_scores_size, GPU(MemcpyDeviceToDevice), stream));
   size_t indices_size =
       CalcDetectionForwardPreNMSSize<int>(N, per_batch_scores_size);
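InferNMS stages its temporaries in one pre-allocated workspace: the box data is copied to the front, and each later buffer (scores, indices, and so on) is obtained by advancing past the previous one with GetNextWorkspacePtr. A hedged sketch of how such a chaining helper typically works (the actual helper in this file may differ, for example in its alignment policy):

    #include <cstdint>

    // Illustrative only: advance past the previous buffer and round the result
    // up to a 16-byte boundary so the next buffer is safely aligned.
    inline void *NextWorkspacePtr(std::int8_t *prev, std::size_t prev_size) {
      std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(prev) + prev_size;
      constexpr std::uintptr_t kAlign = 16;
      addr = (addr + kAlign - 1) & ~(kAlign - 1);
      return reinterpret_cast<void *>(addr);
    }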
@@ -1145,4 +1164,3 @@ PD_REGISTER_KERNEL(multiclass_nms3,  // cuda_only
   kernel->OutputAt(2).SetDataType(phi::DataType::INT32);
 }
-#endif