OpenDAS / Paddle / Commits

Commit dbe08e9b, authored Jun 12, 2023 by yuguo960516yuguo
Commit message: 2.4.2
Parent: b5499578

Changes: 302 files in this commit; this page shows 20 changed files with 739 additions and 116 deletions (+739 -116).
paddle/fluid/inference/api/paddle_pass_builder.h                    +0   -1
paddle/fluid/inference/tensorrt/engine.h                            +0   -27
paddle/fluid/inference/tensorrt/op_teller.cc                        +10  -4
paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.cc       +6   -0
paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h        +2   -0
paddle/fluid/inference/tests/api/CMakeLists.txt                     +3   -0
paddle/fluid/inference/tests/api/gpu_ernie_half_test.cc             +290 -0
paddle/fluid/inference/tests/api/paddle_infer_api_test.cc           +2   -6
paddle/fluid/operators/collective/barrier_op_mlu.cc                 +63  -0
paddle/fluid/operators/collective/c_allgather_op_mlu.cc             +46  -8
paddle/fluid/operators/detection/CMakeLists.txt                     +5   -2
paddle/fluid/operators/detection/prior_box_op_mlu.cc                +104 -0
paddle/fluid/operators/detection/yolo_box_op_mlu.cc                 +137 -0
paddle/fluid/operators/dropout_op_mlu.cc                            +37  -34
paddle/fluid/operators/fused/CMakeLists.txt                         +1   -1
paddle/fluid/operators/fused/cudnn_norm_conv.cu.h                   +8   -0
paddle/fluid/operators/fused/cudnn_norm_conv_test.cc                +4   -4
paddle/fluid/operators/fused/fused_dropout_act_bias.h               +13  -11
paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu   +6   -15
paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc              +2   -3
paddle/fluid/inference/api/paddle_pass_builder.h (+0 -1)

@@ -115,7 +115,6 @@ class PD_INFER_DECL PaddlePassBuilder {
   /// \cond Protected
   std::vector<std::string> analysis_passes_{
       {"ir_graph_build_pass",
-       "ir_graph_clean_pass",
        "ir_analysis_pass",
        "ir_params_sync_among_devices_pass",
        "adjust_cudnn_workspace_size_pass",
paddle/fluid/inference/tensorrt/engine.h (+0 -27)

@@ -294,15 +294,6 @@ class TensorRTEngine {
   nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }

   nvinfer1::IExecutionContext* context() {
-#ifndef PADDLE_WITH_TESTING
-    PADDLE_ENFORCE_GT(
-        predictor_id_per_thread,
-        -1,
-        platform::errors::InvalidArgument(
-            "thread local var predictor_id_per_thread must be "
-            "initialized to >= 0, but now predictor_id_per_thread = %d",
-            predictor_id_per_thread));
-#endif
     std::unique_lock<std::mutex> lock(mutex_);
     if (infer_context_.find(predictor_id_per_thread) == infer_context_.end()) {
       PADDLE_ENFORCE_NOT_NULL(

@@ -329,15 +320,6 @@ class TensorRTEngine {
   int GetProfileIndex() {
     if (max_profile_num_ > 1) {
-#ifndef PADDLE_WITH_TESTING
-      PADDLE_ENFORCE_GT(
-          predictor_id_per_thread,
-          -1,
-          platform::errors::InvalidArgument(
-              "thread local var predictor_id_per_thread must be "
-              "initialized to >= 0, but now predictor_id_per_thread = %d",
-              predictor_id_per_thread));
-#endif
       std::unique_lock<std::mutex> lock(mutex_);
       return profile_index_[predictor_id_per_thread];
     } else {

@@ -356,15 +338,6 @@ class TensorRTEngine {
         infer_engine_,
         platform::errors::InvalidArgument(
             "You should build engine first and then set the context."));
-#ifndef PADDLE_WITH_TESTING
-    PADDLE_ENFORCE_GT(
-        predictor_id_per_thread,
-        -1,
-        platform::errors::InvalidArgument(
-            "thread local var predictor_id_per_thread must be "
-            "initialized to >= 0, but now predictor_id_per_thread = %d",
-            predictor_id_per_thread));
-#endif
     std::unique_lock<std::mutex> lock(mutex_);
     infer_context_[predictor_id_per_thread].reset(nullptr);
     infer_context_.erase(predictor_id_per_thread);
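For context on what stays behind after these deletions: context() still hands each predictor thread its own IExecutionContext, looked up in infer_context_ under mutex_ via the thread-local predictor_id_per_thread; only the >= 0 validation is dropped. A minimal, self-contained sketch of that per-thread-slot pattern (hypothetical names, not Paddle's API):

    #include <map>
    #include <memory>
    #include <mutex>

    struct ExecContext { /* stand-in for nvinfer1::IExecutionContext */ };

    class ContextRegistry {
     public:
      ExecContext* context_for(int slot_id) {
        std::unique_lock<std::mutex> lock(mutex_);
        auto it = contexts_.find(slot_id);
        if (it == contexts_.end()) {
          // Lazily create one execution context per predictor slot.
          it = contexts_.emplace(slot_id, std::make_unique<ExecContext>()).first;
        }
        return it->second.get();
      }

     private:
      std::mutex mutex_;
      std::map<int, std::unique_ptr<ExecContext>> contexts_;
    };

    // Each predictor thread sets its own slot before touching the registry.
    thread_local int predictor_slot = -1;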
paddle/fluid/inference/tensorrt/op_teller.cc (+10 -4)

@@ -639,8 +639,12 @@ struct SimpleOpTypeSetTeller : public Teller {
       int axis = desc.HasAttr("axis")
                      ? PADDLE_GET_CONST(int64_t, desc.GetAttr("axis"))
                      : -1;
-      bool flatten = PADDLE_GET_CONST(bool, desc.GetAttr("flatten"));
-      int dtype = PADDLE_GET_CONST(int, desc.GetAttr("dtype"));
+      bool flatten = desc.HasAttr("flatten")
+                         ? PADDLE_GET_CONST(bool, desc.GetAttr("flatten"))
+                         : false;
+      int dtype = desc.HasAttr("dtype")
+                      ? PADDLE_GET_CONST(int, desc.GetAttr("dtype"))
+                      : 3;
       if (axis == 0 || flatten || dtype != 2) return false;
     }

@@ -1708,8 +1712,10 @@ struct SimpleOpTypeSetTeller : public Teller {
         return false;
       }
     } else {
-#if !IS_TRT_VERSION_GE(8000)
-      VLOG(3) << "The version of TRT must be greater than 8000";
+#if (IS_TRT_VERSION_GE(8000) && IS_TRT_VERSION_LT(8100)) || \
+    (IS_TRT_VERSION_LT(7200))
+      VLOG(3) << "There are some bugs in v8.0.* and the versions lower than "
+                 "v7.2 are not supported";
       return false;
#endif
     }
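The first hunk swaps unconditional PADDLE_GET_CONST reads for HasAttr guards, so program descs that omit an attribute fall back to a default instead of crashing the teller. A minimal sketch of the same guard, with a toy desc type standing in for Paddle's OpDesc (illustration only, not the real interface):

    #include <cstdint>
    #include <map>
    #include <string>

    // Toy stand-in for an op desc whose attributes may be absent.
    struct OpDescView {
      std::map<std::string, std::int64_t> attrs;
      bool HasAttr(const std::string& name) const { return attrs.count(name) > 0; }
      std::int64_t GetAttr(const std::string& name) const { return attrs.at(name); }
    };

    // Read an optional attribute with an explicit fallback, mirroring the
    // HasAttr(...) ? PADDLE_GET_CONST(...) : default pattern in the hunk above.
    std::int64_t AttrOr(const OpDescView& desc, const std::string& name,
                        std::int64_t fallback) {
      return desc.HasAttr(name) ? desc.GetAttr(name) : fallback;
    }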
paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.cc (+6 -0)

@@ -104,6 +104,7 @@ bool PluginArgumentMappingContext::IsSelectedRowsInput(
     const std::string& name) const {
   return false;
 }

 bool PluginArgumentMappingContext::IsSparseCooTensorInput(
     const std::string& name) const {
   return false;

@@ -112,6 +113,11 @@ bool PluginArgumentMappingContext::IsSparseCsrTensorInput(
     const std::string& name) const {
   return false;
 }
+
+bool PluginArgumentMappingContext::IsSelectedRowsInputs(
+    const std::string& name) const {
+  return false;
+}

 bool PluginArgumentMappingContext::IsDenseTensorVectorInput(
     const std::string& name) const {
   return false;
paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h (+2 -0)

@@ -50,6 +50,8 @@ class PluginArgumentMappingContext : public ::phi::ArgumentMappingContext {
   bool IsSparseCsrTensorInput(const std::string& name) const override;
+  bool IsSelectedRowsInputs(const std::string& name) const override;
+  bool IsDenseTensorVectorInput(const std::string& name) const override;
   bool IsDenseTensorOutput(const std::string& name) const override;
paddle/fluid/inference/tests/api/CMakeLists.txt (+3 -0)

@@ -416,6 +416,9 @@ download_result(${ERNIE_INSTALL_DIR} "Ernie_result.txt.tar.gz"
 if(WITH_GPU)
   inference_analysis_api_test(test_analyzer_ernie ${ERNIE_INSTALL_DIR}
                               analyzer_ernie_tester.cc)
+  inference_analysis_api_test(gpu_ernie_half_test ${ERNIE_INSTALL_DIR}
+                              gpu_ernie_half_test.cc)
+  set_tests_properties(gpu_ernie_half_test PROPERTIES TIMEOUT 60)
 endif()
 inference_analysis_api_int8_test(test_analyzer_ernie_int8 ${ERNIE_INSTALL_DIR}
                                  analyzer_ernie_int8_tester.cc)
paddle/fluid/inference/tests/api/gpu_ernie_half_test.cc (new file, mode 100644, +290 -0)

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"

namespace paddle {
namespace inference {

using paddle::PaddleTensor;

template <typename T>
void GetValueFromStream(std::stringstream *ss, T *t) {
  (*ss) >> (*t);
}

template <>
void GetValueFromStream<std::string>(std::stringstream *ss, std::string *t) {
  *t = ss->str();
}

// Split string to vector
template <typename T>
void Split(const std::string &line, char sep, std::vector<T> *v) {
  std::stringstream ss;
  T t;
  for (auto c : line) {
    if (c != sep) {
      ss << c;
    } else {
      GetValueFromStream<T>(&ss, &t);
      v->push_back(std::move(t));
      ss.str({});
      ss.clear();
    }
  }

  if (!ss.str().empty()) {
    GetValueFromStream<T>(&ss, &t);
    v->push_back(std::move(t));
    ss.str({});
    ss.clear();
  }
}

// Parse tensor from string
template <typename T>
bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) {
  std::vector<std::string> data;
  Split(field, ':', &data);
  if (data.size() < 2) return false;

  std::string shape_str = data[0];
  std::vector<int> shape;
  Split(shape_str, ' ', &shape);

  std::string mat_str = data[1];
  std::vector<T> mat;
  Split(mat_str, ' ', &mat);

  tensor->shape = shape;
  auto size =
      std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) *
      sizeof(T);
  tensor->data.Resize(size);
  std::copy(mat.begin(), mat.end(), static_cast<T *>(tensor->data.data()));
  tensor->dtype = GetPaddleDType<T>();

  return true;
}

// Parse input tensors from string
bool ParseLine(const std::string &line,
               std::vector<paddle::PaddleTensor> *tensors) {
  std::vector<std::string> fields;
  Split(line, ';', &fields);

  tensors->clear();
  tensors->reserve(4);

  int i = 0;
  auto input_name = FLAGS_ernie_large ? "eval_placeholder_" : "placeholder_";
  for (; i < 3; i++) {
    paddle::PaddleTensor temp;
    ParseTensor<int64_t>(fields[i], &temp);
    temp.name = input_name + std::to_string(i);
    tensors->push_back(temp);
  }

  // input_mask
  paddle::PaddleTensor input_mask;
  ParseTensor<float>(fields[i], &input_mask);
  input_mask.name = input_name + std::to_string(i);
  tensors->push_back(input_mask);

  return true;
}

bool LoadInputData(std::vector<std::vector<paddle::PaddleTensor>> *inputs,
                   int batch_size = 1) {
  if (FLAGS_infer_data.empty()) {
    LOG(ERROR) << "please set input data path";
    return false;
  }

  std::ifstream fin(FLAGS_infer_data);
  std::string line;
  int sample = 0;

  // The unit-test dataset only have 10 samples, each sample have 5 feeds.
  while (std::getline(fin, line)) {
    std::vector<paddle::PaddleTensor> feed_data;
    ParseLine(line, &feed_data);
    inputs->push_back(std::move(feed_data));
    sample++;
    if (!FLAGS_test_all_data && sample == batch_size) break;
  }
  LOG(INFO) << "number of samples: " << sample;
  return true;
}

// Compare results
TEST(Ernie_gpu_fp16_no_ir, compare_results) {
  AnalysisConfig config;
  config.SetModel(FLAGS_infer_model);
  config.EnableUseGpu(512, 0, paddle_infer::PrecisionType::kHalf);
  config.SwitchIrOptim(false);

  auto predictor = CreatePaddlePredictor(config);

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  LoadInputData(&input_slots_all);

  std::ifstream fin(FLAGS_refer_result);
  std::string line;
  std::vector<float> ref;
  while (std::getline(fin, line)) {
    Split(line, ' ', &ref);
  }

  std::vector<PaddleTensor> outputs;
  for (size_t i = 0; i < input_slots_all.size(); i++) {
    outputs.clear();
    predictor->Run(input_slots_all[i], &outputs);

    auto output = outputs.front();
    size_t outputs_size = 1;
    for (auto dim : output.shape) {
      outputs_size *= dim;
    }
    float *result = reinterpret_cast<float *>(output.data.data());
    for (size_t j = 0; j < outputs_size; ++j) {
      EXPECT_NEAR(ref[i * outputs_size + j], result[j], 8e-3);
    }
  }
}

// Compare results
TEST(Ernie_gpu_fp16_with_ir, compare_results) {
  AnalysisConfig config;
  config.SetModel(FLAGS_infer_model);
  config.EnableUseGpu(512, 0, paddle_infer::PrecisionType::kHalf);
  config.SwitchIrOptim(true);
  // There is a problem with the model itself, which has nothing to do with
  // constant_folding_pass.
  config.pass_builder()->DeletePass("constant_folding_pass");

  auto predictor = CreatePaddlePredictor(config);

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  LoadInputData(&input_slots_all);

  std::ifstream fin(FLAGS_refer_result);
  std::string line;
  std::vector<float> ref;
  while (std::getline(fin, line)) {
    Split(line, ' ', &ref);
  }

  std::vector<PaddleTensor> outputs;
  for (size_t i = 0; i < input_slots_all.size(); i++) {
    outputs.clear();
    predictor->Run(input_slots_all[i], &outputs);

    auto output = outputs.front();
    size_t outputs_size = 1;
    for (auto dim : output.shape) {
      outputs_size *= dim;
    }
    float *result = reinterpret_cast<float *>(output.data.data());
    for (size_t j = 0; j < outputs_size; ++j) {
      EXPECT_NEAR(ref[i * outputs_size + j], result[j], 2e-2);
    }
  }
}

// Compare results
TEST(Ernie_gpu_bf16_no_ir, compare_results) {
  AnalysisConfig config;
  config.SetModel(FLAGS_infer_model);
  config.EnableUseGpu(512, 0, paddle_infer::PrecisionType::kBf16);
  config.SwitchIrOptim(false);

  auto predictor = CreatePaddlePredictor(config);

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  LoadInputData(&input_slots_all);

  std::ifstream fin(FLAGS_refer_result);
  std::string line;
  std::vector<float> ref;
  while (std::getline(fin, line)) {
    Split(line, ' ', &ref);
  }

  std::vector<PaddleTensor> outputs;
  for (size_t i = 0; i < input_slots_all.size(); i++) {
    outputs.clear();
    predictor->Run(input_slots_all[i], &outputs);

    auto output = outputs.front();
    size_t outputs_size = 1;
    for (auto dim : output.shape) {
      outputs_size *= dim;
    }
    float *result = reinterpret_cast<float *>(output.data.data());
    for (size_t j = 0; j < outputs_size; ++j) {
      EXPECT_NEAR(ref[i * outputs_size + j], result[j], 1e-2);
    }
  }
}

// Compare results
TEST(Ernie_gpu_bf16_with_ir, compare_results) {
  AnalysisConfig config;
  config.SetModel(FLAGS_infer_model);
  config.EnableUseGpu(512, 0, paddle_infer::PrecisionType::kBf16);
  config.SwitchIrOptim(true);
  // There is a problem with the model itself, which has nothing to do with
  // constant_folding_pass.
  config.pass_builder()->DeletePass("constant_folding_pass");

  auto predictor = CreatePaddlePredictor(config);

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  LoadInputData(&input_slots_all);

  std::ifstream fin(FLAGS_refer_result);
  std::string line;
  std::vector<float> ref;
  while (std::getline(fin, line)) {
    Split(line, ' ', &ref);
  }

  std::vector<PaddleTensor> outputs;
  for (size_t i = 0; i < input_slots_all.size(); i++) {
    outputs.clear();
    predictor->Run(input_slots_all[i], &outputs);

    auto output = outputs.front();
    size_t outputs_size = 1;
    for (auto dim : output.shape) {
      outputs_size *= dim;
    }
    float *result = reinterpret_cast<float *>(output.data.data());
    for (size_t j = 0; j < outputs_size; ++j) {
      EXPECT_NEAR(ref[i * outputs_size + j], result[j], 5e-3);
    }
  }
}

}  // namespace inference
}  // namespace paddle
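The helpers above consume a line-oriented text dataset: each line is one sample, fields within a line are separated by ';', and each field packs a shape and its flattened values around a ':'. A minimal, self-contained illustration of that format (the sample line is fabricated, not real Ernie data):

    #include <iostream>
    #include <sstream>
    #include <string>

    int main() {
      // One sample with two feeds: an int64 id tensor of shape [1 3] and a
      // float mask of shape [1 3], in the "<shape>:<flattened values>" format.
      std::string line = "1 3:101 2040 102;1 3:1.0 1.0 1.0";
      std::stringstream ss(line);
      std::string field;
      while (std::getline(ss, field, ';')) {
        auto colon = field.find(':');
        std::cout << "shape [" << field.substr(0, colon) << "] data ["
                  << field.substr(colon + 1) << "]\n";
      }
    }

Note also how the tolerances scale with the configuration: 8e-3 for fp16 without IR passes, 2e-2 with them, and 1e-2 / 5e-3 for the bf16 variants.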
paddle/fluid/inference/tests/api/paddle_infer_api_test.cc (+2 -6)

-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.

@@ -12,15 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <cuda_runtime.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include <cstring>
-#include <numeric>
-#include "gflags/gflags.h"
-#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
+#include "paddle/fluid/inference/tests/api/tester_helper.h"

 namespace paddle_infer {
paddle/fluid/operators/collective/barrier_op_mlu.cc (new file, mode 100644, +63 -0)

/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/collective/barrier_op.h"
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif

namespace paddle {
namespace operators {

template <typename T>
class BarrierOpMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_CNCL)
    auto in = ctx.Input<phi::DenseTensor>("X");
    auto out = ctx.Output<phi::DenseTensor>("Out");

    auto place = ctx.GetPlace();
    cnclDataType_t dtype =
        platform::ToCNCLDataType(framework::TransToProtoVarType(in->dtype()));
    int64_t numel = in->numel();
    const void* sendbuff = in->data();
    void* recvbuff = out->mutable_data<T>(place);

    int rid = ctx.Attr<int>("ring_id");
    auto cncl_comm = platform::CNCLCommContext::Instance().Get(rid, place);
    auto* comm = cncl_comm->comm();
    auto comm_stream = cncl_comm->stream();
    auto& dev_ctx =
        ctx.template device_context<paddle::platform::MLUDeviceContext>();
    cnclReduceOp_t cncl_red_type = cnclSum;
    dev_ctx.Wait();
    PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(
        sendbuff, recvbuff, numel, dtype, cncl_red_type, comm, comm_stream));
    PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(comm_stream));
#else
    PADDLE_THROW(platform::errors::Unavailable(
        "PaddlePaddle should compile with CNCL."));
#endif
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(barrier, ops::BarrierOpMLUKernel<int>);
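The kernel realizes the barrier as a one-element allreduce followed by a queue sync: no rank can proceed until every rank has contributed its element. The same trick is common in other collective stacks; an analogy-only sketch in standard MPI (assuming an MPI implementation, not CNCL):

    #include <mpi.h>

    // Barrier built from a single-element allreduce: the reduce cannot
    // complete until every rank in the communicator has participated.
    void AllreduceBarrier(MPI_Comm comm) {
      int dummy_in = 0, dummy_out = 0;
      MPI_Allreduce(&dummy_in, &dummy_out, 1, MPI_INT, MPI_SUM, comm);
    }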
paddle/fluid/operators/collective/c_allgather_op_mlu.cc (+46 -8)

@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/collective/c_allgather_op.h"
+#include "paddle/fluid/operators/mlu/mlu_baseop.h"

 #if defined(PADDLE_WITH_CNCL)
 #include "paddle/fluid/platform/collective_helper.h"

@@ -27,15 +28,14 @@ template <typename T>
 class CAllGatherOpMLUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    auto place = ctx.GetPlace();
+    auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
#if defined(PADDLE_WITH_CNCL)
-    auto x = ctx.Input<framework::Tensor>("X");
-    auto out = ctx.Output<framework::Tensor>("Out");
-    cnclDataType_t dtype =
-        platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype()));
+    auto x = ctx.Input<phi::DenseTensor>("X");
+    auto out = ctx.Output<phi::DenseTensor>("Out");

     int nranks = ctx.Attr<int>("nranks");
     int rid = ctx.Attr<int>("ring_id");
-    auto place = ctx.GetPlace();
     auto comm = platform::CNCLCommContext::Instance().Get(rid, place);
     PADDLE_ENFORCE_EQ(nranks,

@@ -48,19 +48,56 @@ class CAllGatherOpMLUKernel : public framework::OpKernel<T> {
     out->mutable_data<T>(out_dims, place);

     uint32_t send_numel = x->numel();
-    void* send_buff = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
-    void* recv_buff = reinterpret_cast<void*>(out->data<T>());
+    void* send_buff;
+    void* recv_buff;
+    phi::DenseTensor in_tensor, out_tensor;
+    if (framework::TransToProtoVarType(x->dtype()) ==
+        framework::proto::VarType::INT64) {
+      // cast from int64 to int32 since cncl do not support int64
+      in_tensor.mutable_data<int32_t>(x->dims(), place);
+      out_tensor.mutable_data<int32_t>(out->dims(), place);
+      MLUCnnlTensorDesc x_int64_desc(*x);
+      MLUCnnlTensorDesc x_int32_desc(in_tensor);
+      cnnlCastDataType_t cast_type = GetCastDataType(VT::INT64, VT::INT32);
+      MLUCnnl::Cast(ctx,
+                    cast_type,
+                    x_int64_desc.get(),
+                    GetBasePtr(x),
+                    x_int32_desc.get(),
+                    GetBasePtr(&in_tensor));
+      send_buff = reinterpret_cast<void*>(in_tensor.data<int32_t>());
+      recv_buff = reinterpret_cast<void*>(out_tensor.data<int32_t>());
+    } else {
+      in_tensor.ShareDataWith(*x);
+      out_tensor.ShareDataWith(*out);
+      send_buff = reinterpret_cast<void*>(in_tensor.data<T>());
+      recv_buff = reinterpret_cast<void*>(out_tensor.data<T>());
+    }

     mluStream stream = nullptr;
     if (ctx.Attr<bool>("use_calc_stream")) {
-      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
       stream = static_cast<platform::MLUDeviceContext*>(dev_ctx)->stream();
     } else {
       stream = comm->stream();
     }
+    cnclDataType_t dtype = platform::ToCNCLDataType(
+        framework::TransToProtoVarType(in_tensor.dtype()));

     PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(
         send_buff, recv_buff, send_numel, dtype, comm->comm(), stream));
+    if (framework::TransToProtoVarType(x->dtype()) ==
+        framework::proto::VarType::INT64) {
+      // cast back from int64 out_tensor to out
+      MLUCnnlTensorDesc out_int64_desc(*out);
+      MLUCnnlTensorDesc out_int32_desc(out_tensor);
+      cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
+      MLUCnnl::Cast(ctx,
+                    cast_type,
+                    out_int32_desc.get(),
+                    GetBasePtr(&out_tensor),
+                    out_int64_desc.get(),
+                    GetBasePtr(out));
+    }
#else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "PaddlePaddle should compile with MLU."));

@@ -80,4 +117,5 @@ REGISTER_OP_MLU_KERNEL(c_allgather,
                        ops::CAllGatherOpMLUKernel<int>,
                        ops::CAllGatherOpMLUKernel<int8_t>,
                        ops::CAllGatherOpMLUKernel<int16_t>,
+                       ops::CAllGatherOpMLUKernel<int64_t>,
                        ops::CAllGatherOpMLUKernel<plat::float16>);
paddle/fluid/operators/detection/CMakeLists.txt (+5 -2)

@@ -42,19 +42,23 @@ if(WITH_XPU)
   detection_library(iou_similarity_op SRCS iou_similarity_op.cc
                     iou_similarity_op_xpu.cc)
   detection_library(prior_box_op SRCS prior_box_op.cc)
   detection_library(yolo_box_op SRCS yolo_box_op.cc)
   detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc)
 elseif(WITH_MLU)
   detection_library(iou_similarity_op SRCS iou_similarity_op.cc
                     iou_similarity_op_mlu.cc)
-  detection_library(prior_box_op SRCS prior_box_op.cc)
+  detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_mlu.cc)
+  detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op_mlu.cc)
 elseif(WITH_ASCEND_CL)
   detection_library(iou_similarity_op SRCS iou_similarity_op.cc
                     iou_similarity_op_npu.cc)
   detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_npu.cc)
+  detection_library(yolo_box_op SRCS yolo_box_op.cc)
 else()
   detection_library(iou_similarity_op SRCS iou_similarity_op.cc
                     iou_similarity_op.cu)
   detection_library(prior_box_op SRCS prior_box_op.cc)
+  detection_library(yolo_box_op SRCS yolo_box_op.cc)
+  # detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc)
 endif()

@@ -73,7 +77,6 @@ detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS gpc)
 detection_library(matrix_nms_op SRCS matrix_nms_op.cc DEPS gpc)
 detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
 detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc)
-detection_library(yolo_box_op SRCS yolo_box_op.cc)
 detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc
                   box_decoder_and_assign_op.cu)
 detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc
paddle/fluid/operators/detection/prior_box_op_mlu.cc (new file, mode 100644, +104 -0)

/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/detection/prior_box_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"

namespace paddle {
namespace operators {

template <typename T>
class PriorBoxMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<phi::DenseTensor>("Input");
    auto* image = ctx.Input<phi::DenseTensor>("Image");
    auto* boxes = ctx.Output<phi::DenseTensor>("Boxes");
    auto* variances = ctx.Output<phi::DenseTensor>("Variances");

    float step_w = ctx.Attr<float>("step_w");
    float step_h = ctx.Attr<float>("step_h");
    float offset = ctx.Attr<float>("offset");
    bool clip = ctx.Attr<bool>("clip");
    bool min_max_aspect_ratios_order =
        ctx.Attr<bool>("min_max_aspect_ratios_order");

    int im_width = image->dims()[3];
    int im_height = image->dims()[2];
    int width = input->dims()[3];
    int height = input->dims()[2];

    auto aspect_ratios = ctx.Attr<std::vector<float>>("aspect_ratios");
    bool flip = ctx.Attr<bool>("flip");
    std::vector<float> new_aspect_ratios;
    ExpandAspectRatios(aspect_ratios, flip, &new_aspect_ratios);
    auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
    phi::DenseTensor ratios;
    paddle::framework::TensorFromVector(new_aspect_ratios, dev_ctx, &ratios);
    MLUOpTensorDesc new_aspect_ratios_desc(ratios);

    auto min_sizes = ctx.Attr<std::vector<float>>("min_sizes");
    phi::DenseTensor min;
    paddle::framework::TensorFromVector(min_sizes, dev_ctx, &min);
    MLUOpTensorDesc min_sizes_desc(min);

    auto max_sizes = ctx.Attr<std::vector<float>>("max_sizes");
    phi::DenseTensor max;
    paddle::framework::TensorFromVector(max_sizes, dev_ctx, &max);
    MLUOpTensorDesc max_sizes_desc(max);

    auto variances_attr = ctx.Attr<std::vector<float>>("variances");
    phi::DenseTensor var_tensor;
    paddle::framework::TensorFromVector(variances_attr, dev_ctx, &var_tensor);
    MLUOpTensorDesc variances_attr_desc(var_tensor);

    auto place = ctx.GetPlace();
    boxes->mutable_data<T>(place);
    variances->mutable_data<T>(place);

    MLUOpTensorDesc var_desc(*variances);
    MLUOpTensorDesc output_desc(*boxes);
    MLUOP::OpPriorBox(ctx,
                      min_sizes_desc.get(),
                      GetBasePtr(&min),
                      new_aspect_ratios_desc.get(),
                      GetBasePtr(&ratios),
                      variances_attr_desc.get(),
                      GetBasePtr(&var_tensor),
                      max_sizes_desc.get(),
                      GetBasePtr(&max),
                      height,
                      width,
                      im_height,
                      im_width,
                      step_h,
                      step_w,
                      offset,
                      clip,
                      min_max_aspect_ratios_order,
                      output_desc.get(),
                      GetBasePtr(boxes),
                      var_desc.get(),
                      GetBasePtr(variances));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(prior_box, ops::PriorBoxMLUKernel<float>);
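ExpandAspectRatios flattens the aspect_ratios attribute before it is shipped to the device. A sketch of roughly what it produces (an approximation based on the SSD prior-box convention; the real helper lives in prior_box_op.h and likewise seeds the list with ratio 1.0 and skips near-duplicates):

    #include <cmath>
    #include <vector>

    // Approximation of ExpandAspectRatios: keep ratio 1.0, then append each
    // ratio and, when flip is set, its reciprocal, skipping near-duplicates.
    std::vector<float> ExpandAspectRatiosSketch(const std::vector<float>& ratios,
                                                bool flip) {
      std::vector<float> out{1.0f};
      for (float r : ratios) {
        bool exists = false;
        for (float o : out) exists = exists || std::fabs(r - o) < 1e-6f;
        if (!exists) {
          out.push_back(r);
          if (flip) out.push_back(1.0f / r);
        }
      }
      return out;  // e.g. {1, 2} with flip -> {1, 2, 0.5}
    }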
paddle/fluid/operators/detection/yolo_box_op_mlu.cc (new file, mode 100644, +137 -0)

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"

namespace paddle {
namespace operators {

template <typename T>
class YoloBoxMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* img_size = ctx.Input<phi::DenseTensor>("ImgSize");
    auto* boxes = ctx.Output<phi::DenseTensor>("Boxes");
    auto* scores = ctx.Output<phi::DenseTensor>("Scores");
    const std::vector<int> anchors = ctx.Attr<std::vector<int>>("anchors");
    auto class_num = ctx.Attr<int>("class_num");
    auto conf_thresh = ctx.Attr<float>("conf_thresh");
    auto downsample_ratio = ctx.Attr<int>("downsample_ratio");
    auto clip_bbox = ctx.Attr<bool>("clip_bbox");
    auto scale = ctx.Attr<float>("scale_x_y");
    auto iou_aware = ctx.Attr<bool>("iou_aware");
    auto iou_aware_factor = ctx.Attr<float>("iou_aware_factor");

    int anchor_num = anchors.size() / 2;
    int64_t size = anchors.size();

    auto dim_x = x->dims();
    int n = dim_x[0];
    int s = anchor_num;
    int h = dim_x[2];
    int w = dim_x[3];

    // The output of mluOpYoloBox: A 4-D tensor with shape [N, anchor_num, 4,
    // H*W], the coordinates of boxes, and a 4-D tensor with shape [N,
    // anchor_num, :attr:`class_num`, H*W], the classification scores of boxes.
    std::vector<int64_t> boxes_dim_mluops({n, s, 4, h * w});
    std::vector<int64_t> scores_dim_mluops({n, s, class_num, h * w});

    // In Paddle framework: A 3-D tensor with shape [N, M, 4], the coordinates
    // of boxes, and a 3-D tensor with shape [N, M, :attr:`class_num`], the
    // classification scores of boxes.
    std::vector<int64_t> boxes_out_dim({n, s, h * w, 4});
    std::vector<int64_t> scores_out_dim({n, s, h * w, class_num});

    auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
    phi::DenseTensor boxes_tensor_mluops =
        ctx.AllocateTmpTensor<T, MLUDeviceContext>({n, s, 4, h * w}, dev_ctx);
    phi::DenseTensor scores_tensor_mluops =
        ctx.AllocateTmpTensor<T, MLUDeviceContext>({n, s, class_num, h * w},
                                                   dev_ctx);
    MLUOpTensorDesc boxes_trans_desc_mluops(
        4, boxes_dim_mluops.data(), ToMluOpDataType<T>());
    MLUCnnlTensorDesc boxes_trans_desc_cnnl(
        4, boxes_dim_mluops.data(), ToCnnlDataType<T>());
    MLUOpTensorDesc scores_trans_desc_mluops(
        4, scores_dim_mluops.data(), ToMluOpDataType<T>());
    MLUCnnlTensorDesc scores_trans_desc_cnnl(
        4, scores_dim_mluops.data(), ToCnnlDataType<T>());

    boxes->mutable_data<T>(ctx.GetPlace());
    scores->mutable_data<T>(ctx.GetPlace());
    FillMLUTensorWithHostValue(ctx, static_cast<T>(0), boxes);
    FillMLUTensorWithHostValue(ctx, static_cast<T>(0), scores);

    MLUOpTensorDesc x_desc(*x, MLUOP_LAYOUT_ARRAY, ToMluOpDataType<T>());
    MLUOpTensorDesc img_size_desc(
        *img_size, MLUOP_LAYOUT_ARRAY, ToMluOpDataType<int32_t>());
    Tensor anchors_temp(framework::TransToPhiDataType(VT::INT32));
    anchors_temp.Resize({size});
    paddle::framework::TensorFromVector(
        anchors, ctx.device_context(), &anchors_temp);
    MLUOpTensorDesc anchors_desc(anchors_temp);
    MLUCnnlTensorDesc boxes_desc_cnnl(
        4, boxes_out_dim.data(), ToCnnlDataType<T>());
    MLUCnnlTensorDesc scores_desc_cnnl(
        4, scores_out_dim.data(), ToCnnlDataType<T>());

    MLUOP::OpYoloBox(ctx,
                     x_desc.get(),
                     GetBasePtr(x),
                     img_size_desc.get(),
                     GetBasePtr(img_size),
                     anchors_desc.get(),
                     GetBasePtr(&anchors_temp),
                     class_num,
                     conf_thresh,
                     downsample_ratio,
                     clip_bbox,
                     scale,
                     iou_aware,
                     iou_aware_factor,
                     boxes_trans_desc_mluops.get(),
                     GetBasePtr(&boxes_tensor_mluops),
                     scores_trans_desc_mluops.get(),
                     GetBasePtr(&scores_tensor_mluops));

    const std::vector<int> perm = {0, 1, 3, 2};

    // transpose the boxes from [N, S, 4, H*W] to [N, S, H*W, 4]
    MLUCnnl::Transpose(ctx,
                       perm,
                       4,
                       boxes_trans_desc_cnnl.get(),
                       GetBasePtr(&boxes_tensor_mluops),
                       boxes_desc_cnnl.get(),
                       GetBasePtr(boxes));

    // transpose the scores from [N, S, class_num, H*W] to [N, S, H*W,
    // class_num]
    MLUCnnl::Transpose(ctx,
                       perm,
                       4,
                       scores_trans_desc_cnnl.get(),
                       GetBasePtr(&scores_tensor_mluops),
                       scores_desc_cnnl.get(),
                       GetBasePtr(scores));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(yolo_box, ops::YoloBoxMLUKernel<float>);
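The two transposes exist because mluOpYoloBox emits [N, S, 4, H*W] and [N, S, class_num, H*W] layouts while Paddle's yolo_box contract wants the box/score axis last. The index arithmetic behind perm = {0, 1, 3, 2}, as a small standalone sketch (hypothetical helper names, for illustration):

    #include <cstddef>

    // Element (n, s, c, hw) in the [N, S, C, HW] mluOps layout...
    std::size_t SrcIndex(std::size_t n, std::size_t s, std::size_t c,
                         std::size_t hw, std::size_t S, std::size_t C,
                         std::size_t HW) {
      return ((n * S + s) * C + c) * HW + hw;
    }

    // ...lands at (n, s, hw, c) in the [N, S, HW, C] layout Paddle expects.
    std::size_t DstIndex(std::size_t n, std::size_t s, std::size_t c,
                         std::size_t hw, std::size_t S, std::size_t C,
                         std::size_t HW) {
      return ((n * S + s) * HW + hw) * C + c;
    }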
paddle/fluid/operators/dropout_op_mlu.cc (+37 -34)

@@ -39,8 +39,17 @@ class DropoutMLUKernel : public framework::OpKernel<T> {
     MLUCnnlTensorDesc x_desc(*x);
     MLUCnnlTensorDesc out_desc(*out);

-    if (!is_test) {
-      // exec dropout op for training only.
+    if (is_test && is_upscale) {
+      // dropout op for inference: out = input.
+      framework::TensorCopy(
+          *x,
+          ctx.GetPlace(),
+          ctx.template device_context<platform::MLUDeviceContext>(),
+          out);
+      return;
+    } else if (!is_test) {
+      // dropout op for training: out = input * mask / ( 1.0 - dropout_prob )
+      // or out = input * mask.
       int seed_data = 0;
       if (seed_tensor) {
         if (platform::is_mlu_place(seed_tensor->place())) {

@@ -79,50 +88,44 @@ class DropoutMLUKernel : public framework::OpKernel<T> {
       const int device_id = ctx.GetPlace().GetDeviceId();
       auto mlu_gen_random = GetMLURandomGenerator(ctx, device_id, seed_data);

-      const float prob = is_upscale ? dropout_prob : 0.0f;
       // compute out = input * mask / ( 1.0 - dropout_prob )
       MLUCnnl::FusedDropout(ctx,
                             mlu_gen_random->get(),
                             x_desc.get(),
                             GetBasePtr(x),
-                            prob,
+                            dropout_prob,
                             GetBasePtr(&(mlu_gen_random->get_state())),
                             mask_desc.get(),
                             GetBasePtr(mask),
                             out_desc.get(),
                             GetBasePtr(out));
-    } else {
-      // exec dropout op for inference only.
-      if (is_upscale) {
-        framework::TensorCopy(
-            *x,
-            ctx.GetPlace(),
-            ctx.template device_context<platform::MLUDeviceContext>(),
-            out);
-      } else {
-        auto scale = static_cast<T>(1.0f - dropout_prob);
-        Tensor scale_tensor(x->dtype());
-        scale_tensor.mutable_data<T>({1}, ctx.GetPlace());
-        MLUCnnlTensorDesc scale_desc(scale_tensor);
-        MLUCnnl::Fill(ctx,
-                      CNNL_POINTER_MODE_HOST,
-                      &scale,
-                      scale_desc.get(),
-                      GetBasePtr(&scale_tensor));
-        auto data_type = ToCnnlDataType<T>();
-        MLUCnnlOpTensorDesc op_tensor_desc(
-            CNNL_OP_TENSOR_MUL, data_type, CNNL_NOT_PROPAGATE_NAN);
-        MLUCnnl::OpTensor(ctx,
-                          op_tensor_desc.get(),
-                          x_desc.get(),
-                          GetBasePtr(x),
-                          scale_desc.get(),
-                          GetBasePtr(&scale_tensor),
-                          out_desc.get(),
-                          GetBasePtr(out),
-                          data_type);
-      }
+      if (is_upscale) {
+        return;
+      }
     }

+    // In downgrade_in_infer mode, need to multiply (1.0f - dropout_prob).
+    Tensor scale_tensor(x->dtype());
+    Tensor bias_tensor(x->dtype());
+    scale_tensor.mutable_data<T>({1}, ctx.GetPlace());
+    bias_tensor.mutable_data<T>({1}, ctx.GetPlace());
+    MLUCnnlTensorDesc scale_desc(scale_tensor);
+    MLUCnnlTensorDesc bias_desc(bias_tensor);
+    FillMLUTensorWithHostValue(
+        ctx, static_cast<T>(1.0f - dropout_prob), &scale_tensor);
+    FillMLUTensorWithHostValue(ctx, static_cast<T>(0.0f), &bias_tensor);
+    MLUCnnl::Scale(ctx,
+                   0,
+                   is_test ? x_desc.get() : out_desc.get(),
+                   is_test ? GetBasePtr(x) : GetBasePtr(out),
+                   scale_desc.get(),
+                   GetBasePtr(&scale_tensor),
+                   bias_desc.get(),
+                   GetBasePtr(&bias_tensor),
+                   out_desc.get(),
+                   GetBasePtr(out));
   }
 };
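For reference, the two dropout conventions the rewritten kernel distinguishes, written out as equations (these restate the in-code comments above; p is dropout_prob and m the 0/1 mask):

\[
\text{upscale\_in\_train:}\quad y_{\text{train}} = \frac{x \odot m}{1 - p}, \qquad y_{\text{infer}} = x
\]
\[
\text{downgrade\_in\_infer:}\quad y_{\text{train}} = x \odot m, \qquad y_{\text{infer}} = (1 - p)\,x
\]

This is why the upscale inference path is now a plain TensorCopy, while the trailing Scale handles the (1 - p) multiplication for downgrade_in_infer.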
paddle/fluid/operators/fused/CMakeLists.txt (+1 -1)

@@ -67,7 +67,7 @@ if(WITH_GPU OR WITH_ROCM)
   op_library(skip_layernorm_op)
   op_library(yolo_box_head_op)
   op_library(yolo_box_post_op)
-  op_library(fused_embedding_eltwise_layernorm_op)
+  op_library(fused_embedding_eltwise_layernorm_op DEPS bert_encoder_functor)
   op_library(fused_gate_attention_op)
   # fusion_group
   if(NOT APPLE AND NOT WIN32)
paddle/fluid/operators/fused/cudnn_norm_conv.cu.h (+8 -0)

@@ -45,6 +45,14 @@ struct NormConvolutionArgs {
             int stride,
             int dilation,
             int group) {
+    PADDLE_ENFORCE_LT(
+        ctx.GetComputeCapability(),
+        90,
+        phi::errors::PreconditionNotMet(
+            "Expect compute compatiblity to be less than 90, but got %d. "
+            "CUDNN FusedOps is no longer available on H100 and later "
+            "devices.",
+            ctx.GetComputeCapability()));
     PADDLE_ENFORCE_EQ(input_shape.size(),
                       4U,
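A sketch of how such a compute-capability gate can be derived from the CUDA runtime (SM 9.0 is Hopper/H100; this assumes, matching the check above, that phi's GetComputeCapability() reports major * 10 + minor -- the Paddle call itself remains the source of truth):

    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
      cudaDeviceProp prop;
      cudaGetDeviceProperties(&prop, /*device=*/0);
      // Same encoding as the PADDLE_ENFORCE_LT(..., 90, ...) guard above.
      int capability = prop.major * 10 + prop.minor;
      if (capability >= 90) {
        std::printf("cuDNN FusedOps path unavailable on SM %d\n", capability);
      }
      return 0;
    }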
paddle/fluid/operators/fused/cudnn_norm_conv_test.cc (+4 -4)

@@ -442,7 +442,7 @@ TEST(CudnnNormConvFp16, K1S1) {
   phi::GPUContext *ctx = static_cast<phi::GPUContext *>(
       platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));

-  if (ctx->GetComputeCapability() < 70) {
+  if (ctx->GetComputeCapability() < 70 || ctx->GetComputeCapability() >= 90) {
     ASSERT_THROW(test.CheckForward(1e-3, true),
                  paddle::platform::EnforceNotMet);
     ASSERT_THROW(test.CheckBackward(1e-3, true),

@@ -472,7 +472,7 @@ TEST(CudnnNormConvFp16, K3S1) {
   phi::GPUContext *ctx = static_cast<phi::GPUContext *>(
       platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));

-  if (ctx->GetComputeCapability() < 70) {
+  if (ctx->GetComputeCapability() < 70 || ctx->GetComputeCapability() >= 90) {
     ASSERT_THROW(test.CheckForward(1e-3, true),
                  paddle::platform::EnforceNotMet);
     ASSERT_THROW(test.CheckBackward(1e-3, true),

@@ -502,7 +502,7 @@ TEST(CudnnNormConvFp16, K1S1O4) {
   phi::GPUContext *ctx = static_cast<phi::GPUContext *>(
       platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));

-  if (ctx->GetComputeCapability() < 70) {
+  if (ctx->GetComputeCapability() < 70 || ctx->GetComputeCapability() >= 90) {
     ASSERT_THROW(test.CheckForward(1e-3, true),
                  paddle::platform::EnforceNotMet);
     ASSERT_THROW(test.CheckBackward(1e-3, true),

@@ -532,7 +532,7 @@ TEST(CudnnNormConvFp16, K1S2O4) {
   phi::GPUContext *ctx = static_cast<phi::GPUContext *>(
       platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));

-  if (ctx->GetComputeCapability() <= 70) {
+  if (ctx->GetComputeCapability() <= 70 ||
+      ctx->GetComputeCapability() >= 90) {
     ASSERT_THROW(test.CheckForward(1e-3, true),
                  paddle::platform::EnforceNotMet);
     ASSERT_THROW(test.CheckBackward(1e-3), paddle::platform::EnforceNotMet);
paddle/fluid/operators/fused/fused_dropout_act_bias.h (+13 -11)

@@ -256,17 +256,19 @@ template <typename T,
           int BlockSizeX,
           int BlockSizeY,
           int VecSize,
-          typename Functor>
-__global__ void FusedDropoutActBiasGrad(Functor act_grad,
-                                        const T *dout,
-                                        const MaskType *mask,
-                                        const T *src,
-                                        const T *bias,
-                                        const T factor,
-                                        const int64_t rows,
-                                        const int64_t cols,
-                                        T *dx,
-                                        T *dbias) {
+          typename Functor,
+          int THREADS_PER_CTA = BlockSizeX * BlockSizeY>
+__global__ __launch_bounds__(THREADS_PER_CTA) void FusedDropoutActBiasGrad(
+    Functor act_grad,
+    const T *dout,
+    const MaskType *mask,
+    const T *src,
+    const T *bias,
+    const T factor,
+    const int64_t rows,
+    const int64_t cols,
+    T *dx,
+    T *dbias) {
   int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x;

   using LoadT = phi::AlignedVector<T, VecSize>;
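__launch_bounds__ promises the compiler an upper bound on the launch block size, so it can budget registers for that bound instead of the architectural maximum. A minimal standalone sketch of the idiom adopted above (hypothetical kernel; 32 x 8 = 256 threads stands in for the real kernel's BlockSizeX * BlockSizeY template parameters):

    #include <cuda_runtime.h>

    template <int THREADS_PER_CTA = 32 * 8>
    __global__ __launch_bounds__(THREADS_PER_CTA) void scale_kernel(
        float *data, float factor, int n) {
      // The bound is a compile-time constant, so it may be a template
      // parameter -- exactly the pattern the patched kernel uses.
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) data[i] *= factor;
    }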
paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu (+6 -15)

@@ -223,13 +223,7 @@ __global__ void InplaceAddReluAddLayerNormKernel(const float16* y_data,
       // For layer_norm, reduce to calculate mean and std
       sum_i += static_cast<float>(tmp_3);
-#if defined(PADDLE_WITH_CUDA) && __CUDA_ARCH__ >= 530
-      square_sum_i += static_cast<float>(__hmul(tmp_3, tmp_3));
-#elif defined(PADDLE_WITH_CUDA)
       square_sum_i += static_cast<float>(tmp_3) * static_cast<float>(tmp_3);
-#else
-      square_sum_i += static_cast<float>(tmp_3 * tmp_3);
-#endif
     }
     auto pair =
         BlockReduce(temp_storage)
             .Reduce(PairForLayerNorm<float>(sum_i, square_sum_i),

@@ -282,9 +276,9 @@ __global__ void InplaceAddReluAddLayerNormKernel(const float16* y_data,
       half tmp_0 = __hdiv(__hsub(save_ptr[save_index], mean_i), std_i);
       half tmp_1 = scale ? __hmul(scale[j], tmp_0) : tmp_0;
#else
-      half tmp_0 = static_cast<float>(static_cast<float>(save_ptr[save_index]) +
-                                      static_cast<float>(mean_i) /
-                                      static_cast<float>(std_i));
+      half tmp_0 =
+          static_cast<half>((static_cast<float>(save_ptr[save_index]) -
+                             static_cast<float>(mean_i)) /
+                            static_cast<float>(std_i));
       half tmp_1 = scale ? static_cast<half>(static_cast<float>(scale[j]) *
                                              static_cast<float>(tmp_0))
                          : tmp_0;

@@ -400,19 +394,16 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
     auto* out_data =
         dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));

     auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx);
-    blas.GEMM(false,
-              false,
+    blas.GEMM(CblasNoTrans,
+              CblasNoTrans,
               M,
               N,
               K,
               static_cast<T>(1.0),
               x_data,
-              K,
               w_data,
-              N,
               static_cast<T>(0.0),
-              out_data,
-              N);
+              out_data);
     auto* y = ctx.Input<framework::Tensor>("Y");
     auto* bias_0 = ctx.Input<framework::Tensor>("Bias0");
     auto* bias_1 = ctx.Input<framework::Tensor>("Bias1");
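The GEMM hunk moves from boolean transpose flags plus explicit leading dimensions to CBLAS-style transpose tags, letting the BLAS wrapper derive the leading dimensions itself. For reference, the equivalent call spelled out in plain CBLAS (assuming any CBLAS implementation is linked; for row-major C[MxN] = X[MxK] * W[KxN] the leading dimensions are K, N and N):

    #include <cblas.h>

    // Row-major fully-connected forward: out = 1.0 * x * w + 0.0 * out.
    void fc_forward(const float *x, const float *w, float *out,
                    int M, int N, int K) {
      cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                  M, N, K,
                  1.0f, x, /*lda=*/K, w, /*ldb=*/N,
                  0.0f, out, /*ldc=*/N);
    }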
paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc (+2 -3)

@@ -139,9 +139,8 @@ class FusedGemmEpilogueOp : public framework::OperatorWithKernel {
     }
     ctx->SetOutputDim("Out", phi::make_ddim(out_dims));

     // Note (Ming Huang): Reserve space of relu is a bit-mask,
     // which cannot pass nan_and_inf checking if shape is set.
-    if (activation == "gelu" && ctx->HasOutput("ReserveSpace")) {
+    if (ctx->HasOutput("ReserveSpace")) {
       ctx->SetOutputDim("ReserveSpace", phi::make_ddim(out_dims));
     }