Commit dbe08e9b authored by yuguo960516yuguo

2.4.2

parent b5499578
@@ -115,7 +115,6 @@ class PD_INFER_DECL PaddlePassBuilder {
  /// \cond Protected
  std::vector<std::string> analysis_passes_{
      {"ir_graph_build_pass",
-      "ir_graph_clean_pass",
       "ir_analysis_pass",
       "ir_params_sync_among_devices_pass",
       "adjust_cudnn_workspace_size_pass",
......
@@ -294,15 +294,6 @@ class TensorRTEngine {
  nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
  nvinfer1::IExecutionContext* context() {
-#ifndef PADDLE_WITH_TESTING
-    PADDLE_ENFORCE_GT(
-        predictor_id_per_thread,
-        -1,
-        platform::errors::InvalidArgument(
-            "thread local var predictor_id_per_thread must be "
-            "initialized to >= 0, but now predictor_id_per_thread = %d",
-            predictor_id_per_thread));
-#endif
    std::unique_lock<std::mutex> lock(mutex_);
    if (infer_context_.find(predictor_id_per_thread) == infer_context_.end()) {
      PADDLE_ENFORCE_NOT_NULL(
@@ -329,15 +320,6 @@ class TensorRTEngine {
  int GetProfileIndex() {
    if (max_profile_num_ > 1) {
-#ifndef PADDLE_WITH_TESTING
-      PADDLE_ENFORCE_GT(
-          predictor_id_per_thread,
-          -1,
-          platform::errors::InvalidArgument(
-              "thread local var predictor_id_per_thread must be "
-              "initialized to >= 0, but now predictor_id_per_thread = %d",
-              predictor_id_per_thread));
-#endif
      std::unique_lock<std::mutex> lock(mutex_);
      return profile_index_[predictor_id_per_thread];
    } else {
@@ -356,15 +338,6 @@ class TensorRTEngine {
        infer_engine_,
        platform::errors::InvalidArgument(
            "You should build engine first and then set the context."));
-#ifndef PADDLE_WITH_TESTING
-    PADDLE_ENFORCE_GT(
-        predictor_id_per_thread,
-        -1,
-        platform::errors::InvalidArgument(
-            "thread local var predictor_id_per_thread must be "
-            "initialized to >= 0, but now predictor_id_per_thread = %d",
-            predictor_id_per_thread));
-#endif
    std::unique_lock<std::mutex> lock(mutex_);
    infer_context_[predictor_id_per_thread].reset(nullptr);
    infer_context_.erase(predictor_id_per_thread);
......
@@ -639,8 +639,12 @@ struct SimpleOpTypeSetTeller : public Teller {
      int axis = desc.HasAttr("axis")
                     ? PADDLE_GET_CONST(int64_t, desc.GetAttr("axis"))
                     : -1;
-      bool flatten = PADDLE_GET_CONST(bool, desc.GetAttr("flatten"));
-      int dtype = PADDLE_GET_CONST(int, desc.GetAttr("dtype"));
+      bool flatten = desc.HasAttr("flatten")
+                         ? PADDLE_GET_CONST(bool, desc.GetAttr("flatten"))
+                         : false;
+      int dtype = desc.HasAttr("dtype")
+                      ? PADDLE_GET_CONST(int, desc.GetAttr("dtype"))
+                      : 3;
      if (axis == 0 || flatten || dtype != 2) return false;
    }
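A note on the magic numbers above: the dtype attribute is encoded with framework::proto::VarType, where, to the best of my recollection, 2 denotes INT32 and 3 denotes INT64. Under that reading, the teller only offers arg_max/arg_min to TensorRT when the output dtype is INT32, and a missing dtype attribute now defaults to INT64 and is therefore rejected.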
@@ -1708,8 +1712,10 @@ struct SimpleOpTypeSetTeller : public Teller {
        return false;
      }
    } else {
-#if !IS_TRT_VERSION_GE(8000)
-      VLOG(3) << "The version of TRT must be greater than 8000";
+#if (IS_TRT_VERSION_GE(8000) && IS_TRT_VERSION_LT(8100)) || \
+    (IS_TRT_VERSION_LT(7200))
+      VLOG(3) << "There are some bugs in v8.0.* and the versions lower than "
+                 "v7.2 are not supported";
      return false;
#endif
    }
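As a reading aid for the thresholds above: IS_TRT_VERSION_GE / IS_TRT_VERSION_LT compare an integer derived from the TensorRT version macros, so 8000, 8100 and 7200 stand for TensorRT 8.0, 8.1 and 7.2, and the new condition skips TensorRT 8.0.x as well as anything older than 7.2. A rough sketch of how such guards are typically defined (recalled from Paddle's TensorRT helper header; the exact form may differ):

// Hypothetical reconstruction of the version-guard macros.
#define TRT_VERSION                                     \
  (NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \
   NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD)
#define IS_TRT_VERSION_GE(version) ((TRT_VERSION) >= (version))
#define IS_TRT_VERSION_LT(version) ((TRT_VERSION) < (version))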
......
@@ -104,6 +104,7 @@ bool PluginArgumentMappingContext::IsSelectedRowsInput(
    const std::string& name) const {
  return false;
}
bool PluginArgumentMappingContext::IsSparseCooTensorInput(
    const std::string& name) const {
  return false;
@@ -112,6 +113,11 @@ bool PluginArgumentMappingContext::IsSparseCsrTensorInput(
    const std::string& name) const {
  return false;
}
+bool PluginArgumentMappingContext::IsSelectedRowsInputs(
+    const std::string& name) const {
+  return false;
+}
bool PluginArgumentMappingContext::IsDenseTensorVectorInput(
    const std::string& name) const {
  return false;
......
@@ -50,6 +50,8 @@ class PluginArgumentMappingContext : public ::phi::ArgumentMappingContext {
  bool IsSparseCsrTensorInput(const std::string& name) const override;
+  bool IsSelectedRowsInputs(const std::string& name) const override;
  bool IsDenseTensorVectorInput(const std::string& name) const override;
  bool IsDenseTensorOutput(const std::string& name) const override;
......
@@ -416,6 +416,9 @@ download_result(${ERNIE_INSTALL_DIR} "Ernie_result.txt.tar.gz"
if(WITH_GPU)
  inference_analysis_api_test(test_analyzer_ernie ${ERNIE_INSTALL_DIR}
                              analyzer_ernie_tester.cc)
+  inference_analysis_api_test(gpu_ernie_half_test ${ERNIE_INSTALL_DIR}
+                              gpu_ernie_half_test.cc)
+  set_tests_properties(gpu_ernie_half_test PROPERTIES TIMEOUT 60)
endif()
inference_analysis_api_int8_test(test_analyzer_ernie_int8 ${ERNIE_INSTALL_DIR}
                                 analyzer_ernie_int8_tester.cc)
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
using paddle::PaddleTensor;
template <typename T>
void GetValueFromStream(std::stringstream *ss, T *t) {
(*ss) >> (*t);
}
template <>
void GetValueFromStream<std::string>(std::stringstream *ss, std::string *t) {
*t = ss->str();
}
// Split string to vector
template <typename T>
void Split(const std::string &line, char sep, std::vector<T> *v) {
std::stringstream ss;
T t;
for (auto c : line) {
if (c != sep) {
ss << c;
} else {
GetValueFromStream<T>(&ss, &t);
v->push_back(std::move(t));
ss.str({});
ss.clear();
}
}
if (!ss.str().empty()) {
GetValueFromStream<T>(&ss, &t);
v->push_back(std::move(t));
ss.str({});
ss.clear();
}
}
// Parse tensor from string
template <typename T>
bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) {
std::vector<std::string> data;
Split(field, ':', &data);
if (data.size() < 2) return false;
std::string shape_str = data[0];
std::vector<int> shape;
Split(shape_str, ' ', &shape);
std::string mat_str = data[1];
std::vector<T> mat;
Split(mat_str, ' ', &mat);
tensor->shape = shape;
auto size =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) *
sizeof(T);
tensor->data.Resize(size);
std::copy(mat.begin(), mat.end(), static_cast<T *>(tensor->data.data()));
tensor->dtype = GetPaddleDType<T>();
return true;
}
// Parse input tensors from string
bool ParseLine(const std::string &line,
std::vector<paddle::PaddleTensor> *tensors) {
std::vector<std::string> fields;
Split(line, ';', &fields);
tensors->clear();
tensors->reserve(4);
int i = 0;
auto input_name = FLAGS_ernie_large ? "eval_placeholder_" : "placeholder_";
for (; i < 3; i++) {
paddle::PaddleTensor temp;
ParseTensor<int64_t>(fields[i], &temp);
temp.name = input_name + std::to_string(i);
tensors->push_back(temp);
}
// input_mask
paddle::PaddleTensor input_mask;
ParseTensor<float>(fields[i], &input_mask);
input_mask.name = input_name + std::to_string(i);
tensors->push_back(input_mask);
return true;
}
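For reference, the parsing above implies that each input line holds one sample as ';'-separated fields, where every field is a space-separated shape, a ':', and the space-separated flattened values; the first three fields become int64 id tensors and the fourth the float input_mask. A hypothetical toy line (made-up values, real samples are much longer) would look like:

1 3:101 2040 102;1 3:0 0 0;1 3:0 1 2;1 3 1:1.0 1.0 1.0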
bool LoadInputData(std::vector<std::vector<paddle::PaddleTensor>> *inputs,
int batch_size = 1) {
if (FLAGS_infer_data.empty()) {
LOG(ERROR) << "please set input data path";
return false;
}
std::ifstream fin(FLAGS_infer_data);
std::string line;
int sample = 0;
  // The unit-test dataset has only 10 samples; each sample has 5 feeds.
while (std::getline(fin, line)) {
std::vector<paddle::PaddleTensor> feed_data;
ParseLine(line, &feed_data);
inputs->push_back(std::move(feed_data));
sample++;
if (!FLAGS_test_all_data && sample == batch_size) break;
}
LOG(INFO) << "number of samples: " << sample;
return true;
}
// Compare results
TEST(Ernie_gpu_fp16_no_ir, compare_results) {
AnalysisConfig config;
config.SetModel(FLAGS_infer_model);
config.EnableUseGpu(512, 0, paddle_infer::PrecisionType::kHalf);
config.SwitchIrOptim(false);
auto predictor = CreatePaddlePredictor(config);
std::vector<std::vector<PaddleTensor>> input_slots_all;
LoadInputData(&input_slots_all);
std::ifstream fin(FLAGS_refer_result);
std::string line;
std::vector<float> ref;
while (std::getline(fin, line)) {
Split(line, ' ', &ref);
}
std::vector<PaddleTensor> outputs;
for (size_t i = 0; i < input_slots_all.size(); i++) {
outputs.clear();
predictor->Run(input_slots_all[i], &outputs);
auto output = outputs.front();
size_t outputs_size = 1;
for (auto dim : output.shape) {
outputs_size *= dim;
}
float *result = reinterpret_cast<float *>(output.data.data());
for (size_t j = 0; j < outputs_size; ++j) {
EXPECT_NEAR(ref[i * outputs_size + j], result[j], 8e-3);
}
}
}
// Compare results
TEST(Ernie_gpu_fp16_with_ir, compare_results) {
AnalysisConfig config;
config.SetModel(FLAGS_infer_model);
config.EnableUseGpu(512, 0, paddle_infer::PrecisionType::kHalf);
config.SwitchIrOptim(true);
// There is a problem with the model itself, which has nothing to do with
// constant_folding_pass.
config.pass_builder()->DeletePass("constant_folding_pass");
auto predictor = CreatePaddlePredictor(config);
std::vector<std::vector<PaddleTensor>> input_slots_all;
LoadInputData(&input_slots_all);
std::ifstream fin(FLAGS_refer_result);
std::string line;
std::vector<float> ref;
while (std::getline(fin, line)) {
Split(line, ' ', &ref);
}
std::vector<PaddleTensor> outputs;
for (size_t i = 0; i < input_slots_all.size(); i++) {
outputs.clear();
predictor->Run(input_slots_all[i], &outputs);
auto output = outputs.front();
size_t outputs_size = 1;
for (auto dim : output.shape) {
outputs_size *= dim;
}
float *result = reinterpret_cast<float *>(output.data.data());
for (size_t j = 0; j < outputs_size; ++j) {
EXPECT_NEAR(ref[i * outputs_size + j], result[j], 2e-2);
}
}
}
// Compare results
TEST(Ernie_gpu_bf16_no_ir, compare_results) {
AnalysisConfig config;
config.SetModel(FLAGS_infer_model);
config.EnableUseGpu(512, 0, paddle_infer::PrecisionType::kBf16);
config.SwitchIrOptim(false);
auto predictor = CreatePaddlePredictor(config);
std::vector<std::vector<PaddleTensor>> input_slots_all;
LoadInputData(&input_slots_all);
std::ifstream fin(FLAGS_refer_result);
std::string line;
std::vector<float> ref;
while (std::getline(fin, line)) {
Split(line, ' ', &ref);
}
std::vector<PaddleTensor> outputs;
for (size_t i = 0; i < input_slots_all.size(); i++) {
outputs.clear();
predictor->Run(input_slots_all[i], &outputs);
auto output = outputs.front();
size_t outputs_size = 1;
for (auto dim : output.shape) {
outputs_size *= dim;
}
float *result = reinterpret_cast<float *>(output.data.data());
for (size_t j = 0; j < outputs_size; ++j) {
EXPECT_NEAR(ref[i * outputs_size + j], result[j], 1e-2);
}
}
}
// Compare results
TEST(Ernie_gpu_bf16_with_ir, compare_results) {
AnalysisConfig config;
config.SetModel(FLAGS_infer_model);
config.EnableUseGpu(512, 0, paddle_infer::PrecisionType::kBf16);
config.SwitchIrOptim(true);
// There is a problem with the model itself, which has nothing to do with
// constant_folding_pass.
config.pass_builder()->DeletePass("constant_folding_pass");
auto predictor = CreatePaddlePredictor(config);
std::vector<std::vector<PaddleTensor>> input_slots_all;
LoadInputData(&input_slots_all);
std::ifstream fin(FLAGS_refer_result);
std::string line;
std::vector<float> ref;
while (std::getline(fin, line)) {
Split(line, ' ', &ref);
}
std::vector<PaddleTensor> outputs;
for (size_t i = 0; i < input_slots_all.size(); i++) {
outputs.clear();
predictor->Run(input_slots_all[i], &outputs);
auto output = outputs.front();
size_t outputs_size = 1;
for (auto dim : output.shape) {
outputs_size *= dim;
}
float *result = reinterpret_cast<float *>(output.data.data());
for (size_t j = 0; j < outputs_size; ++j) {
EXPECT_NEAR(ref[i * outputs_size + j], result[j], 5e-3);
}
}
}
} // namespace inference
} // namespace paddle
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -12,15 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include <cuda_runtime.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
-#include <cstring>
-#include <numeric>
#include "gflags/gflags.h"
-#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle_infer {
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/barrier_op.h"
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename T>
class BarrierOpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_CNCL)
auto in = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
cnclDataType_t dtype =
platform::ToCNCLDataType(framework::TransToProtoVarType(in->dtype()));
int64_t numel = in->numel();
const void* sendbuff = in->data();
void* recvbuff = out->mutable_data<T>(place);
int rid = ctx.Attr<int>("ring_id");
auto cncl_comm = platform::CNCLCommContext::Instance().Get(rid, place);
auto* comm = cncl_comm->comm();
auto comm_stream = cncl_comm->stream();
auto& dev_ctx =
ctx.template device_context<paddle::platform::MLUDeviceContext>();
cnclReduceOp_t cncl_red_type = cnclSum;
dev_ctx.Wait();
PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(
sendbuff, recvbuff, numel, dtype, cncl_red_type, comm, comm_stream));
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(comm_stream));
#else
PADDLE_THROW(platform::errors::Unavailable(
"PaddlePaddle should compile with CNCL."));
#endif
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(barrier, ops::BarrierOpMLUKernel<int>);
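For orientation: the MLU barrier above is implemented as a small cnclAllReduce on the input tensor followed by cnrtQueueSync on the communicator stream, so each rank blocks until every peer in the ring has entered the op; no separate barrier primitive is used.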
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allgather_op.h"
+#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/collective_helper.h"
@@ -27,15 +28,14 @@ template <typename T>
class CAllGatherOpMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto place = ctx.GetPlace();
-    auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
#if defined(PADDLE_WITH_CNCL)
-    auto x = ctx.Input<framework::Tensor>("X");
-    auto out = ctx.Output<framework::Tensor>("Out");
-    cnclDataType_t dtype =
-        platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype()));
+    auto x = ctx.Input<phi::DenseTensor>("X");
+    auto out = ctx.Output<phi::DenseTensor>("Out");
    int nranks = ctx.Attr<int>("nranks");
    int rid = ctx.Attr<int>("ring_id");
+    auto place = ctx.GetPlace();
    auto comm = platform::CNCLCommContext::Instance().Get(rid, place);
    PADDLE_ENFORCE_EQ(
        nranks,
@@ -48,19 +48,56 @@ class CAllGatherOpMLUKernel : public framework::OpKernel<T> {
    out->mutable_data<T>(out_dims, place);
    uint32_t send_numel = x->numel();
-    void* send_buff = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
-    void* recv_buff = reinterpret_cast<void*>(out->data<T>());
+    void* send_buff;
+    void* recv_buff;
+    phi::DenseTensor in_tensor, out_tensor;
+    if (framework::TransToProtoVarType(x->dtype()) ==
+        framework::proto::VarType::INT64) {
+      // cast from int64 to int32 since cncl do not support int64
+      in_tensor.mutable_data<int32_t>(x->dims(), place);
+      out_tensor.mutable_data<int32_t>(out->dims(), place);
+      MLUCnnlTensorDesc x_int64_desc(*x);
+      MLUCnnlTensorDesc x_int32_desc(in_tensor);
+      cnnlCastDataType_t cast_type = GetCastDataType(VT::INT64, VT::INT32);
+      MLUCnnl::Cast(ctx,
+                    cast_type,
+                    x_int64_desc.get(),
+                    GetBasePtr(x),
+                    x_int32_desc.get(),
+                    GetBasePtr(&in_tensor));
+      send_buff = reinterpret_cast<void*>(in_tensor.data<int32_t>());
+      recv_buff = reinterpret_cast<void*>(out_tensor.data<int32_t>());
+    } else {
+      in_tensor.ShareDataWith(*x);
+      out_tensor.ShareDataWith(*out);
+      send_buff = reinterpret_cast<void*>(in_tensor.data<T>());
+      recv_buff = reinterpret_cast<void*>(out_tensor.data<T>());
+    }
    mluStream stream = nullptr;
    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
      stream = static_cast<platform::MLUDeviceContext*>(dev_ctx)->stream();
    } else {
      stream = comm->stream();
    }
+    cnclDataType_t dtype = platform::ToCNCLDataType(
+        framework::TransToProtoVarType(in_tensor.dtype()));
    PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(
        send_buff, recv_buff, send_numel, dtype, comm->comm(), stream));
+    if (framework::TransToProtoVarType(x->dtype()) ==
+        framework::proto::VarType::INT64) {
+      // cast back from int64 out_tensor to out
+      MLUCnnlTensorDesc out_int64_desc(*out);
+      MLUCnnlTensorDesc out_int32_desc(out_tensor);
+      cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
+      MLUCnnl::Cast(ctx,
+                    cast_type,
+                    out_int32_desc.get(),
+                    GetBasePtr(&out_tensor),
+                    out_int64_desc.get(),
+                    GetBasePtr(out));
+    }
#else
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddlePaddle should compile with MLU."));
@@ -80,4 +117,5 @@ REGISTER_OP_MLU_KERNEL(c_allgather,
                       ops::CAllGatherOpMLUKernel<int>,
                       ops::CAllGatherOpMLUKernel<int8_t>,
                       ops::CAllGatherOpMLUKernel<int16_t>,
+                       ops::CAllGatherOpMLUKernel<int64_t>,
                       ops::CAllGatherOpMLUKernel<plat::float16>);
@@ -42,19 +42,23 @@ if(WITH_XPU)
  detection_library(iou_similarity_op SRCS iou_similarity_op.cc
                    iou_similarity_op_xpu.cc)
  detection_library(prior_box_op SRCS prior_box_op.cc)
+  detection_library(yolo_box_op SRCS yolo_box_op.cc)
  detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc)
elseif(WITH_MLU)
  detection_library(iou_similarity_op SRCS iou_similarity_op.cc
                    iou_similarity_op_mlu.cc)
-  detection_library(prior_box_op SRCS prior_box_op.cc)
+  detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_mlu.cc)
+  detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op_mlu.cc)
elseif(WITH_ASCEND_CL)
  detection_library(iou_similarity_op SRCS iou_similarity_op.cc
                    iou_similarity_op_npu.cc)
  detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_npu.cc)
+  detection_library(yolo_box_op SRCS yolo_box_op.cc)
else()
  detection_library(iou_similarity_op SRCS iou_similarity_op.cc
                    iou_similarity_op.cu)
  detection_library(prior_box_op SRCS prior_box_op.cc)
+  detection_library(yolo_box_op SRCS yolo_box_op.cc)
  # detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc)
endif()
@@ -73,7 +77,6 @@ detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS gpc)
detection_library(matrix_nms_op SRCS matrix_nms_op.cc DEPS gpc)
detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc)
-detection_library(yolo_box_op SRCS yolo_box_op.cc)
detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc
                  box_decoder_and_assign_op.cu)
detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/detection/prior_box_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class PriorBoxMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("Input");
auto* image = ctx.Input<phi::DenseTensor>("Image");
auto* boxes = ctx.Output<phi::DenseTensor>("Boxes");
auto* variances = ctx.Output<phi::DenseTensor>("Variances");
float step_w = ctx.Attr<float>("step_w");
float step_h = ctx.Attr<float>("step_h");
float offset = ctx.Attr<float>("offset");
bool clip = ctx.Attr<bool>("clip");
bool min_max_aspect_ratios_order =
ctx.Attr<bool>("min_max_aspect_ratios_order");
int im_width = image->dims()[3];
int im_height = image->dims()[2];
int width = input->dims()[3];
int height = input->dims()[2];
auto aspect_ratios = ctx.Attr<std::vector<float>>("aspect_ratios");
bool flip = ctx.Attr<bool>("flip");
std::vector<float> new_aspect_ratios;
ExpandAspectRatios(aspect_ratios, flip, &new_aspect_ratios);
auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
phi::DenseTensor ratios;
paddle::framework::TensorFromVector(new_aspect_ratios, dev_ctx, &ratios);
MLUOpTensorDesc new_aspect_ratios_desc(ratios);
auto min_sizes = ctx.Attr<std::vector<float>>("min_sizes");
phi::DenseTensor min;
paddle::framework::TensorFromVector(min_sizes, dev_ctx, &min);
MLUOpTensorDesc min_sizes_desc(min);
auto max_sizes = ctx.Attr<std::vector<float>>("max_sizes");
phi::DenseTensor max;
paddle::framework::TensorFromVector(max_sizes, dev_ctx, &max);
MLUOpTensorDesc max_sizes_desc(max);
auto variances_attr = ctx.Attr<std::vector<float>>("variances");
phi::DenseTensor var_tensor;
paddle::framework::TensorFromVector(variances_attr, dev_ctx, &var_tensor);
MLUOpTensorDesc variances_attr_desc(var_tensor);
auto place = ctx.GetPlace();
boxes->mutable_data<T>(place);
variances->mutable_data<T>(place);
MLUOpTensorDesc var_desc(*variances);
MLUOpTensorDesc output_desc(*boxes);
MLUOP::OpPriorBox(ctx,
min_sizes_desc.get(),
GetBasePtr(&min),
new_aspect_ratios_desc.get(),
GetBasePtr(&ratios),
variances_attr_desc.get(),
GetBasePtr(&var_tensor),
max_sizes_desc.get(),
GetBasePtr(&max),
height,
width,
im_height,
im_width,
step_h,
step_w,
offset,
clip,
min_max_aspect_ratios_order,
output_desc.get(),
GetBasePtr(boxes),
var_desc.get(),
GetBasePtr(variances));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(prior_box, ops::PriorBoxMLUKernel<float>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class YoloBoxMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* img_size = ctx.Input<phi::DenseTensor>("ImgSize");
auto* boxes = ctx.Output<phi::DenseTensor>("Boxes");
auto* scores = ctx.Output<phi::DenseTensor>("Scores");
const std::vector<int> anchors = ctx.Attr<std::vector<int>>("anchors");
auto class_num = ctx.Attr<int>("class_num");
auto conf_thresh = ctx.Attr<float>("conf_thresh");
auto downsample_ratio = ctx.Attr<int>("downsample_ratio");
auto clip_bbox = ctx.Attr<bool>("clip_bbox");
auto scale = ctx.Attr<float>("scale_x_y");
auto iou_aware = ctx.Attr<bool>("iou_aware");
auto iou_aware_factor = ctx.Attr<float>("iou_aware_factor");
int anchor_num = anchors.size() / 2;
int64_t size = anchors.size();
auto dim_x = x->dims();
int n = dim_x[0];
int s = anchor_num;
int h = dim_x[2];
int w = dim_x[3];
// The output of mluOpYoloBox: A 4-D tensor with shape [N, anchor_num, 4,
// H*W], the coordinates of boxes, and a 4-D tensor with shape [N,
// anchor_num, :attr:`class_num`, H*W], the classification scores of boxes.
std::vector<int64_t> boxes_dim_mluops({n, s, 4, h * w});
std::vector<int64_t> scores_dim_mluops({n, s, class_num, h * w});
// In Paddle framework: A 3-D tensor with shape [N, M, 4], the coordinates
// of boxes, and a 3-D tensor with shape [N, M, :attr:`class_num`], the
// classification scores of boxes.
std::vector<int64_t> boxes_out_dim({n, s, h * w, 4});
std::vector<int64_t> scores_out_dim({n, s, h * w, class_num});
auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
phi::DenseTensor boxes_tensor_mluops =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({n, s, 4, h * w}, dev_ctx);
phi::DenseTensor scores_tensor_mluops =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({n, s, class_num, h * w},
dev_ctx);
MLUOpTensorDesc boxes_trans_desc_mluops(
4, boxes_dim_mluops.data(), ToMluOpDataType<T>());
MLUCnnlTensorDesc boxes_trans_desc_cnnl(
4, boxes_dim_mluops.data(), ToCnnlDataType<T>());
MLUOpTensorDesc scores_trans_desc_mluops(
4, scores_dim_mluops.data(), ToMluOpDataType<T>());
MLUCnnlTensorDesc scores_trans_desc_cnnl(
4, scores_dim_mluops.data(), ToCnnlDataType<T>());
boxes->mutable_data<T>(ctx.GetPlace());
scores->mutable_data<T>(ctx.GetPlace());
FillMLUTensorWithHostValue(ctx, static_cast<T>(0), boxes);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0), scores);
MLUOpTensorDesc x_desc(*x, MLUOP_LAYOUT_ARRAY, ToMluOpDataType<T>());
MLUOpTensorDesc img_size_desc(
*img_size, MLUOP_LAYOUT_ARRAY, ToMluOpDataType<int32_t>());
Tensor anchors_temp(framework::TransToPhiDataType(VT::INT32));
anchors_temp.Resize({size});
paddle::framework::TensorFromVector(
anchors, ctx.device_context(), &anchors_temp);
MLUOpTensorDesc anchors_desc(anchors_temp);
MLUCnnlTensorDesc boxes_desc_cnnl(
4, boxes_out_dim.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc scores_desc_cnnl(
4, scores_out_dim.data(), ToCnnlDataType<T>());
MLUOP::OpYoloBox(ctx,
x_desc.get(),
GetBasePtr(x),
img_size_desc.get(),
GetBasePtr(img_size),
anchors_desc.get(),
GetBasePtr(&anchors_temp),
class_num,
conf_thresh,
downsample_ratio,
clip_bbox,
scale,
iou_aware,
iou_aware_factor,
boxes_trans_desc_mluops.get(),
GetBasePtr(&boxes_tensor_mluops),
scores_trans_desc_mluops.get(),
GetBasePtr(&scores_tensor_mluops));
const std::vector<int> perm = {0, 1, 3, 2};
// transpose the boxes from [N, S, 4, H*W] to [N, S, H*W, 4]
MLUCnnl::Transpose(ctx,
perm,
4,
boxes_trans_desc_cnnl.get(),
GetBasePtr(&boxes_tensor_mluops),
boxes_desc_cnnl.get(),
GetBasePtr(boxes));
// transpose the scores from [N, S, class_num, H*W] to [N, S, H*W,
// class_num]
MLUCnnl::Transpose(ctx,
perm,
4,
scores_trans_desc_cnnl.get(),
GetBasePtr(&scores_tensor_mluops),
scores_desc_cnnl.get(),
GetBasePtr(scores));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(yolo_box, ops::YoloBoxMLUKernel<float>);
@@ -39,8 +39,17 @@ class DropoutMLUKernel : public framework::OpKernel<T> {
    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc out_desc(*out);
-    if (!is_test) {
-      // exec dropout op for training only.
+    if (is_test && is_upscale) {
+      // dropout op for inference: out = input.
+      framework::TensorCopy(
+          *x,
+          ctx.GetPlace(),
+          ctx.template device_context<platform::MLUDeviceContext>(),
+          out);
+      return;
+    } else if (!is_test) {
+      // dropout op for training: out = input * mask / ( 1.0 - dropout_prob ) or
+      // out = input * mask.
      int seed_data = 0;
      if (seed_tensor) {
        if (platform::is_mlu_place(seed_tensor->place())) {
@@ -79,50 +88,44 @@ class DropoutMLUKernel : public framework::OpKernel<T> {
      const int device_id = ctx.GetPlace().GetDeviceId();
      auto mlu_gen_random = GetMLURandomGenerator(ctx, device_id, seed_data);
-      const float prob = is_upscale ? dropout_prob : 0.0f;
+      // compute out = input * mask / ( 1.0 - dropout_prob )
      MLUCnnl::FusedDropout(ctx,
                            mlu_gen_random->get(),
                            x_desc.get(),
                            GetBasePtr(x),
-                            prob,
+                            dropout_prob,
                            GetBasePtr(&(mlu_gen_random->get_state())),
                            mask_desc.get(),
                            GetBasePtr(mask),
                            out_desc.get(),
                            GetBasePtr(out));
-    } else {
-      // exec dropout op for inference only.
-      if (is_upscale) {
-        framework::TensorCopy(
-            *x,
-            ctx.GetPlace(),
-            ctx.template device_context<platform::MLUDeviceContext>(),
-            out);
-      } else {
-        auto scale = static_cast<T>(1.0f - dropout_prob);
-        Tensor scale_tensor(x->dtype());
-        scale_tensor.mutable_data<T>({1}, ctx.GetPlace());
-        MLUCnnlTensorDesc scale_desc(scale_tensor);
-        MLUCnnl::Fill(ctx,
-                      CNNL_POINTER_MODE_HOST,
-                      &scale,
-                      scale_desc.get(),
-                      GetBasePtr(&scale_tensor));
-        auto data_type = ToCnnlDataType<T>();
-        MLUCnnlOpTensorDesc op_tensor_desc(
-            CNNL_OP_TENSOR_MUL, data_type, CNNL_NOT_PROPAGATE_NAN);
-        MLUCnnl::OpTensor(ctx,
-                          op_tensor_desc.get(),
-                          x_desc.get(),
-                          GetBasePtr(x),
-                          scale_desc.get(),
-                          GetBasePtr(&scale_tensor),
-                          out_desc.get(),
-                          GetBasePtr(out),
-                          data_type);
-      }
-    }
+      if (is_upscale) {
+        return;
+      }
+    }
+    // In downgrade_in_infer mode, need to multiply (1.0f - dropout_prob).
+    Tensor scale_tensor(x->dtype());
+    Tensor bias_tensor(x->dtype());
+    scale_tensor.mutable_data<T>({1}, ctx.GetPlace());
+    bias_tensor.mutable_data<T>({1}, ctx.GetPlace());
+    MLUCnnlTensorDesc scale_desc(scale_tensor);
+    MLUCnnlTensorDesc bias_desc(bias_tensor);
+    FillMLUTensorWithHostValue(
+        ctx, static_cast<T>(1.0f - dropout_prob), &scale_tensor);
+    FillMLUTensorWithHostValue(ctx, static_cast<T>(0.0f), &bias_tensor);
+    MLUCnnl::Scale(ctx,
+                   0,
+                   is_test ? x_desc.get() : out_desc.get(),
+                   is_test ? GetBasePtr(x) : GetBasePtr(out),
+                   scale_desc.get(),
+                   GetBasePtr(&scale_tensor),
+                   bias_desc.get(),
+                   GetBasePtr(&bias_tensor),
+                   out_desc.get(),
+                   GetBasePtr(out));
  }
};
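For readers less familiar with the two dropout modes the rewritten kernel distinguishes, here is a minimal host-side sketch of the reference semantics (my own illustration with made-up names, not Paddle code): upscale_in_train rescales kept values by 1/(1-p) during training and is an identity at inference, while downgrade_in_infer keeps training outputs unscaled and multiplies by (1-p) at inference.

#include <cstddef>
#include <vector>

// Reference dropout semantics (illustrative only).
// p is the dropout probability, mask[i] is 0 or 1.
std::vector<float> DropoutRef(const std::vector<float>& x,
                              const std::vector<int>& mask,
                              float p, bool is_test, bool upscale_in_train) {
  std::vector<float> out(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    if (is_test) {
      // Inference: identity for upscale_in_train, scale by (1 - p) otherwise.
      out[i] = upscale_in_train ? x[i] : x[i] * (1.0f - p);
    } else {
      // Training: drop according to the mask, rescale only in upscale mode.
      float kept = x[i] * mask[i];
      out[i] = upscale_in_train ? kept / (1.0f - p) : kept;
    }
  }
  return out;
}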
......
@@ -67,7 +67,7 @@ if(WITH_GPU OR WITH_ROCM)
    op_library(skip_layernorm_op)
    op_library(yolo_box_head_op)
    op_library(yolo_box_post_op)
-    op_library(fused_embedding_eltwise_layernorm_op)
+    op_library(fused_embedding_eltwise_layernorm_op DEPS bert_encoder_functor)
    op_library(fused_gate_attention_op)
    # fusion_group
    if(NOT APPLE AND NOT WIN32)
......
@@ -45,6 +45,14 @@ struct NormConvolutionArgs {
            int stride,
            int dilation,
            int group) {
+    PADDLE_ENFORCE_LT(
+        ctx.GetComputeCapability(),
+        90,
+        phi::errors::PreconditionNotMet(
+            "Expect compute compatiblity to be less than 90, but got %d. "
+            "CUDNN FusedOps is no longer available on H100 and later "
+            "devices.",
+            ctx.GetComputeCapability()));
    PADDLE_ENFORCE_EQ(
        input_shape.size(),
        4U,
......
@@ -442,7 +442,7 @@ TEST(CudnnNormConvFp16, K1S1) {
  phi::GPUContext *ctx = static_cast<phi::GPUContext *>(
      platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
-  if (ctx->GetComputeCapability() < 70) {
+  if (ctx->GetComputeCapability() < 70 || ctx->GetComputeCapability() >= 90) {
    ASSERT_THROW(test.CheckForward(1e-3, true),
                 paddle::platform::EnforceNotMet);
    ASSERT_THROW(test.CheckBackward(1e-3, true),
@@ -472,7 +472,7 @@ TEST(CudnnNormConvFp16, K3S1) {
  phi::GPUContext *ctx = static_cast<phi::GPUContext *>(
      platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
-  if (ctx->GetComputeCapability() < 70) {
+  if (ctx->GetComputeCapability() < 70 || ctx->GetComputeCapability() >= 90) {
    ASSERT_THROW(test.CheckForward(1e-3, true),
                 paddle::platform::EnforceNotMet);
    ASSERT_THROW(test.CheckBackward(1e-3, true),
@@ -502,7 +502,7 @@ TEST(CudnnNormConvFp16, K1S1O4) {
  phi::GPUContext *ctx = static_cast<phi::GPUContext *>(
      platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
-  if (ctx->GetComputeCapability() < 70) {
+  if (ctx->GetComputeCapability() < 70 || ctx->GetComputeCapability() >= 90) {
    ASSERT_THROW(test.CheckForward(1e-3, true),
                 paddle::platform::EnforceNotMet);
    ASSERT_THROW(test.CheckBackward(1e-3, true),
@@ -532,7 +532,7 @@ TEST(CudnnNormConvFp16, K1S2O4) {
  phi::GPUContext *ctx = static_cast<phi::GPUContext *>(
      platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
-  if (ctx->GetComputeCapability() <= 70) {
+  if (ctx->GetComputeCapability() <= 70 || ctx->GetComputeCapability() >= 90) {
    ASSERT_THROW(test.CheckForward(1e-3, true),
                 paddle::platform::EnforceNotMet);
    ASSERT_THROW(test.CheckBackward(1e-3), paddle::platform::EnforceNotMet);
......
@@ -256,8 +256,10 @@ template <typename T,
          int BlockSizeX,
          int BlockSizeY,
          int VecSize,
-          typename Functor>
-__global__ void FusedDropoutActBiasGrad(Functor act_grad,
+          typename Functor,
+          int THREADS_PER_CTA = BlockSizeX *BlockSizeY>
+__global__ __launch_bounds__(THREADS_PER_CTA) void FusedDropoutActBiasGrad(
+    Functor act_grad,
    const T *dout,
    const MaskType *mask,
    const T *src,
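The added __launch_bounds__(THREADS_PER_CTA) qualifier tells the compiler the maximum block size the kernel will ever be launched with, so it can budget registers for that bound instead of the hardware maximum. A minimal stand-alone illustration of the qualifier (my own example, not taken from the patch):

// Promise the compiler that at most kMaxThreads threads per block are used.
constexpr int kMaxThreads = 256;

__global__ __launch_bounds__(kMaxThreads) void ScaleKernel(float* data,
                                                           float alpha,
                                                           int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= alpha;  // simple element-wise scale
}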
......
@@ -223,13 +223,7 @@ __global__ void InplaceAddReluAddLayerNormKernel(const float16* y_data,
      // For layer_norm, reduce to calculate mean and std
      sum_i += static_cast<float>(tmp_3);
-#if defined(PADDLE_WITH_CUDA) && __CUDA_ARCH__ >= 530
-      square_sum_i += static_cast<float>(__hmul(tmp_3, tmp_3));
-#elif defined(PADDLE_WITH_CUDA)
      square_sum_i += static_cast<float>(tmp_3) * static_cast<float>(tmp_3);
-#else
-      square_sum_i += static_cast<float>(tmp_3 * tmp_3);
-#endif
    }
    auto pair = BlockReduce(temp_storage)
                    .Reduce(PairForLayerNorm<float>(sum_i, square_sum_i),
@@ -282,8 +276,8 @@ __global__ void InplaceAddReluAddLayerNormKernel(const float16* y_data,
      half tmp_0 = __hdiv(__hsub(save_ptr[save_index], mean_i), std_i);
      half tmp_1 = scale ? __hmul(scale[j], tmp_0) : tmp_0;
#else
-      half tmp_0 = static_cast<float>(static_cast<float>(save_ptr[save_index]) +
-                                      static_cast<float>(mean_i) /
+      half tmp_0 = static_cast<half>((static_cast<float>(save_ptr[save_index]) -
+                                      static_cast<float>(mean_i)) /
                                      static_cast<float>(std_i));
      half tmp_1 = scale ? static_cast<half>(static_cast<float>(scale[j]) *
                                             static_cast<float>(tmp_0))
@@ -400,19 +394,16 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
    auto* out_data = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
    auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx);
-    blas.GEMM(false,
-              false,
+    blas.GEMM(CblasNoTrans,
+              CblasNoTrans,
              M,
              N,
              K,
              static_cast<T>(1.0),
              x_data,
-              K,
              w_data,
-              N,
              static_cast<T>(0.0),
-              out_data,
-              N);
+              out_data);
    auto* y = ctx.Input<framework::Tensor>("Y");
    auto* bias_0 = ctx.Input<framework::Tensor>("Bias0");
    auto* bias_1 = ctx.Input<framework::Tensor>("Bias1");
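If I recall phi::funcs::Blas correctly, the CBLAS_TRANSPOSE overload of GEMM derives the leading dimensions itself (lda = K, ldb = N, ldc = N for row-major, non-transposed inputs), which is why the explicit K/N/N arguments could be dropped in the call above; treat this as my reading rather than a documented guarantee.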
......
@@ -139,9 +139,8 @@ class FusedGemmEpilogueOp : public framework::OperatorWithKernel {
    }
    ctx->SetOutputDim("Out", phi::make_ddim(out_dims));
-    // Note (Ming Huang): Reserve space of relu is a bit-mask,
-    // which cannot pass nan_and_inf checking if shape is set.
-    if (activation == "gelu" && ctx->HasOutput("ReserveSpace")) {
+    if (ctx->HasOutput("ReserveSpace")) {
      ctx->SetOutputDim("ReserveSpace", phi::make_ddim(out_dims));
    }
  }
......