Commit c98c5dd1 authored by Leif

Merge remote-tracking branch 'origin/dygraph' into dygraph

parents 063395ec 5ebf5d6e
@@ -90,7 +90,7 @@ Optimizer:
 PostProcess:
   name: DistillationDBPostProcess
-  model_name: ["Student", "Student2"]
+  model_name: ["Student"]
   key: head_out
   thresh: 0.3
   box_thresh: 0.6
...
@@ -44,7 +44,8 @@ public:
            const int &gpu_id, const int &gpu_mem,
            const int &cpu_math_library_num_threads,
            const bool &use_mkldnn, const string &label_path,
-           const bool &use_tensorrt, const std::string &precision) {
+           const bool &use_tensorrt, const std::string &precision,
+           const int &rec_batch_num) {
    this->use_gpu_ = use_gpu;
    this->gpu_id_ = gpu_id;
    this->gpu_mem_ = gpu_mem;
@@ -52,6 +53,7 @@ public:
    this->use_mkldnn_ = use_mkldnn;
    this->use_tensorrt_ = use_tensorrt;
    this->precision_ = precision;
+   this->rec_batch_num_ = rec_batch_num;
    this->label_list_ = Utility::ReadDict(label_path);
    this->label_list_.insert(this->label_list_.begin(),
@@ -64,7 +66,7 @@ public:
   // Load Paddle inference model
   void LoadModel(const std::string &model_dir);

-  void Run(cv::Mat &img, std::vector<double> *times);
+  void Run(std::vector<cv::Mat> img_list, std::vector<double> *times);

 private:
   std::shared_ptr<Predictor> predictor_;
@@ -82,10 +84,12 @@ private:
   bool is_scale_ = true;
   bool use_tensorrt_ = false;
   std::string precision_ = "fp32";
+  int rec_batch_num_ = 6;
   // pre-process
   CrnnResizeImg resize_op_;
   Normalize normalize_op_;
-  Permute permute_op_;
+  PermuteBatch permute_op_;
   // post-process
   PostProcessor post_processor_;
...
@@ -44,6 +44,11 @@ public:
   virtual void Run(const cv::Mat *im, float *data);
 };

+class PermuteBatch {
+public:
+  virtual void Run(const std::vector<cv::Mat> imgs, float *data);
+};
+
 class ResizeImgType0 {
 public:
   virtual void Run(const cv::Mat &img, cv::Mat &resize_img, int max_size_len,
...
@@ -50,6 +50,9 @@ public:
   static cv::Mat GetRotateCropImage(const cv::Mat &srcimage,
                                     std::vector<std::vector<int>> box);

+  static std::vector<int> argsort(const std::vector<float> &array);
 };

 } // namespace PaddleOCR
\ No newline at end of file
@@ -61,7 +61,7 @@ DEFINE_string(cls_model_dir, "", "Path of cls inference model.");
 DEFINE_double(cls_thresh, 0.9, "Threshold of cls_thresh.");
 // recognition related
 DEFINE_string(rec_model_dir, "", "Path of rec inference model.");
-DEFINE_int32(rec_batch_num, 1, "rec_batch_num.");
+DEFINE_int32(rec_batch_num, 6, "rec_batch_num.");
 DEFINE_string(char_list_file, "../../ppocr/utils/ppocr_keys_v1.txt", "Path of dictionary.");
...
@@ -146,8 +146,9 @@ int main_rec(std::vector<cv::String> cv_all_img_names) {
   CRNNRecognizer rec(FLAGS_rec_model_dir, FLAGS_use_gpu, FLAGS_gpu_id,
                      FLAGS_gpu_mem, FLAGS_cpu_threads,
                      FLAGS_enable_mkldnn, char_list_file,
-                     FLAGS_use_tensorrt, FLAGS_precision);
+                     FLAGS_use_tensorrt, FLAGS_precision, FLAGS_rec_batch_num);

+  std::vector<cv::Mat> img_list;
   for (int i = 0; i < cv_all_img_names.size(); ++i) {
     LOG(INFO) << "The predict img: " << cv_all_img_names[i];
@@ -156,22 +157,21 @@ int main_rec(std::vector<cv::String> cv_all_img_names) {
       std::cerr << "[ERROR] image read failed! image path: " << cv_all_img_names[i] << endl;
       exit(1);
     }
-    std::vector<double> rec_times;
-    rec.Run(srcimg, &rec_times);
-
-    time_info[0] += rec_times[0];
-    time_info[1] += rec_times[1];
-    time_info[2] += rec_times[2];
+    img_list.push_back(srcimg);
   }
+
+  std::vector<double> rec_times;
+  rec.Run(img_list, &rec_times);
+
+  time_info[0] += rec_times[0];
+  time_info[1] += rec_times[1];
+  time_info[2] += rec_times[2];

   if (FLAGS_benchmark) {
     AutoLogger autolog("ocr_rec",
                        FLAGS_use_gpu,
                        FLAGS_use_tensorrt,
                        FLAGS_enable_mkldnn,
                        FLAGS_cpu_threads,
-                       1,
+                       FLAGS_rec_batch_num,
                        "dynamic",
                        FLAGS_precision,
                        time_info,
@@ -209,7 +209,7 @@ int main_system(std::vector<cv::String> cv_all_img_names) {
   CRNNRecognizer rec(FLAGS_rec_model_dir, FLAGS_use_gpu, FLAGS_gpu_id,
                      FLAGS_gpu_mem, FLAGS_cpu_threads,
                      FLAGS_enable_mkldnn, char_list_file,
-                     FLAGS_use_tensorrt, FLAGS_precision);
+                     FLAGS_use_tensorrt, FLAGS_precision, FLAGS_rec_batch_num);

   for (int i = 0; i < cv_all_img_names.size(); ++i) {
     LOG(INFO) << "The predict img: " << cv_all_img_names[i];
@@ -228,19 +228,22 @@ int main_system(std::vector<cv::String> cv_all_img_names) {
     time_info_det[1] += det_times[1];
     time_info_det[2] += det_times[2];

-    cv::Mat crop_img;
+    std::vector<cv::Mat> img_list;
     for (int j = 0; j < boxes.size(); j++) {
+      cv::Mat crop_img;
       crop_img = Utility::GetRotateCropImage(srcimg, boxes[j]);
       if (cls != nullptr) {
         crop_img = cls->Run(crop_img);
       }
-      rec.Run(crop_img, &rec_times);
-      time_info_rec[0] += rec_times[0];
-      time_info_rec[1] += rec_times[1];
-      time_info_rec[2] += rec_times[2];
+      img_list.push_back(crop_img);
     }
+
+    rec.Run(img_list, &rec_times);
+    time_info_rec[0] += rec_times[0];
+    time_info_rec[1] += rec_times[1];
+    time_info_rec[2] += rec_times[2];
   }

   if (FLAGS_benchmark) {
     AutoLogger autolog_det("ocr_det",
                            FLAGS_use_gpu,
@@ -257,7 +260,7 @@ int main_system(std::vector<cv::String> cv_all_img_names) {
                            FLAGS_use_tensorrt,
                            FLAGS_enable_mkldnn,
                            FLAGS_cpu_threads,
-                           1,
+                           FLAGS_rec_batch_num,
                            "dynamic",
                            FLAGS_precision,
                            time_info_rec,
...
@@ -15,83 +15,108 @@
 #include <include/ocr_rec.h>

 namespace PaddleOCR {

-void CRNNRecognizer::Run(cv::Mat &img, std::vector<double> *times) {
-  cv::Mat srcimg;
-  img.copyTo(srcimg);
-  cv::Mat resize_img;
-
-  float wh_ratio = float(srcimg.cols) / float(srcimg.rows);
-  auto preprocess_start = std::chrono::steady_clock::now();
-  this->resize_op_.Run(srcimg, resize_img, wh_ratio, this->use_tensorrt_);
-
-  this->normalize_op_.Run(&resize_img, this->mean_, this->scale_,
-                          this->is_scale_);
-
-  std::vector<float> input(1 * 3 * resize_img.rows * resize_img.cols, 0.0f);
-
-  this->permute_op_.Run(&resize_img, input.data());
-  auto preprocess_end = std::chrono::steady_clock::now();
-
-  // Inference.
-  auto input_names = this->predictor_->GetInputNames();
-  auto input_t = this->predictor_->GetInputHandle(input_names[0]);
-  input_t->Reshape({1, 3, resize_img.rows, resize_img.cols});
-  auto inference_start = std::chrono::steady_clock::now();
-  input_t->CopyFromCpu(input.data());
-  this->predictor_->Run();
-
-  std::vector<float> predict_batch;
-  auto output_names = this->predictor_->GetOutputNames();
-  auto output_t = this->predictor_->GetOutputHandle(output_names[0]);
-  auto predict_shape = output_t->shape();
-
-  int out_num = std::accumulate(predict_shape.begin(), predict_shape.end(), 1,
-                                std::multiplies<int>());
-  predict_batch.resize(out_num);
-  output_t->CopyToCpu(predict_batch.data());
-  auto inference_end = std::chrono::steady_clock::now();
-
-  // ctc decode
-  auto postprocess_start = std::chrono::steady_clock::now();
-  std::vector<std::string> str_res;
-  int argmax_idx;
-  int last_index = 0;
-  float score = 0.f;
-  int count = 0;
-  float max_value = 0.0f;
-
-  for (int n = 0; n < predict_shape[1]; n++) {
-    argmax_idx =
-        int(Utility::argmax(&predict_batch[n * predict_shape[2]],
-                            &predict_batch[(n + 1) * predict_shape[2]]));
-    max_value =
-        float(*std::max_element(&predict_batch[n * predict_shape[2]],
-                                &predict_batch[(n + 1) * predict_shape[2]]));
-
-    if (argmax_idx > 0 && (!(n > 0 && argmax_idx == last_index))) {
-      score += max_value;
-      count += 1;
-      str_res.push_back(label_list_[argmax_idx]);
-    }
-    last_index = argmax_idx;
-  }
-  auto postprocess_end = std::chrono::steady_clock::now();
-  score /= count;
-  for (int i = 0; i < str_res.size(); i++) {
-    std::cout << str_res[i];
-  }
-  std::cout << "\tscore: " << score << std::endl;
-
-  std::chrono::duration<float> preprocess_diff = preprocess_end - preprocess_start;
-  times->push_back(double(preprocess_diff.count() * 1000));
-  std::chrono::duration<float> inference_diff = inference_end - inference_start;
-  times->push_back(double(inference_diff.count() * 1000));
-  std::chrono::duration<float> postprocess_diff = postprocess_end - postprocess_start;
-  times->push_back(double(postprocess_diff.count() * 1000));
-}
+void CRNNRecognizer::Run(std::vector<cv::Mat> img_list, std::vector<double> *times) {
+  std::chrono::duration<float> preprocess_diff = std::chrono::steady_clock::now() - std::chrono::steady_clock::now();
+  std::chrono::duration<float> inference_diff = std::chrono::steady_clock::now() - std::chrono::steady_clock::now();
+  std::chrono::duration<float> postprocess_diff = std::chrono::steady_clock::now() - std::chrono::steady_clock::now();
+
+  int img_num = img_list.size();
+  std::vector<float> width_list;
+  for (int i = 0; i < img_num; i++) {
+    width_list.push_back(float(img_list[i].cols) / img_list[i].rows);
+  }
+  std::vector<int> indices = Utility::argsort(width_list);
+
+  for (int beg_img_no = 0; beg_img_no < img_num; beg_img_no += this->rec_batch_num_) {
+    auto preprocess_start = std::chrono::steady_clock::now();
+    int end_img_no = min(img_num, beg_img_no + this->rec_batch_num_);
+    float max_wh_ratio = 0;
+    for (int ino = beg_img_no; ino < end_img_no; ino++) {
+      int h = img_list[indices[ino]].rows;
+      int w = img_list[indices[ino]].cols;
+      float wh_ratio = w * 1.0 / h;
+      max_wh_ratio = max(max_wh_ratio, wh_ratio);
+    }
+    std::vector<cv::Mat> norm_img_batch;
+    for (int ino = beg_img_no; ino < end_img_no; ino++) {
+      cv::Mat srcimg;
+      img_list[indices[ino]].copyTo(srcimg);
+      cv::Mat resize_img;
+      this->resize_op_.Run(srcimg, resize_img, max_wh_ratio, this->use_tensorrt_);
+      this->normalize_op_.Run(&resize_img, this->mean_, this->scale_, this->is_scale_);
+      norm_img_batch.push_back(resize_img);
+    }
+
+    int batch_width = int(ceilf(32 * max_wh_ratio)) - 1;
+    std::vector<float> input(this->rec_batch_num_ * 3 * 32 * batch_width, 0.0f);
+    this->permute_op_.Run(norm_img_batch, input.data());
+    auto preprocess_end = std::chrono::steady_clock::now();
+    preprocess_diff += preprocess_end - preprocess_start;
+
+    // Inference.
+    auto input_names = this->predictor_->GetInputNames();
+    auto input_t = this->predictor_->GetInputHandle(input_names[0]);
+    input_t->Reshape({this->rec_batch_num_, 3, 32, batch_width});
+    auto inference_start = std::chrono::steady_clock::now();
+    input_t->CopyFromCpu(input.data());
+    this->predictor_->Run();
+
+    std::vector<float> predict_batch;
+    auto output_names = this->predictor_->GetOutputNames();
+    auto output_t = this->predictor_->GetOutputHandle(output_names[0]);
+    auto predict_shape = output_t->shape();
+
+    int out_num = std::accumulate(predict_shape.begin(), predict_shape.end(), 1,
+                                  std::multiplies<int>());
+    predict_batch.resize(out_num);
+    output_t->CopyToCpu(predict_batch.data());
+    auto inference_end = std::chrono::steady_clock::now();
+    inference_diff += inference_end - inference_start;
+
+    // ctc decode
+    auto postprocess_start = std::chrono::steady_clock::now();
+    for (int m = 0; m < predict_shape[0]; m++) {
+      std::vector<std::string> str_res;
+      int argmax_idx;
+      int last_index = 0;
+      float score = 0.f;
+      int count = 0;
+      float max_value = 0.0f;
+
+      for (int n = 0; n < predict_shape[1]; n++) {
+        argmax_idx = int(Utility::argmax(
+            &predict_batch[(m * predict_shape[1] + n) * predict_shape[2]],
+            &predict_batch[(m * predict_shape[1] + n + 1) * predict_shape[2]]));
+        max_value = float(*std::max_element(
+            &predict_batch[(m * predict_shape[1] + n) * predict_shape[2]],
+            &predict_batch[(m * predict_shape[1] + n + 1) * predict_shape[2]]));
+
+        if (argmax_idx > 0 && (!(n > 0 && argmax_idx == last_index))) {
+          score += max_value;
+          count += 1;
+          str_res.push_back(label_list_[argmax_idx]);
+        }
+        last_index = argmax_idx;
+      }
+      score /= count;
+      if (isnan(score))
+        continue;
+      for (int i = 0; i < str_res.size(); i++) {
+        std::cout << str_res[i];
+      }
+      std::cout << "\tscore: " << score << std::endl;
+    }
+    auto postprocess_end = std::chrono::steady_clock::now();
+    postprocess_diff += postprocess_end - postprocess_start;
+  }
+  times->push_back(double(preprocess_diff.count() * 1000));
+  times->push_back(double(inference_diff.count() * 1000));
+  times->push_back(double(postprocess_diff.count() * 1000));
+}

 void CRNNRecognizer::LoadModel(const std::string &model_dir) {
   // AnalysisConfig config;
   paddle_infer::Config config;
...
@@ -40,6 +40,17 @@ void Permute::Run(const cv::Mat *im, float *data) {
   }
 }

+void PermuteBatch::Run(const std::vector<cv::Mat> imgs, float *data) {
+  for (int j = 0; j < imgs.size(); j++) {
+    int rh = imgs[j].rows;
+    int rw = imgs[j].cols;
+    int rc = imgs[j].channels();
+    for (int i = 0; i < rc; ++i) {
+      cv::extractChannel(
+          imgs[j], cv::Mat(rh, rw, CV_32FC1, data + (j * rc + i) * rh * rw), i);
+    }
+  }
+}
+
 void Normalize::Run(cv::Mat *im, const std::vector<float> &mean,
                     const std::vector<float> &scale, const bool is_scale) {
   double e = 1.0;
...
@@ -90,16 +101,17 @@ void CrnnResizeImg::Run(const cv::Mat &img, cv::Mat &resize_img, float wh_ratio,
   imgC = rec_image_shape[0];
   imgH = rec_image_shape[1];
   imgW = rec_image_shape[2];

   imgW = int(32 * wh_ratio);

   float ratio = float(img.cols) / float(img.rows);
   int resize_w, resize_h;

   if (ceilf(imgH * ratio) > imgW)
     resize_w = imgW;
   else
     resize_w = int(ceilf(imgH * ratio));

   cv::resize(img, resize_img, cv::Size(resize_w, imgH), 0.f, 0.f,
              cv::INTER_LINEAR);
+  cv::copyMakeBorder(resize_img, resize_img, 0, 0, 0,
...
@@ -147,4 +147,17 @@ cv::Mat Utility::GetRotateCropImage(const cv::Mat &srcimage,
   }
 }

+std::vector<int> Utility::argsort(const std::vector<float> &array) {
+  const int array_len(array.size());
+  std::vector<int> array_index(array_len, 0);
+  for (int i = 0; i < array_len; ++i)
+    array_index[i] = i;
+
+  std::sort(array_index.begin(), array_index.end(),
+            [&array](int pos1, int pos2) { return (array[pos1] < array[pos2]); });
+
+  return array_index;
+}
+
 } // namespace PaddleOCR
\ No newline at end of file
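Taken together, `Utility::argsort` and `PermuteBatch` let the new `CRNNRecognizer::Run` sort crops by width-to-height ratio and process them in slices of `rec_batch_num_`, so each batch only pads to the widest image it actually contains. As a purely illustrative aside (not part of this commit; the ratios and batch size below are made up), a minimal Python sketch of that batching idea:

```python
# Illustrative sketch of width-sorted batching: sort crops by aspect ratio,
# then slice into fixed-size batches so each batch pads only to its own
# widest member. Ratios and batch size are made-up example values.
import math

ratios = [5.2, 1.1, 9.8, 1.0, 4.7, 5.0, 2.3]  # w/h of each text crop
batch_size = 3  # plays the role of rec_batch_num_

indices = sorted(range(len(ratios)), key=lambda i: ratios[i])  # Utility::argsort
for beg in range(0, len(indices), batch_size):
    batch = indices[beg:beg + batch_size]
    # crops are resized to height 32, so the padded batch width tracks
    # only the largest ratio inside this batch, not the global maximum
    max_ratio = max(ratios[i] for i in batch)
    print(batch, "padded width:", math.ceil(32 * max_ratio))
```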
# Paddle2ONNX Model Conversion and Prediction

This section describes how to convert a PaddleOCR model to the ONNX format and run prediction with an ONNX engine.

## 1. Environment Setup

You need to prepare both the Paddle2ONNX conversion environment and the ONNX prediction environment.

### Paddle2ONNX

Paddle2ONNX converts models from the PaddlePaddle format to the ONNX format. It currently provides stable operator export for ONNX Opset 9-11, and some Paddle operators can be converted to lower ONNX Opsets.
For more details, see [Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX/blob/develop/README_zh.md).

- Install Paddle2ONNX
```
python3.7 -m pip install paddle2onnx
```

- Install ONNX Runtime
```
# Version 1.4.0 is recommended; change the version number to suit your environment
python3.7 -m pip install onnxruntime==1.4.0
```

## 2. Model Conversion

- Download a Paddle model

There are two ways to obtain a Paddle static-graph model: download a prediction model provided by PaddleOCR from the [model_list](../../doc/doc_ch/models_list.md), or convert trained weights to an inference model following the [model export guide](../../doc/doc_ch/inference.md#训练模型转inference模型).

Taking the ppocr detection model as an example:
```
wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar
cd ./inference && tar xf ch_ppocr_mobile_v2.0_det_infer.tar && cd ..
```

- Convert the model

Use Paddle2ONNX to convert the Paddle static-graph model to the ONNX format:
```
paddle2onnx --model_dir=./inference/ch_ppocr_mobile_v2.0_det_infer/ \
    --model_filename=inference.pdmodel \
    --params_filename=inference.pdiparams \
    --save_file=./inference/det_mobile_onnx/model.onnx \
    --opset_version=10 \
    --enable_onnx_checker=True
```

After the command finishes, the ONNX model is saved under `./inference/det_mobile_onnx/`.
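If you want to verify the exported file yourself, independent of the `--enable_onnx_checker=True` flag above, a minimal sketch with the `onnx` Python package (a dependency of paddle2onnx) looks like this; the model path simply repeats the `--save_file` argument:

```python
# Sanity-check the exported graph structure (path matches --save_file above).
import onnx

model = onnx.load("./inference/det_mobile_onnx/model.onnx")
onnx.checker.check_model(model)  # raises onnx.checker.ValidationError on a malformed graph
print(model.opset_import)        # should report opset 10, matching --opset_version
```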
* Note: the following models cannot be converted to ONNX yet: NRTR, SAR, RARE, SRN.

## 3. ONNX Prediction

Taking the detection model as an example, run prediction with ONNX as follows:
```
python3.7 ../../tools/infer/predict_det.py --use_gpu=False --use_onnx=True \
    --det_model_dir=./inference/det_mobile_onnx/model.onnx \
    --image_dir=../../doc/imgs/1.jpg
```

After the command runs, the predicted detection box coordinates are printed to the terminal and the visualized result is saved under `./inference_results/`.
```
root INFO: 1.jpg [[[291, 295], [334, 292], [348, 844], [305, 847]], [[344, 296], [379, 294], [387, 669], [353, 671]]]
The predict time of ../../doc/imgs/1.jpg: 0.06162881851196289
The visualized image saved in ./inference_results/det_res_1.jpg
```

* Note: ONNX does not support variable-shape inputs for now, so the input must be resized to a fixed shape; the prediction results may differ slightly from direct Paddle prediction.
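For readers who want to drive the exported detector directly, without the PaddleOCR tooling, the following is a minimal onnxruntime sketch. The fixed 960x960 input size and the ImageNet-style normalization constants are assumptions based on the usual DB preprocessing, not values stated in this document; check them against your exported model.

```python
# Minimal sketch: run the exported detection model with onnxruntime directly.
# Assumptions (verify against your export): fixed 1x3x960x960 input and
# ImageNet-style normalization, as commonly used by the DB detector.
import cv2
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("./inference/det_mobile_onnx/model.onnx")

img = cv2.imread("../../doc/imgs/1.jpg")
img = cv2.resize(img, (960, 960)).astype("float32") / 255.0
mean = np.array([0.485, 0.456, 0.406], dtype="float32")
std = np.array([0.229, 0.224, 0.225], dtype="float32")
img = ((img - mean) / std).transpose(2, 0, 1)[np.newaxis, :]  # HWC -> NCHW

input_name = sess.get_inputs()[0].name  # avoids hard-coding the input name
outputs = sess.run(None, {input_name: img})
print(outputs[0].shape)  # probability map to feed into DB post-processing
```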
@@ -114,7 +114,7 @@ The recognition model is the same.
 git clone https://github.com/PaddlePaddle/PaddleOCR

 # Enter the working directory
-cd PaddleOCR/deploy/pdserver/
+cd PaddleOCR/deploy/pdserving/
 ```
 The pdserver directory contains the code to start the pipeline service and send prediction requests, including:
...
@@ -112,7 +112,7 @@ python3 -m paddle_serving_client.convert --dirname ./ch_ppocr_mobile_v2.0_rec_in
 git clone https://github.com/PaddlePaddle/PaddleOCR

 # Enter the working directory
-cd PaddleOCR/deploy/pdserver/
+cd PaddleOCR/deploy/pdserving/
 ```
 The pdserver directory contains the code for starting the pipeline service and sending prediction requests, including:
 ```
@@ -206,7 +206,7 @@ pip3 install paddle-serving-app==0.3.1
 1. Start the server program
 ```
 cd win
 python3 ocr_web_server.py gpu  (GPU mode)
 or
 python3 ocr_web_server.py cpu  (CPU mode)
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: disable=doc-string-missing
from paddle_serving_client import Client

import sys
import base64
import os

from ocr_reader import OCRReader

client = Client()
# TODO: load_client needs to load more than one client model;
# some details still need to be figured out.
client.load_client_config(sys.argv[1:])
client.connect(["127.0.0.1:9293"])

test_img_dir = "test_img/"

ocr_reader = OCRReader(char_dict_path="../../ppocr/utils/ppocr_keys_v1.txt")


def cv2_to_base64(image):
    # `image` holds the raw file bytes; encode them as a base64 string for the feed.
    return base64.b64encode(image).decode('utf8')


for img_file in os.listdir(test_img_dir):
    with open(os.path.join(test_img_dir, img_file), 'rb') as file:
        image_data = file.read()
    image = cv2_to_base64(image_data)
    res_list = []
    fetch_map = client.predict(
        feed={"x": image}, fetch=["save_infer_model/scale_0.tmp_1"], batch=True)
    print("fetch map:", fetch_map)
    one_batch_res = ocr_reader.postprocess(fetch_map, with_score=True)
    for res in one_batch_res:
        res_list.append(res[0])
    res = {"res": str(res_list)}
    print(res)
@@ -18,13 +18,19 @@ import json
 import base64
 import os
+import argparse
+
+parser = argparse.ArgumentParser(description="args for paddleserving")
+parser.add_argument("--image_dir", type=str, default="../../doc/imgs/")
+args = parser.parse_args()

 def cv2_to_base64(image):
     return base64.b64encode(image).decode('utf8')

 url = "http://127.0.0.1:9998/ocr/prediction"
-test_img_dir = "../../doc/imgs/"
+test_img_dir = args.image_dir

 for idx, img_file in enumerate(os.listdir(test_img_dir)):
     with open(os.path.join(test_img_dir, img_file), 'rb') as file:
         image_data1 = file.read()
@@ -36,5 +42,4 @@ for idx, img_file in enumerate(os.listdir(test_img_dir)):
     r = requests.post(url=url, data=json.dumps(data))
     print(r.json())

-test_img_dir = "../../doc/imgs/"
 print("==> total number of test imgs: ", len(os.listdir(test_img_dir)))
@@ -30,7 +30,12 @@ def cv2_to_base64(image):
     return base64.b64encode(image).decode('utf8')

-test_img_dir = "imgs/"
+import argparse
+parser = argparse.ArgumentParser(description="args for paddleserving")
+parser.add_argument("--image_dir", type=str, default="../../doc/imgs/")
+args = parser.parse_args()
+test_img_dir = args.image_dir

 for img_file in os.listdir(test_img_dir):
     with open(os.path.join(test_img_dir, img_file), 'rb') as file:
         image_data = file.read()
...
@@ -101,15 +101,28 @@ python3 tools/train.py -c configs/det/det_mv3_db.yml \
 # Single-machine multi-GPU training: set the GPU IDs to use with the --gpus parameter
 python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/det/det_mv3_db.yml \
     -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained
+
+# Multi-machine multi-GPU training: set the machine IPs with the --ips parameter and the GPU IDs with the --gpus parameter
+python3 -m paddle.distributed.launch --ips="xx.xx.xx.xx,xx.xx.xx.xx" --gpus '0,1,2,3' tools/train.py -c configs/det/det_mv3_db.yml \
+    -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained
 ```
 In the commands above, -c selects training with the configs/det/det_mv3_db.yml configuration file;
 see [this link](./config.md) for a detailed explanation of the configuration file.

 You can also change training parameters with the -o parameter without modifying the yml file, for example setting the training learning rate to 0.0001:
 ```shell
 python3 tools/train.py -c configs/det/det_mv3_db.yml -o Optimizer.base_lr=0.0001
 ```
+
+**Note:** For multi-machine multi-GPU training, replace the ips value in the command above with the addresses of your machines; the machines must be able to ping each other. The command for viewing a machine's IP address is `ifconfig`.
+
+If you want to further speed up training, you can use [automatic mixed precision training](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/01_paddle2.0_introduction/basic_concept/amp_cn.html). Taking single-machine single-GPU training as an example, the command is as follows:
+```shell
+python3 tools/train.py -c configs/det/det_mv3_db.yml \
+    -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained \
+    Global.use_amp=True Global.scale_loss=1024.0 Global.use_dynamic_loss_scaling=True
+```

 <a name="22-----"></a>
 ## 2.2 Resuming Training from a Checkpoint
...
@@ -98,7 +98,19 @@ python3 tools/train.py -c configs/det/det_mv3_db.yml -o \
 # multi-GPU training
 # Set the GPU IDs used with the '--gpus' parameter.
 python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/det/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained
+
+# multi-node, multi-GPU training
+# Set the IPs of your nodes with the '--ips' parameter and the GPU IDs with the '--gpus' parameter.
+python3 -m paddle.distributed.launch --ips="xx.xx.xx.xx,xx.xx.xx.xx" --gpus '0,1,2,3' tools/train.py -c configs/det/det_mv3_db.yml \
+    -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained
+```
+
+**Note:** For multi-node multi-GPU training, you need to replace the `ips` value in the preceding command with the addresses of your machines, and the machines must be able to ping each other. The command for viewing the IP address of a machine is `ifconfig`.
+
+If you want to further speed up the training, you can use [automatic mixed precision training](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/01_paddle2.0_introduction/basic_concept/amp_en.html). For single-card training, the command is as follows:
+```
+python3 tools/train.py -c configs/det/det_mv3_db.yml \
+    -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained \
+    Global.use_amp=True Global.scale_loss=1024.0 Global.use_dynamic_loss_scaling=True
 ```

 ### 2.2 Load Trained Model and Continue Training
...
@@ -11,7 +11,10 @@
 #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #See the License for the specific language governing permissions and
 #limitations under the License.
+"""
+This code is referred from:
+https://github.com/songdejia/EAST/blob/master/data_utils.py
+"""
 import math
 import cv2
 import numpy as np
@@ -24,10 +27,10 @@ __all__ = ['EASTProcessTrain']

 class EASTProcessTrain(object):
     def __init__(self,
-                 image_shape = [512, 512],
-                 background_ratio = 0.125,
-                 min_crop_side_ratio = 0.1,
-                 min_text_size = 10,
+                 image_shape=[512, 512],
+                 background_ratio=0.125,
+                 min_crop_side_ratio=0.1,
+                 min_text_size=10,
                  **kwargs):
         self.input_size = image_shape[1]
         self.random_scale = np.array([0.5, 1, 2.0, 3.0])
@@ -282,12 +285,7 @@ class EASTProcessTrain(object):
                1.0 / max(min(poly_h, poly_w), 1.0)
         return score_map, geo_map, training_mask

-    def crop_area(self,
-                  im,
-                  polys,
-                  tags,
-                  crop_background=False,
-                  max_tries=50):
+    def crop_area(self, im, polys, tags, crop_background=False, max_tries=50):
         """
         make random crop from the input image
         :param im:
@@ -435,5 +433,4 @@ class EASTProcessTrain(object):
         data['score_map'] = score_map
         data['geo_map'] = geo_map
         data['training_mask'] = training_mask
-        # print(im.shape, score_map.shape, geo_map.shape, training_mask.shape)
         return data
\ No newline at end of file
@@ -11,6 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+This code is referred from:
+https://github.com/WenmuZhou/DBNet.pytorch/blob/master/data_loader/modules/iaa_augment.py
+"""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
...
-# -*- coding:utf-8 -*-
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is referred from:
+https://github.com/WenmuZhou/DBNet.pytorch/blob/master/data_loader/modules/make_border_map.py
+"""
 from __future__ import absolute_import
 from __future__ import division
...
-# -*- coding:utf-8 -*-
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from __future__ import absolute_import
 from __future__ import division
@@ -12,12 +24,8 @@ from shapely.geometry import Polygon

 __all__ = ['MakePseGt']

-class MakePseGt(object):
-    r'''
-    Making binary mask from detection data with ICDAR format.
-    Typically following the process of class `MakeICDARData`.
-    '''

+class MakePseGt(object):
     def __init__(self, kernel_num=7, size=640, min_shrink_ratio=0.4, **kwargs):
         self.kernel_num = kernel_num
         self.min_shrink_ratio = min_shrink_ratio
@@ -38,16 +46,20 @@ class MakePseGt(object):
         text_polys *= scale

         gt_kernels = []
-        for i in range(1,self.kernel_num+1):
+        for i in range(1, self.kernel_num + 1):
             # s1->sn, from big to small
-            rate = 1.0 - (1.0 - self.min_shrink_ratio) / (self.kernel_num - 1) * i
-            text_kernel, ignore_tags = self.generate_kernel(image.shape[0:2], rate, text_polys, ignore_tags)
+            rate = 1.0 - (1.0 - self.min_shrink_ratio) / (self.kernel_num - 1
+                                                          ) * i
+            text_kernel, ignore_tags = self.generate_kernel(
+                image.shape[0:2], rate, text_polys, ignore_tags)
             gt_kernels.append(text_kernel)

         training_mask = np.ones(image.shape[0:2], dtype='uint8')
         for i in range(text_polys.shape[0]):
             if ignore_tags[i]:
-                cv2.fillPoly(training_mask, text_polys[i].astype(np.int32)[np.newaxis, :, :], 0)
+                cv2.fillPoly(training_mask,
+                             text_polys[i].astype(np.int32)[np.newaxis, :, :],
+                             0)

         gt_kernels = np.array(gt_kernels)
         gt_kernels[gt_kernels > 0] = 1
@@ -59,16 +71,25 @@ class MakePseGt(object):
         data['mask'] = training_mask.astype('float32')
         return data

-    def generate_kernel(self, img_size, shrink_ratio, text_polys, ignore_tags=None):
+    def generate_kernel(self,
+                        img_size,
+                        shrink_ratio,
+                        text_polys,
+                        ignore_tags=None):
+        """
+        Refer to part of the code:
+        https://github.com/open-mmlab/mmocr/blob/main/mmocr/datasets/pipelines/textdet_targets/base_textdet_targets.py
+        """
         h, w = img_size
         text_kernel = np.zeros((h, w), dtype=np.float32)
         for i, poly in enumerate(text_polys):
             polygon = Polygon(poly)
-            distance = polygon.area * (1 - shrink_ratio * shrink_ratio) / (polygon.length + 1e-6)
+            distance = polygon.area * (1 - shrink_ratio * shrink_ratio) / (
+                polygon.length + 1e-6)
             subject = [tuple(l) for l in poly]
             pco = pyclipper.PyclipperOffset()
-            pco.AddPath(subject, pyclipper.JT_ROUND,
-                        pyclipper.ET_CLOSEDPOLYGON)
+            pco.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
             shrinked = np.array(pco.Execute(-distance))

             if len(shrinked) == 0 or shrinked.size == 0:
...