Commit fccfdfa5 authored by dlyrm

update code

parent dcc7bf4f
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/keypoint_postprocess.h"
#define PI 3.1415926535
#define HALF_CIRCLE_DEGREE 180
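// Build a third point for cv::getAffineTransform by rotating the vector
// (a - b) 90 degrees around a; three non-collinear point pairs determine
// the affine transform.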
cv::Point2f get_3rd_point(cv::Point2f& a, cv::Point2f& b) {
cv::Point2f direct{a.x - b.x, a.y - b.y};
return cv::Point2f(a.x - direct.y, a.y + direct.x);
}
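// Rotate the point (src_point_x, src_point_y) by rot_rad radians.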
std::vector<float> get_dir(float src_point_x,
float src_point_y,
float rot_rad) {
float sn = sin(rot_rad);
float cs = cos(rot_rad);
std::vector<float> src_result{0.0, 0.0};
src_result[0] = src_point_x * cs - src_point_y * sn;
src_result[1] = src_point_x * sn + src_point_y * cs;
return src_result;
}
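// Apply the 2x3 affine matrix `trans` to (pt_x, pt_y) and write the result
// for joint p; preds is laid out as [score, x, y] per joint.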
void affine_tranform(
float pt_x, float pt_y, cv::Mat& trans, std::vector<float>& preds, int p) {
double new1[3] = {pt_x, pt_y, 1.0};
cv::Mat new_pt(3, 1, trans.type(), new1);
cv::Mat w = trans * new_pt;
preds[p * 3 + 1] = static_cast<float>(w.at<double>(0, 0));
preds[p * 3 + 2] = static_cast<float>(w.at<double>(1, 0));
}
void get_affine_transform(std::vector<float>& center,
std::vector<float>& scale,
float rot,
std::vector<int>& output_size,
cv::Mat& trans,
int inv) {
float src_w = scale[0];
float dst_w = static_cast<float>(output_size[0]);
float dst_h = static_cast<float>(output_size[1]);
float rot_rad = rot * PI / HALF_CIRCLE_DEGREE;
std::vector<float> src_dir = get_dir(-0.5 * src_w, 0, rot_rad);
std::vector<float> dst_dir{static_cast<float>(-0.5) * dst_w, 0.0};
cv::Point2f srcPoint2f[3], dstPoint2f[3];
srcPoint2f[0] = cv::Point2f(center[0], center[1]);
srcPoint2f[1] = cv::Point2f(center[0] + src_dir[0], center[1] + src_dir[1]);
srcPoint2f[2] = get_3rd_point(srcPoint2f[0], srcPoint2f[1]);
dstPoint2f[0] = cv::Point2f(dst_w * 0.5, dst_h * 0.5);
dstPoint2f[1] =
cv::Point2f(dst_w * 0.5 + dst_dir[0], dst_h * 0.5 + dst_dir[1]);
dstPoint2f[2] = get_3rd_point(dstPoint2f[0], dstPoint2f[1]);
if (inv == 0) {
trans = cv::getAffineTransform(srcPoint2f, dstPoint2f);
} else {
trans = cv::getAffineTransform(dstPoint2f, srcPoint2f);
}
}
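// Map joint coordinates from heatmap space back to the original image, either
// through the inverse affine transform or with a plain scale-and-offset.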
void transform_preds(std::vector<float>& coords,
std::vector<float>& center,
std::vector<float>& scale,
std::vector<int>& output_size,
std::vector<int64_t>& dim,
std::vector<float>& target_coords,
bool affine=false) {
if (affine) {
cv::Mat trans(2, 3, CV_64FC1);
get_affine_transform(center, scale, 0, output_size, trans, 1);
for (int p = 0; p < dim[1]; ++p) {
affine_tranform(
coords[p * 2], coords[p * 2 + 1], trans, target_coords, p);
}
} else {
float heat_w = static_cast<float>(output_size[0]);
float heat_h = static_cast<float>(output_size[1]);
float x_scale = scale[0] / heat_w;
float y_scale = scale[1] / heat_h;
float offset_x = center[0] - scale[0] / 2.;
float offset_y = center[1] - scale[1] / 2.;
for (int i = 0; i < dim[1]; i++) {
target_coords[i * 3 + 1] = x_scale * coords[i * 2] + offset_x;
target_coords[i * 3 + 2] = y_scale * coords[i * 2 + 1] + offset_y;
}
}
}
// only for batchsize == 1
void get_max_preds(std::vector<float>& heatmap,
std::vector<int>& dim,
std::vector<float>& preds,
std::vector<float>& maxvals,
int batchid,
int joint_idx) {
int num_joints = dim[1];
int width = dim[3];
for (int j = 0; j < dim[1]; j++) {
float* index = &(
heatmap[batchid * num_joints * dim[2] * dim[3] + j * dim[2] * dim[3]]);
float* end = index + dim[2] * dim[3];
float* max_dis = std::max_element(index, end);
auto max_id = std::distance(index, max_dis);
maxvals[j] = *max_dis;
if (*max_dis > 0) {
preds[j * 2] = static_cast<float>(max_id % width);
preds[j * 2 + 1] = static_cast<float>(max_id / width);
}
}
}
void dark_parse(std::vector<float>& heatmap,
std::vector<int64_t>& dim,
std::vector<float>& coords,
int px,
int py,
int index,
int ch){
/* DARK postprocessing, Zhang et al., "Distribution-Aware Coordinate
Representation for Human Pose Estimation" (CVPR 2020).
1) offset = -hessian.inv() * derivative
2) dx = (heatmap[x+1] - heatmap[x-1]) / 2
3) dxx = (dx[x+1] - dx[x-1]) / 2
4) derivative = Mat([dx, dy])
5) hessian = Mat([[dxx, dxy], [dxy, dyy]])
*/
std::vector<float>::const_iterator first1 = heatmap.begin() + index;
std::vector<float>::const_iterator last1 = heatmap.begin() + index + dim[2] * dim[3];
std::vector<float> heatmap_ch(first1, last1);
cv::Mat heatmap_mat = cv::Mat(heatmap_ch).reshape(0,dim[2]);
heatmap_mat.convertTo(heatmap_mat, CV_32FC1);
cv::GaussianBlur(heatmap_mat, heatmap_mat, cv::Size(3, 3), 0, 0);
heatmap_mat = heatmap_mat.reshape(1,1);
heatmap_ch = std::vector<float>(heatmap_mat.reshape(1,1));
float epsilon = 1e-10;
// Sample log-heatmap values around the target location.
float xy = log(fmax(heatmap_ch[py * dim[3] + px], epsilon));
float xr = log(fmax(heatmap_ch[py * dim[3] + px + 1], epsilon));
float xl = log(fmax(heatmap_ch[py * dim[3] + px - 1], epsilon));
float xr2 = log(fmax(heatmap_ch[py * dim[3] + px + 2], epsilon));
float xl2 = log(fmax(heatmap_ch[py * dim[3] + px - 2], epsilon));
float yu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px], epsilon));
float yd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px], epsilon));
float yu2 = log(fmax(heatmap_ch[(py + 2) * dim[3] + px], epsilon));
float yd2 = log(fmax(heatmap_ch[(py - 2) * dim[3] + px], epsilon));
float xryu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px + 1], epsilon));
float xryd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px + 1], epsilon));
float xlyu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px - 1], epsilon));
float xlyd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px - 1], epsilon));
// Compute first and second derivatives from the sampled values.
float dx = 0.5 * (xr - xl);
float dy = 0.5 * (yu - yd);
float dxx = 0.25 * (xr2 - 2*xy + xl2);
float dxy = 0.25 * (xryu - xryd - xlyu + xlyd);
float dyy = 0.25 * (yu2 - 2*xy + yd2);
// Solve offset = -hessian.inv() * derivative for the sub-pixel shift.
if (dxx * dyy - dxy * dxy != 0) {
float M[2][2] = {{dxx, dxy}, {dxy, dyy}};
float D[2] = {dx, dy};
cv::Mat hessian(2, 2, CV_32F, M);
cv::Mat derivative(2, 1, CV_32F, D);
cv::Mat offset = -hessian.inv() * derivative;
coords[ch * 2] += offset.at<float>(0, 0);
coords[ch * 2 + 1] += offset.at<float>(1, 0);
}
}
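// Decode the final keypoints from the heatmap argmax indices in idxout:
// optionally refine with DARK, otherwise nudge a quarter pixel toward the
// higher neighbor, then map back to image coordinates.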
void get_final_preds(std::vector<float>& heatmap,
std::vector<int64_t>& dim,
std::vector<int64_t>& idxout,
std::vector<int64_t>& idxdim,
std::vector<float>& center,
std::vector<float> scale,
std::vector<float>& preds,
int batchid,
bool DARK) {
std::vector<float> coords;
coords.resize(dim[1] * 2);
int heatmap_height = dim[2];
int heatmap_width = dim[3];
for (int j = 0; j < dim[1]; ++j) {
int index = (batchid * dim[1] + j) * dim[2] * dim[3];
int idx = idxout[batchid * dim[1] + j];
preds[j * 3] = heatmap[index + idx];
coords[j * 2] = idx % heatmap_width;
coords[j * 2 + 1] = idx / heatmap_width;
int px = int(coords[j * 2] + 0.5);
int py = int(coords[j * 2 + 1] + 0.5);
// DARK samples px +- 2 and py +- 2, so both axes must be in range.
if (DARK && px > 1 && px < heatmap_width - 2 && py > 1 &&
py < heatmap_height - 2) {
dark_parse(heatmap, dim, coords, px, py, index, j);
} else {
if (px > 0 && px < heatmap_width - 1) {
float diff_x = heatmap[index + py * dim[3] + px + 1] -
heatmap[index + py * dim[3] + px - 1];
coords[j * 2] += diff_x > 0 ? 0.25 : -0.25;
}
if (py > 0 && py < heatmap_height - 1) {
float diff_y = heatmap[index + (py + 1) * dim[3] + px] -
heatmap[index + (py - 1) * dim[3] + px];
coords[j * 2 + 1] += diff_y > 0 ? 0.25 : -0.25;
}
}
}
std::vector<int> img_size{heatmap_width, heatmap_height};
transform_preds(coords, center, scale, img_size, dim, preds);
}
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <math.h>
#include <stdarg.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <algorithm>
#include <iostream>
#include <numeric>
#include <string>
#include <vector>
#include "include/config_parser.h"
#include "include/keypoint_detector.h"
#include "include/object_detector.h"
#include "include/preprocess_op.h"
#include "json/json.h"
Json::Value RT_Config;
void PrintBenchmarkLog(std::vector<double> det_time, int img_num) {
std::cout << "----------------------- Config info -----------------------"
<< std::endl;
std::cout << "num_threads: " << RT_Config["cpu_threads"].as<int>()
<< std::endl;
std::cout << "----------------------- Data info -----------------------"
<< std::endl;
std::cout << "batch_size_det: " << RT_Config["batch_size_det"].as<int>()
<< std::endl;
std::cout << "----------------------- Model info -----------------------"
<< std::endl;
RT_Config["model_dir_det"].as<std::string>().erase(
RT_Config["model_dir_det"].as<std::string>().find_last_not_of("/") + 1);
std::cout << "detection model_name: "
<< RT_Config["model_dir_det"].as<std::string>() << std::endl;
std::cout << "----------------------- Perf info ------------------------"
<< std::endl;
std::cout << "Total number of predicted data: " << img_num
<< " and total time spent(ms): "
<< std::accumulate(det_time.begin(), det_time.end(), 0.)
<< std::endl;
img_num = std::max(1, img_num);
std::cout << "preproce_time(ms): " << det_time[0] / img_num
<< ", inference_time(ms): " << det_time[1] / img_num
<< ", postprocess_time(ms): " << det_time[2] / img_num << std::endl;
}
void PrintKptsBenchmarkLog(std::vector<double> det_time, int img_num) {
std::cout << "----------------------- Data info -----------------------"
<< std::endl;
std::cout << "batch_size_keypoint: "
<< RT_Config["batch_size_keypoint"].as<int>() << std::endl;
std::cout << "----------------------- Model info -----------------------"
<< std::endl;
RT_Config["model_dir_keypoint"].as<std::string>().erase(
RT_Config["model_dir_keypoint"].as<std::string>().find_last_not_of("/") +
1);
std::cout << "keypoint model_name: "
<< RT_Config["model_dir_keypoint"].as<std::string>() << std::endl;
std::cout << "----------------------- Perf info ------------------------"
<< std::endl;
std::cout << "Total number of predicted data: " << img_num
<< " and total time spent(ms): "
<< std::accumulate(det_time.begin(), det_time.end(), 0.)
<< std::endl;
img_num = std::max(1, img_num);
std::cout << "Average time cost per person:" << std::endl
<< "preproce_time(ms): " << det_time[0] / img_num
<< ", inference_time(ms): " << det_time[1] / img_num
<< ", postprocess_time(ms): " << det_time[2] / img_num << std::endl;
}
void PrintTotalTimeLog(double det_time,
double keypoint_time,
double crop_time) {
std::cout << "----------------------- Time info ------------------------"
<< std::endl;
std::cout << "Total Pipeline time(ms) per image: "
<< det_time + keypoint_time + crop_time << std::endl;
std::cout << "Average det time(ms) per image: " << det_time
<< ", average keypoint time(ms) per image: " << keypoint_time
<< ", average crop time(ms) per image: " << crop_time << std::endl;
}
static std::string DirName(const std::string& filepath) {
auto pos = filepath.rfind(OS_PATH_SEP);
if (pos == std::string::npos) {
return "";
}
return filepath.substr(0, pos);
}
static bool PathExists(const std::string& path) {
struct stat buffer;
return (stat(path.c_str(), &buffer) == 0);
}
static void MkDir(const std::string& path) {
if (PathExists(path)) return;
int ret = 0;
ret = mkdir(path.c_str(), 0755);
if (ret != 0) {
std::string path_error(path);
path_error += " mkdir failed!";
throw std::runtime_error(path_error);
}
}
static void MkDirs(const std::string& path) {
if (path.empty()) return;
if (PathExists(path)) return;
MkDirs(DirName(path));
MkDir(path);
}
void PredictImage(const std::vector<std::string> all_img_paths,
const int batch_size_det,
const double threshold_det,
const bool run_benchmark,
PaddleDetection::ObjectDetector* det,
PaddleDetection::KeyPointDetector* keypoint,
const std::string& output_dir = "output") {
std::vector<double> det_t = {0, 0, 0};
int steps = ceil(static_cast<float>(all_img_paths.size()) / batch_size_det);
int kpts_imgs = 0;
std::vector<double> keypoint_t = {0, 0, 0};
double midtimecost = 0;
for (int idx = 0; idx < steps; idx++) {
std::vector<cv::Mat> batch_imgs;
int left_image_cnt = all_img_paths.size() - idx * batch_size_det;
if (left_image_cnt > batch_size_det) {
left_image_cnt = batch_size_det;
}
for (int bs = 0; bs < left_image_cnt; bs++) {
std::string image_file_path = all_img_paths.at(idx * batch_size_det + bs);
cv::Mat im = cv::imread(image_file_path, 1);
batch_imgs.push_back(im);
}
// Store all detected result
std::vector<PaddleDetection::ObjectResult> result;
std::vector<int> bbox_num;
std::vector<double> det_times;
// Store keypoint results
std::vector<PaddleDetection::KeyPointResult> result_kpts;
std::vector<cv::Mat> imgs_kpts;
std::vector<std::vector<float>> center_bs;
std::vector<std::vector<float>> scale_bs;
std::vector<int> colormap_kpts = PaddleDetection::GenerateColorMap(20);
bool is_rbox = false;
if (run_benchmark) {
det->Predict(
batch_imgs, threshold_det, 50, 50, &result, &bbox_num, &det_times);
} else {
det->Predict(
batch_imgs, threshold_det, 0, 1, &result, &bbox_num, &det_times);
}
// get labels and colormap
auto labels = det->GetLabelList();
auto colormap = PaddleDetection::GenerateColorMap(labels.size());
int item_start_idx = 0;
for (int i = 0; i < left_image_cnt; i++) {
cv::Mat im = batch_imgs[i];
std::vector<PaddleDetection::ObjectResult> im_result;
int detect_num = 0;
for (int j = 0; j < bbox_num[i]; j++) {
PaddleDetection::ObjectResult item = result[item_start_idx + j];
if (item.confidence < threshold_det || item.class_id == -1) {
continue;
}
detect_num += 1;
im_result.push_back(item);
if (item.rect.size() > 6) {
is_rbox = true;
printf("class=%d confidence=%.4f rect=[%d %d %d %d %d %d %d %d]\n",
item.class_id,
item.confidence,
item.rect[0],
item.rect[1],
item.rect[2],
item.rect[3],
item.rect[4],
item.rect[5],
item.rect[6],
item.rect[7]);
} else {
printf("class=%d confidence=%.4f rect=[%d %d %d %d]\n",
item.class_id,
item.confidence,
item.rect[0],
item.rect[1],
item.rect[2],
item.rect[3]);
}
}
std::cout << all_img_paths.at(idx * batch_size_det + i)
<< " The number of detected box: " << detect_num << std::endl;
item_start_idx = item_start_idx + bbox_num[i];
std::vector<int> compression_params;
compression_params.push_back(cv::IMWRITE_JPEG_QUALITY);
compression_params.push_back(95);
std::string output_path(output_dir);
if (output_dir.rfind(OS_PATH_SEP) != output_dir.size() - 1) {
output_path += OS_PATH_SEP;
}
std::string image_file_path = all_img_paths.at(idx * batch_size_det + i);
if (keypoint) {
int imsize = im_result.size();
for (int i = 0; i < imsize; i++) {
auto keypoint_start_time = std::chrono::steady_clock::now();
auto item = im_result[i];
cv::Mat crop_img;
std::vector<double> keypoint_times;
std::vector<int> rect = {
item.rect[0], item.rect[1], item.rect[2], item.rect[3]};
std::vector<float> center;
std::vector<float> scale;
if (item.class_id == 0) {
PaddleDetection::CropImg(im, crop_img, rect, center, scale);
center_bs.emplace_back(center);
scale_bs.emplace_back(scale);
imgs_kpts.emplace_back(crop_img);
kpts_imgs += 1;
}
auto keypoint_crop_time = std::chrono::steady_clock::now();
std::chrono::duration<float> midtimediff =
keypoint_crop_time - keypoint_start_time;
midtimecost += static_cast<double>(midtimediff.count() * 1000);
if (imgs_kpts.size() == RT_Config["batch_size_keypoint"].as<int>() ||
((i == imsize - 1) && !imgs_kpts.empty())) {
if (run_benchmark) {
keypoint->Predict(imgs_kpts,
center_bs,
scale_bs,
10,
10,
&result_kpts,
&keypoint_times);
} else {
keypoint->Predict(imgs_kpts,
center_bs,
scale_bs,
0,
1,
&result_kpts,
&keypoint_times);
}
imgs_kpts.clear();
center_bs.clear();
scale_bs.clear();
keypoint_t[0] += keypoint_times[0];
keypoint_t[1] += keypoint_times[1];
keypoint_t[2] += keypoint_times[2];
}
}
std::string kpts_savepath =
output_path + "keypoint_" +
image_file_path.substr(image_file_path.find_last_of('/') + 1);
cv::Mat kpts_vis_img = VisualizeKptsResult(
im, result_kpts, colormap_kpts, keypoint->get_threshold());
cv::imwrite(kpts_savepath, kpts_vis_img, compression_params);
printf("Visualized output saved as %s\n", kpts_savepath.c_str());
} else {
// Visualization result
cv::Mat vis_img = PaddleDetection::VisualizeResult(
im, im_result, labels, colormap, is_rbox);
std::string det_savepath =
output_path + "result_" +
image_file_path.substr(image_file_path.find_last_of('/') + 1);
cv::imwrite(det_savepath, vis_img, compression_params);
printf("Visualized output saved as %s\n", det_savepath.c_str());
}
}
det_t[0] += det_times[0];
det_t[1] += det_times[1];
det_t[2] += det_times[2];
}
PrintBenchmarkLog(det_t, all_img_paths.size());
if (keypoint) {
PrintKptsBenchmarkLog(keypoint_t, kpts_imgs);
PrintTotalTimeLog(
(det_t[0] + det_t[1] + det_t[2]) / all_img_paths.size(),
(keypoint_t[0] + keypoint_t[1] + keypoint_t[2]) / all_img_paths.size(),
midtimecost / all_img_paths.size());
}
}
int main(int argc, char** argv) {
std::cout << "Usage: " << argv[0] << " [config_path] [image_dir](option)\n";
if (argc < 2) {
std::cout << "Usage: ./main det_runtime_config.json" << std::endl;
return -1;
}
std::string config_path = argv[1];
std::string img_path = "";
if (argc >= 3) {
img_path = argv[2];
}
// Parsing command-line
PaddleDetection::load_jsonf(config_path, RT_Config);
if (RT_Config["model_dir_det"].as<std::string>().empty()) {
std::cout << "Please set [model_det_dir] in " << config_path << std::endl;
return -1;
}
if (RT_Config["image_file"].as<std::string>().empty() &&
RT_Config["image_dir"].as<std::string>().empty() && img_path.empty()) {
std::cout << "Please set [image_file] or [image_dir] in " << config_path
<< " Or use command: <" << argv[0] << " [image_dir]>"
<< std::endl;
return -1;
}
if (!img_path.empty()) {
std::cout << "Use image_dir in command line overide the path in config file"
<< std::endl;
RT_Config["image_dir"] = img_path;
RT_Config["image_file"] = "";
}
// Load model and create a object detector
PaddleDetection::ObjectDetector det(
RT_Config["model_dir_det"].as<std::string>(),
RT_Config["cpu_threads"].as<int>(),
RT_Config["batch_size_det"].as<int>());
PaddleDetection::KeyPointDetector* keypoint = nullptr;
if (!RT_Config["model_dir_keypoint"].as<std::string>().empty()) {
keypoint = new PaddleDetection::KeyPointDetector(
RT_Config["model_dir_keypoint"].as<std::string>(),
RT_Config["cpu_threads"].as<int>(),
RT_Config["batch_size_keypoint"].as<int>(),
RT_Config["use_dark_decode"].as<bool>());
RT_Config["batch_size_det"] = 1;
printf(
"batch_size_det is forced to 1 when the keypoint model is not "
"empty\n");
}
// Do inference on input image
if (!RT_Config["image_file"].as<std::string>().empty() ||
!RT_Config["image_dir"].as<std::string>().empty()) {
if (!PathExists(RT_Config["output_dir"].as<std::string>())) {
MkDirs(RT_Config["output_dir"].as<std::string>());
}
std::vector<std::string> all_img_paths;
std::vector<cv::String> cv_all_img_paths;
if (!RT_Config["image_file"].as<std::string>().empty()) {
all_img_paths.push_back(RT_Config["image_file"].as<std::string>());
if (RT_Config["batch_size_det"].as<int>() > 1) {
std::cout << "batch_size_det should be 1, when set `image_file`."
<< std::endl;
return -1;
}
} else {
cv::glob(RT_Config["image_dir"].as<std::string>(), cv_all_img_paths);
for (const auto& img_path : cv_all_img_paths) {
all_img_paths.push_back(img_path);
}
}
PredictImage(all_img_paths,
RT_Config["batch_size_det"].as<int>(),
RT_Config["threshold_det"].as<float>(),
RT_Config["run_benchmark"].as<bool>(),
&det,
keypoint,
RT_Config["output_dir"].as<std::string>());
}
delete keypoint;
keypoint = nullptr;
return 0;
}
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <sstream>
// for setprecision
#include <chrono>
#include <iomanip>
#include "include/object_detector.h"
namespace PaddleDetection {
// Load Model and create model predictor
void ObjectDetector::LoadModel(std::string model_file, int num_threads) {
MobileConfig config;
config.set_threads(num_threads);
config.set_model_from_file(model_file + "/model.nb");
config.set_power_mode(LITE_POWER_HIGH);
predictor_ = CreatePaddlePredictor<MobileConfig>(config);
}
// Visualize detection results
cv::Mat VisualizeResult(const cv::Mat& img,
const std::vector<PaddleDetection::ObjectResult>& results,
const std::vector<std::string>& labels,
const std::vector<int>& colormap,
const bool is_rbox = false) {
cv::Mat vis_img = img.clone();
for (int i = 0; i < results.size(); ++i) {
// Configure color and text size
std::ostringstream oss;
oss << std::setiosflags(std::ios::fixed) << std::setprecision(4);
oss << labels[results[i].class_id] << " ";
oss << results[i].confidence;
std::string text = oss.str();
int c1 = colormap[3 * results[i].class_id + 0];
int c2 = colormap[3 * results[i].class_id + 1];
int c3 = colormap[3 * results[i].class_id + 2];
cv::Scalar roi_color = cv::Scalar(c1, c2, c3);
int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL;
double font_scale = 0.5f;
float thickness = 0.5;
cv::Size text_size =
cv::getTextSize(text, font_face, font_scale, thickness, nullptr);
cv::Point origin;
if (is_rbox) {
// Draw object, text, and background
for (int k = 0; k < 4; k++) {
cv::Point pt1 = cv::Point(results[i].rect[(k * 2) % 8],
results[i].rect[(k * 2 + 1) % 8]);
cv::Point pt2 = cv::Point(results[i].rect[(k * 2 + 2) % 8],
results[i].rect[(k * 2 + 3) % 8]);
cv::line(vis_img, pt1, pt2, roi_color, 2);
}
} else {
int w = results[i].rect[2] - results[i].rect[0];
int h = results[i].rect[3] - results[i].rect[1];
cv::Rect roi = cv::Rect(results[i].rect[0], results[i].rect[1], w, h);
// Draw roi object, text, and background
cv::rectangle(vis_img, roi, roi_color, 2);
}
origin.x = results[i].rect[0];
origin.y = results[i].rect[1];
// Configure text background
cv::Rect text_back = cv::Rect(results[i].rect[0],
results[i].rect[1] - text_size.height,
text_size.width,
text_size.height);
// Draw text, and background
cv::rectangle(vis_img, text_back, roi_color, -1);
cv::putText(vis_img,
text,
origin,
font_face,
font_scale,
cv::Scalar(255, 255, 255),
thickness);
}
return vis_img;
}
void ObjectDetector::Preprocess(const cv::Mat& ori_im) {
// Clone the image : keep the original mat for postprocess
cv::Mat im = ori_im.clone();
cv::cvtColor(im, im, cv::COLOR_BGR2RGB);
preprocessor_.Run(&im, &inputs_);
}
void ObjectDetector::Postprocess(const std::vector<cv::Mat> mats,
std::vector<PaddleDetection::ObjectResult>* result,
std::vector<int> bbox_num,
bool is_rbox = false) {
result->clear();
int start_idx = 0;
for (int im_id = 0; im_id < mats.size(); im_id++) {
cv::Mat raw_mat = mats[im_id];
int rh = 1;
int rw = 1;
if (config_.arch_ == "Face") {
rh = raw_mat.rows;
rw = raw_mat.cols;
}
for (int j = start_idx; j < start_idx + bbox_num[im_id]; j++) {
if (is_rbox) {
// Class id
int class_id = static_cast<int>(round(output_data_[0 + j * 10]));
// Confidence score
float score = output_data_[1 + j * 10];
int x1 = (output_data_[2 + j * 10] * rw);
int y1 = (output_data_[3 + j * 10] * rh);
int x2 = (output_data_[4 + j * 10] * rw);
int y2 = (output_data_[5 + j * 10] * rh);
int x3 = (output_data_[6 + j * 10] * rw);
int y3 = (output_data_[7 + j * 10] * rh);
int x4 = (output_data_[8 + j * 10] * rw);
int y4 = (output_data_[9 + j * 10] * rh);
PaddleDetection::ObjectResult result_item;
result_item.rect = {x1, y1, x2, y2, x3, y3, x4, y4};
result_item.class_id = class_id;
result_item.confidence = score;
result->push_back(result_item);
} else {
// Class id
int class_id = static_cast<int>(round(output_data_[0 + j * 6]));
// Confidence score
float score = output_data_[1 + j * 6];
int xmin = (output_data_[2 + j * 6] * rw);
int ymin = (output_data_[3 + j * 6] * rh);
int xmax = (output_data_[4 + j * 6] * rw);
int ymax = (output_data_[5 + j * 6] * rh);
int wd = xmax - xmin;
int hd = ymax - ymin;
PaddleDetection::ObjectResult result_item;
result_item.rect = {xmin, ymin, xmax, ymax};
result_item.class_id = class_id;
result_item.confidence = score;
result->push_back(result_item);
}
}
start_idx += bbox_num[im_id];
}
}
void ObjectDetector::Predict(const std::vector<cv::Mat>& imgs,
const double threshold,
const int warmup,
const int repeats,
std::vector<PaddleDetection::ObjectResult>* result,
std::vector<int>* bbox_num,
std::vector<double>* times) {
auto preprocess_start = std::chrono::steady_clock::now();
int batch_size = imgs.size();
// in_data_batch
std::vector<float> in_data_all;
std::vector<float> im_shape_all(batch_size * 2);
std::vector<float> scale_factor_all(batch_size * 2);
// Preprocess image
for (int bs_idx = 0; bs_idx < batch_size; bs_idx++) {
cv::Mat im = imgs.at(bs_idx);
Preprocess(im);
im_shape_all[bs_idx * 2] = inputs_.im_shape_[0];
im_shape_all[bs_idx * 2 + 1] = inputs_.im_shape_[1];
scale_factor_all[bs_idx * 2] = inputs_.scale_factor_[0];
scale_factor_all[bs_idx * 2 + 1] = inputs_.scale_factor_[1];
// TODO: reduce cost time
in_data_all.insert(
in_data_all.end(), inputs_.im_data_.begin(), inputs_.im_data_.end());
}
auto preprocess_end = std::chrono::steady_clock::now();
std::vector<const float *> output_data_list_;
// Prepare input tensor
auto input_names = predictor_->GetInputNames();
for (const auto& tensor_name : input_names) {
auto in_tensor = predictor_->GetInputByName(tensor_name);
if (tensor_name == "image") {
int rh = inputs_.in_net_shape_[0];
int rw = inputs_.in_net_shape_[1];
in_tensor->Resize({batch_size, 3, rh, rw});
auto* inptr = in_tensor->mutable_data<float>();
std::copy_n(in_data_all.data(), in_data_all.size(), inptr);
} else if (tensor_name == "im_shape") {
in_tensor->Resize({batch_size, 2});
auto* inptr = in_tensor->mutable_data<float>();
std::copy_n(im_shape_all.data(), im_shape_all.size(), inptr);
} else if (tensor_name == "scale_factor") {
in_tensor->Resize({batch_size, 2});
auto* inptr = in_tensor->mutable_data<float>();
std::copy_n(scale_factor_all.data(), scale_factor_all.size(), inptr);
}
}
// Run predictor
// warmup
for (int i = 0; i < warmup; i++) {
predictor_->Run();
}
bool is_rbox = false;
auto inference_start = std::chrono::steady_clock::now();
for (int i = 0; i < repeats; i++) {
predictor_->Run();
}
auto inference_end = std::chrono::steady_clock::now();
auto postprocess_start = std::chrono::steady_clock::now();
// Get output tensor
output_data_list_.clear();
int num_class = 80;
int reg_max = 7;
auto output_names = predictor_->GetOutputNames();
// TODO: Unified model output.
if (config_.arch_ == "PicoDet") {
for (int i = 0; i < output_names.size(); i++) {
auto output_tensor = predictor_->GetTensor(output_names[i]);
const float* outptr = output_tensor->data<float>();
std::vector<int64_t> output_shape = output_tensor->shape();
if (i == 0) {
num_class = output_shape[2];
}
if (i == config_.fpn_stride_.size()) {
reg_max = output_shape[2] / 4 - 1;
}
output_data_list_.push_back(outptr);
}
} else {
auto output_tensor = predictor_->GetTensor(output_names[0]);
auto output_shape = output_tensor->shape();
auto out_bbox_num = predictor_->GetTensor(output_names[1]);
auto out_bbox_num_shape = out_bbox_num->shape();
// Calculate output length
int output_size = 1;
for (int j = 0; j < output_shape.size(); ++j) {
output_size *= output_shape[j];
}
is_rbox = output_shape[output_shape.size() - 1] % 10 == 0;
if (output_size < 6) {
std::cerr << "[WARNING] No object detected." << std::endl;
}
output_data_.resize(output_size);
std::copy_n(
output_tensor->mutable_data<float>(), output_size, output_data_.data());
int out_bbox_num_size = 1;
for (int j = 0; j < out_bbox_num_shape.size(); ++j) {
out_bbox_num_size *= out_bbox_num_shape[j];
}
out_bbox_num_data_.resize(out_bbox_num_size);
std::copy_n(out_bbox_num->mutable_data<int>(),
out_bbox_num_size,
out_bbox_num_data_.data());
}
// Postprocessing result
result->clear();
if (config_.arch_ == "PicoDet") {
PaddleDetection::PicoDetPostProcess(
result, output_data_list_, config_.fpn_stride_,
inputs_.im_shape_, inputs_.scale_factor_,
config_.nms_info_["score_threshold"].as<float>(),
config_.nms_info_["nms_threshold"].as<float>(), num_class, reg_max);
bbox_num->push_back(result->size());
} else {
Postprocess(imgs, result, out_bbox_num_data_, is_rbox);
bbox_num->clear();
for (int k = 0; k < out_bbox_num_data_.size(); k++) {
int tmp = out_bbox_num_data_[k];
bbox_num->push_back(tmp);
}
}
auto postprocess_end = std::chrono::steady_clock::now();
std::chrono::duration<float> preprocess_diff =
preprocess_end - preprocess_start;
times->push_back(double(preprocess_diff.count() * 1000));
std::chrono::duration<float> inference_diff = inference_end - inference_start;
times->push_back(double(inference_diff.count() / repeats * 1000));
std::chrono::duration<float> postprocess_diff =
postprocess_end - postprocess_start;
times->push_back(double(postprocess_diff.count() * 1000));
}
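// Pascal VOC style color map: the bits of each label id are spread across the
// R/G/B channels, one bit per round, from the most significant bit downward.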
std::vector<int> GenerateColorMap(int num_class) {
auto colormap = std::vector<int>(3 * num_class, 0);
for (int i = 0; i < num_class; ++i) {
int j = 0;
int lab = i;
while (lab) {
colormap[i * 3] |= (((lab >> 0) & 1) << (7 - j));
colormap[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j));
colormap[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j));
++j;
lab >>= 3;
}
}
return colormap;
}
} // namespace PaddleDetection
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on:
// https://github.com/RangiLyu/nanodet/blob/main/demo_mnn/nanodet_mnn.cpp
#include "include/picodet_postprocess.h"
namespace PaddleDetection {
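// Fast approximate exp(): a Schraudolph-style bit trick that writes the
// IEEE-754 bit pattern directly, scaling x by 1/ln(2) = 1.4426950409.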
float fast_exp(float x) {
union {
uint32_t i;
float f;
} v{};
v.i = (1 << 23) * (1.4426950409 * x + 126.93490512f);
return v.f;
}
template <typename _Tp>
int activation_function_softmax(const _Tp *src, _Tp *dst, int length) {
const _Tp alpha = *std::max_element(src, src + length);
_Tp denominator{0};
for (int i = 0; i < length; ++i) {
dst[i] = fast_exp(src[i] - alpha);
denominator += dst[i];
}
for (int i = 0; i < length; ++i) {
dst[i] /= denominator;
}
return 0;
}
// PicoDet decode
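// disPred2Bbox decodes one cell's distribution (DFL) regression into a box:
// each of the four sides has reg_max+1 logits; softmax followed by the
// expectation gives the distance from the cell center, in units of stride.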
PaddleDetection::ObjectResult
disPred2Bbox(const float *&dfl_det, int label, float score, int x, int y,
int stride, std::vector<float> im_shape, int reg_max) {
float ct_x = (x + 0.5) * stride;
float ct_y = (y + 0.5) * stride;
std::vector<float> dis_pred;
dis_pred.resize(4);
for (int i = 0; i < 4; i++) {
float dis = 0;
float *dis_after_sm = new float[reg_max + 1];
activation_function_softmax(dfl_det + i * (reg_max + 1), dis_after_sm,
reg_max + 1);
for (int j = 0; j < reg_max + 1; j++) {
dis += j * dis_after_sm[j];
}
dis *= stride;
dis_pred[i] = dis;
delete[] dis_after_sm;
}
int xmin = (int)(std::max)(ct_x - dis_pred[0], .0f);
int ymin = (int)(std::max)(ct_y - dis_pred[1], .0f);
int xmax = (int)(std::min)(ct_x + dis_pred[2], (float)im_shape[0]);
int ymax = (int)(std::min)(ct_y + dis_pred[3], (float)im_shape[1]);
PaddleDetection::ObjectResult result_item;
result_item.rect = {xmin, ymin, xmax, ymax};
result_item.class_id = label;
result_item.confidence = score;
return result_item;
}
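// Decode all FPN levels: pick the best class per feature-map cell, turn cells
// above score_threshold into boxes via disPred2Bbox, then apply per-class NMS
// and rescale from network-input coordinates back to the original image.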
void PicoDetPostProcess(std::vector<PaddleDetection::ObjectResult> *results,
std::vector<const float *> outs,
std::vector<int> fpn_stride,
std::vector<float> im_shape,
std::vector<float> scale_factor, float score_threshold,
float nms_threshold, int num_class, int reg_max) {
std::vector<std::vector<PaddleDetection::ObjectResult>> bbox_results;
bbox_results.resize(num_class);
int in_h = im_shape[0], in_w = im_shape[1];
for (int i = 0; i < fpn_stride.size(); ++i) {
int feature_h = ceil((float)in_h / fpn_stride[i]);
int feature_w = ceil((float)in_w / fpn_stride[i]);
for (int idx = 0; idx < feature_h * feature_w; idx++) {
const float *scores = outs[i] + (idx * num_class);
int row = idx / feature_w;
int col = idx % feature_w;
float score = 0;
int cur_label = 0;
for (int label = 0; label < num_class; label++) {
if (scores[label] > score) {
score = scores[label];
cur_label = label;
}
}
if (score > score_threshold) {
const float *bbox_pred =
outs[i + fpn_stride.size()] + (idx * 4 * (reg_max + 1));
bbox_results[cur_label].push_back(
disPred2Bbox(bbox_pred, cur_label, score, col, row, fpn_stride[i],
im_shape, reg_max));
}
}
}
for (int i = 0; i < (int)bbox_results.size(); i++) {
PaddleDetection::nms(bbox_results[i], nms_threshold);
for (auto box : bbox_results[i]) {
box.rect[0] = box.rect[0] / scale_factor[1];
box.rect[2] = box.rect[2] / scale_factor[1];
box.rect[1] = box.rect[1] / scale_factor[0];
box.rect[3] = box.rect[3] / scale_factor[0];
results->push_back(box);
}
}
}
} // namespace PaddleDetection
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include <thread>
#include <vector>
#include "include/preprocess_op.h"
namespace PaddleDetection {
void InitInfo::Run(cv::Mat* im, ImageBlob* data) {
data->im_shape_ = {static_cast<float>(im->rows),
static_cast<float>(im->cols)};
data->scale_factor_ = {1., 1.};
data->in_net_shape_ = {static_cast<float>(im->rows),
static_cast<float>(im->cols)};
}
void NormalizeImage::Run(cv::Mat* im, ImageBlob* data) {
double e = 1.0;
if (is_scale_) {
e *= 1./255.0;
}
(*im).convertTo(*im, CV_32FC3, e);
for (int h = 0; h < im->rows; h++) {
for (int w = 0; w < im->cols; w++) {
im->at<cv::Vec3f>(h, w)[0] =
(im->at<cv::Vec3f>(h, w)[0] - mean_[0]) / scale_[0];
im->at<cv::Vec3f>(h, w)[1] =
(im->at<cv::Vec3f>(h, w)[1] - mean_[1]) / scale_[1];
im->at<cv::Vec3f>(h, w)[2] =
(im->at<cv::Vec3f>(h, w)[2] - mean_[2]) / scale_[2];
}
}
}
void Permute::Run(cv::Mat* im, ImageBlob* data) {
(*im).convertTo(*im, CV_32FC3);
int rh = im->rows;
int rw = im->cols;
int rc = im->channels();
(data->im_data_).resize(rc * rh * rw);
float* base = (data->im_data_).data();
for (int i = 0; i < rc; ++i) {
cv::extractChannel(*im, cv::Mat(rh, rw, CV_32FC1, base + i * rh * rw), i);
}
}
void Resize::Run(cv::Mat* im, ImageBlob* data) {
auto resize_scale = GenerateScale(*im);
data->in_net_shape_ = {static_cast<float>(im->cols * resize_scale.first),
static_cast<float>(im->rows * resize_scale.second)};
cv::resize(
*im, *im, cv::Size(), resize_scale.first, resize_scale.second, interp_);
data->im_shape_ = {
static_cast<float>(im->rows), static_cast<float>(im->cols),
};
data->scale_factor_ = {
resize_scale.second, resize_scale.first,
};
}
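// Compute (x, y) resize ratios. With keep_ratio_, a single uniform ratio is
// chosen so the short side reaches its target without the long side exceeding
// its own target.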
std::pair<float, float> Resize::GenerateScale(const cv::Mat& im) {
std::pair<float, float> resize_scale;
int origin_w = im.cols;
int origin_h = im.rows;
if (keep_ratio_) {
int im_size_max = std::max(origin_w, origin_h);
int im_size_min = std::min(origin_w, origin_h);
int target_size_max =
*std::max_element(target_size_.begin(), target_size_.end());
int target_size_min =
*std::min_element(target_size_.begin(), target_size_.end());
float scale_min =
static_cast<float>(target_size_min) / static_cast<float>(im_size_min);
float scale_max =
static_cast<float>(target_size_max) / static_cast<float>(im_size_max);
float scale_ratio = std::min(scale_min, scale_max);
resize_scale = {scale_ratio, scale_ratio};
} else {
resize_scale.first =
static_cast<float>(target_size_[1]) / static_cast<float>(origin_w);
resize_scale.second =
static_cast<float>(target_size_[0]) / static_cast<float>(origin_h);
}
return resize_scale;
}
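// Zero-pad the bottom/right edges so height and width become multiples of
// stride_.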
void PadStride::Run(cv::Mat* im, ImageBlob* data) {
if (stride_ <= 0) {
return;
}
int rc = im->channels();
int rh = im->rows;
int rw = im->cols;
int nh = (rh / stride_) * stride_ + (rh % stride_ != 0) * stride_;
int nw = (rw / stride_) * stride_ + (rw % stride_ != 0) * stride_;
cv::copyMakeBorder(
*im, *im, 0, nh - rh, 0, nw - rw, cv::BORDER_CONSTANT, cv::Scalar(0));
data->in_net_shape_ = {
static_cast<float>(im->rows), static_cast<float>(im->cols),
};
}
void TopDownEvalAffine::Run(cv::Mat* im, ImageBlob* data) {
cv::resize(*im, *im, cv::Size(trainsize_[0], trainsize_[1]), 0, 0, interp_);
// todo: Simd::ResizeBilinear();
data->in_net_shape_ = {
static_cast<float>(trainsize_[1]), static_cast<float>(trainsize_[0]),
};
}
// Preprocessor op running order
const std::vector<std::string> Preprocessor::RUN_ORDER = {"InitInfo",
"TopDownEvalAffine",
"Resize",
"NormalizeImage",
"PadStride",
"Permute"};
void Preprocessor::Run(cv::Mat* im, ImageBlob* data) {
for (const auto& name : RUN_ORDER) {
if (ops_.find(name) != ops_.end()) {
ops_[name]->Run(im, data);
}
}
}
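// Crop a detected box from img, expand it to a 3:4 width:height aspect ratio
// plus an expandratio margin, and return the crop together with its center
// and scale for the top-down keypoint model.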
void CropImg(cv::Mat& img,
cv::Mat& crop_img,
std::vector<int>& area,
std::vector<float>& center,
std::vector<float>& scale,
float expandratio) {
int crop_x1 = std::max(0, area[0]);
int crop_y1 = std::max(0, area[1]);
int crop_x2 = std::min(img.cols - 1, area[2]);
int crop_y2 = std::min(img.rows - 1, area[3]);
int center_x = (crop_x1 + crop_x2) / 2.;
int center_y = (crop_y1 + crop_y2) / 2.;
int half_h = (crop_y2 - crop_y1) / 2.;
int half_w = (crop_x2 - crop_x1) / 2.;
if (half_h * 3 > half_w * 4) {
half_w = static_cast<int>(half_h * 0.75);
} else {
half_h = static_cast<int>(half_w * 4 / 3);
}
crop_x1 =
std::max(0, center_x - static_cast<int>(half_w * (1 + expandratio)));
crop_y1 =
std::max(0, center_y - static_cast<int>(half_h * (1 + expandratio)));
crop_x2 = std::min(img.cols - 1,
static_cast<int>(center_x + half_w * (1 + expandratio)));
crop_y2 = std::min(img.rows - 1,
static_cast<int>(center_y + half_h * (1 + expandratio)));
crop_img =
img(cv::Range(crop_y1, crop_y2 + 1), cv::Range(crop_x1, crop_x2 + 1));
center.clear();
center.emplace_back((crop_x1 + crop_x2) / 2);
center.emplace_back((crop_y1 + crop_y2) / 2);
scale.clear();
scale.emplace_back((crop_x2 - crop_x1));
scale.emplace_back((crop_y2 - crop_y1));
}
} // namespace PaddleDetection
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/utils.h"
namespace PaddleDetection {
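// Greedy hard-NMS: sort boxes by confidence, then drop any box whose IoU with
// an already-kept box is at least nms_threshold.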
void nms(std::vector<ObjectResult> &input_boxes, float nms_threshold) {
std::sort(input_boxes.begin(),
input_boxes.end(),
[](ObjectResult a, ObjectResult b) { return a.confidence > b.confidence; });
std::vector<float> vArea(input_boxes.size());
for (int i = 0; i < int(input_boxes.size()); ++i) {
vArea[i] = (input_boxes.at(i).rect[2] - input_boxes.at(i).rect[0] + 1)
* (input_boxes.at(i).rect[3] - input_boxes.at(i).rect[1] + 1);
}
for (int i = 0; i < int(input_boxes.size()); ++i) {
for (int j = i + 1; j < int(input_boxes.size());) {
float xx1 = (std::max)(input_boxes[i].rect[0], input_boxes[j].rect[0]);
float yy1 = (std::max)(input_boxes[i].rect[1], input_boxes[j].rect[1]);
float xx2 = (std::min)(input_boxes[i].rect[2], input_boxes[j].rect[2]);
float yy2 = (std::min)(input_boxes[i].rect[3], input_boxes[j].rect[3]);
float w = (std::max)(float(0), xx2 - xx1 + 1);
float h = (std::max)(float(0), yy2 - yy1 + 1);
float inter = w * h;
float ovr = inter / (vArea[i] + vArea[j] - inter);
if (ovr >= nms_threshold) {
input_boxes.erase(input_boxes.begin() + j);
vArea.erase(vArea.begin() + j);
}
else {
j++;
}
}
}
}
} // namespace PaddleDetection
# Python Inference Deployment

In PaddlePaddle, the inference engine and the training engine are optimized differently under the hood. The inference engine uses AnalysisPredictor, which is optimized specifically for inference: it is the Python interface to the [C++ inference library](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/inference_deployment/inference/native_infer.html), applies a series of graph optimizations to the model, and removes unnecessary memory copies. For users with strict performance requirements when deploying trained models, we provide inference scripts that are independent of PaddleDetection and easy to integrate.

Python deployment consists of two steps:
- Export the inference model
- Run inference with Python

## 1. Export the inference model

A PaddleDetection training checkpoint contains both the forward network and optimizer-related parameters; deployment only needs the forward parameters. See [Export model](../EXPORT_MODEL.md) for details. For example:
```bash
# Export a YOLOv3 detection model
python tools/export_model.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml --output_dir=./inference_model \
 -o weights=https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams
```
The export directory contains four files: `infer_cfg.yml`, `model.pdiparams`, `model.pdiparams.info`, and `model.pdmodel`.

## 2. Inference with Python

### 2.1 General detection

Run the following command in a terminal to predict:
```bash
python deploy/python/infer.py --model_dir=./output_inference/yolov3_darknet53_270e_coco --image_file=./demo/000000014439.jpg --device=GPU
```

### Parameter description

| Parameter | Required | Description |
|-------|-------|---------------------------------------------------------------------------------------------|
| --model_dir | Yes | Path of the exported model described above |
| --image_file | Option | Image to predict |
| --image_dir | Option | Directory of images to predict |
| --video_file | Option | Video to predict |
| --camera_id | Option | ID of the camera used for prediction, default -1 (camera disabled; may be set to 0 through number of cameras - 1). Press `q` in the visualization window to quit and write the prediction result to output/output.mp4 |
| --device | Option | Runtime device, one of `CPU/GPU/XPU`, default `CPU` |
| --run_mode | Option | Inference mode when using GPU, default paddle; options: paddle/trt_fp32/trt_fp16/trt_int8 |
| --batch_size | Option | Batch size for prediction, effective when `image_dir` is set, default 1 |
| --threshold | Option | Score threshold for predictions, default 0.5 |
| --output_dir | Option | Root directory for saving visualized results, default output/ |
| --run_benchmark | Option | Whether to run a benchmark; requires `--image_file` or `--image_dir`, default False |
| --enable_mkldnn | Option | Whether to enable MKLDNN acceleration for CPU prediction, default False |
| --cpu_threads | Option | Number of CPU threads, default 1 |
| --trt_calib_mode | Option | Whether TensorRT runs in calibration mode, default False. Set to True when using TensorRT int8; set to False for models quantized with PaddleSlim |
| --save_images | Option | Whether to save visualized results |
| --save_results | Option | Whether to save per-image prediction results as JSON in the output directory |

Notes:
- Parameter precedence: `camera_id` > `video_file` > `image_dir` > `image_file`
- run_mode: paddle runs AnalysisPredictor at float32 precision; the other values run AnalysisPredictor with TensorRT at the corresponding precision.
- If the installed PaddlePaddle does not support TensorRT-based inference, you need to build it yourself; see the [inference library build guide](https://paddleinference.paddlepaddle.org.cn/user_guides/source_compile.html)
- If --run_benchmark is set to True, install the dependencies with `pip install pynvml psutil GPUtil`
- To evaluate an exported model on the COCO dataset, add `--save_results` and `--use_coco_category` at inference time to save the JSON files required for COCO evaluation
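
As a concrete example, a CPU benchmark run combining the options above might look like the following; the model directory and image folder are placeholder paths for illustration:
```bash
# Placeholder paths; substitute your own exported model and images.
python deploy/python/infer.py \
    --model_dir=./output_inference/yolov3_darknet53_270e_coco \
    --image_dir=./demo \
    --device=CPU \
    --enable_mkldnn=True \
    --cpu_threads=4 \
    --run_benchmark=True
```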
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import logging
import paddle
import paddle.inference as paddle_infer
from pathlib import Path
CUR_DIR = os.path.dirname(os.path.abspath(__file__))
LOG_PATH_ROOT = f"{CUR_DIR}/../../output"
class PaddleInferBenchmark(object):
def __init__(self,
config,
model_info: dict={},
data_info: dict={},
perf_info: dict={},
resource_info: dict={},
**kwargs):
"""
Construct PaddleInferBenchmark Class to format logs.
args:
config(paddle.inference.Config): paddle inference config
model_info(dict): basic model info
{'model_name': 'resnet50'
'precision': 'fp32'}
data_info(dict): input data info
{'batch_size': 1
'shape': '3,224,224'
'data_num': 1000}
perf_info(dict): performance result
{'preprocess_time_s': 1.0
'inference_time_s': 2.0
'postprocess_time_s': 1.0
'total_time_s': 4.0}
resource_info(dict):
cpu and gpu resources
{'cpu_rss': 100
'gpu_rss': 100
'gpu_util': 60}
"""
# PaddleInferBenchmark Log Version
self.log_version = "1.0.3"
# Paddle Version
self.paddle_version = paddle.__version__
self.paddle_commit = paddle.__git_commit__
paddle_infer_info = paddle_infer.get_version()
self.paddle_branch = paddle_infer_info.strip().split(': ')[-1]
# model info
self.model_info = model_info
# data info
self.data_info = data_info
# perf info
self.perf_info = perf_info
try:
# required value
self.model_name = model_info['model_name']
self.precision = model_info['precision']
self.batch_size = data_info['batch_size']
self.shape = data_info['shape']
self.data_num = data_info['data_num']
self.inference_time_s = round(perf_info['inference_time_s'], 4)
except KeyError:
self.print_help()
raise ValueError(
"Wrong or missing argument, please check the input arguments and their types")
self.preprocess_time_s = perf_info.get('preprocess_time_s', 0)
self.postprocess_time_s = perf_info.get('postprocess_time_s', 0)
self.with_tracker = True if 'tracking_time_s' in perf_info else False
self.tracking_time_s = perf_info.get('tracking_time_s', 0)
self.total_time_s = perf_info.get('total_time_s', 0)
self.inference_time_s_90 = perf_info.get("inference_time_s_90", "")
self.inference_time_s_99 = perf_info.get("inference_time_s_99", "")
self.succ_rate = perf_info.get("succ_rate", "")
self.qps = perf_info.get("qps", "")
# conf info
self.config_status = self.parse_config(config)
# mem info
if isinstance(resource_info, dict):
self.cpu_rss_mb = int(resource_info.get('cpu_rss_mb', 0))
self.cpu_vms_mb = int(resource_info.get('cpu_vms_mb', 0))
self.cpu_shared_mb = int(resource_info.get('cpu_shared_mb', 0))
self.cpu_dirty_mb = int(resource_info.get('cpu_dirty_mb', 0))
self.cpu_util = round(resource_info.get('cpu_util', 0), 2)
self.gpu_rss_mb = int(resource_info.get('gpu_rss_mb', 0))
self.gpu_util = round(resource_info.get('gpu_util', 0), 2)
self.gpu_mem_util = round(resource_info.get('gpu_mem_util', 0), 2)
else:
self.cpu_rss_mb = 0
self.cpu_vms_mb = 0
self.cpu_shared_mb = 0
self.cpu_dirty_mb = 0
self.cpu_util = 0
self.gpu_rss_mb = 0
self.gpu_util = 0
self.gpu_mem_util = 0
# init benchmark logger
self.benchmark_logger()
def benchmark_logger(self):
"""
benchmark logger
"""
# remove other logging handler
for handler in logging.root.handlers[:]:
logging.root.removeHandler(handler)
# Init logger
FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
log_output = f"{LOG_PATH_ROOT}/{self.model_name}.log"
Path(f"{LOG_PATH_ROOT}").mkdir(parents=True, exist_ok=True)
logging.basicConfig(
level=logging.INFO,
format=FORMAT,
handlers=[
logging.FileHandler(
filename=log_output, mode='w'),
logging.StreamHandler(),
])
self.logger = logging.getLogger(__name__)
self.logger.info(
f"Paddle Inference benchmark log will be saved to {log_output}")
def parse_config(self, config) -> dict:
"""
parse paddle predictor config
args:
config(paddle.inference.Config): paddle inference config
return:
config_status(dict): dict style config info
"""
# Initialize up front so the dict branch below does not hit a NameError.
config_status = {}
if isinstance(config, paddle_infer.Config):
config_status['runtime_device'] = "gpu" if config.use_gpu(
) else "cpu"
config_status['ir_optim'] = config.ir_optim()
config_status['enable_tensorrt'] = config.tensorrt_engine_enabled()
config_status['precision'] = self.precision
config_status['enable_mkldnn'] = config.mkldnn_enabled()
config_status[
'cpu_math_library_num_threads'] = config.cpu_math_library_num_threads(
)
elif isinstance(config, dict):
config_status['runtime_device'] = config.get('runtime_device', "")
config_status['ir_optim'] = config.get('ir_optim', "")
config_status['enable_tensorrt'] = config.get('enable_tensorrt', "")
config_status['precision'] = config.get('precision', "")
config_status['enable_mkldnn'] = config.get('enable_mkldnn', "")
config_status['cpu_math_library_num_threads'] = config.get(
'cpu_math_library_num_threads', "")
else:
self.print_help()
raise ValueError(
"Set argument config wrong, please check input argument and its type"
)
return config_status
def report(self, identifier=None):
"""
print log report
args:
identifier(string): identify log
"""
if identifier:
identifier = f"[{identifier}]"
else:
identifier = ""
self.logger.info("\n")
self.logger.info(
"---------------------- Paddle info ----------------------")
self.logger.info(f"{identifier} paddle_version: {self.paddle_version}")
self.logger.info(f"{identifier} paddle_commit: {self.paddle_commit}")
self.logger.info(f"{identifier} paddle_branch: {self.paddle_branch}")
self.logger.info(f"{identifier} log_api_version: {self.log_version}")
self.logger.info(
"----------------------- Conf info -----------------------")
self.logger.info(
f"{identifier} runtime_device: {self.config_status['runtime_device']}"
)
self.logger.info(
f"{identifier} ir_optim: {self.config_status['ir_optim']}")
self.logger.info(f"{identifier} enable_memory_optim: {True}")
self.logger.info(
f"{identifier} enable_tensorrt: {self.config_status['enable_tensorrt']}"
)
self.logger.info(
f"{identifier} enable_mkldnn: {self.config_status['enable_mkldnn']}")
self.logger.info(
f"{identifier} cpu_math_library_num_threads: {self.config_status['cpu_math_library_num_threads']}"
)
self.logger.info(
"----------------------- Model info ----------------------")
self.logger.info(f"{identifier} model_name: {self.model_name}")
self.logger.info(f"{identifier} precision: {self.precision}")
self.logger.info(
"----------------------- Data info -----------------------")
self.logger.info(f"{identifier} batch_size: {self.batch_size}")
self.logger.info(f"{identifier} input_shape: {self.shape}")
self.logger.info(f"{identifier} data_num: {self.data_num}")
self.logger.info(
"----------------------- Perf info -----------------------")
self.logger.info(
f"{identifier} cpu_rss(MB): {self.cpu_rss_mb}, cpu_vms: {self.cpu_vms_mb}, cpu_shared_mb: {self.cpu_shared_mb}, cpu_dirty_mb: {self.cpu_dirty_mb}, cpu_util: {self.cpu_util}%"
)
self.logger.info(
f"{identifier} gpu_rss(MB): {self.gpu_rss_mb}, gpu_util: {self.gpu_util}%, gpu_mem_util: {self.gpu_mem_util}%"
)
self.logger.info(
f"{identifier} total time spent(s): {self.total_time_s}")
if self.with_tracker:
self.logger.info(
f"{identifier} preprocess_time(ms): {round(self.preprocess_time_s*1000, 1)}, "
f"inference_time(ms): {round(self.inference_time_s*1000, 1)}, "
f"postprocess_time(ms): {round(self.postprocess_time_s*1000, 1)}, "
f"tracking_time(ms): {round(self.tracking_time_s*1000, 1)}")
else:
self.logger.info(
f"{identifier} preprocess_time(ms): {round(self.preprocess_time_s*1000, 1)}, "
f"inference_time(ms): {round(self.inference_time_s*1000, 1)}, "
f"postprocess_time(ms): {round(self.postprocess_time_s*1000, 1)}"
)
if self.inference_time_s_90:
self.logger.info(
f"{identifier} 90%_cost: {self.inference_time_s_90}, 99%_cost: {self.inference_time_s_99}, succ_rate: {self.succ_rate}"
)
if self.qps:
self.logger.info(f"{identifier} QPS: {self.qps}")
def print_help(self):
"""
print function help
"""
print("""Usage:
==== Print inference benchmark logs. ====
config = paddle.inference.Config()
model_info = {'model_name': 'resnet50'
'precision': 'fp32'}
data_info = {'batch_size': 1
'shape': '3,224,224'
'data_num': 1000}
perf_info = {'preprocess_time_s': 1.0
'inference_time_s': 2.0
'postprocess_time_s': 1.0
'total_time_s': 4.0}
resource_info = {'cpu_rss_mb': 100
'gpu_rss_mb': 100
'gpu_util': 60}
log = PaddleInferBenchmark(config, model_info, data_info, perf_info, resource_info)
log('Test')
""")
def __call__(self, identifier=None):
"""
__call__
args:
identifier(string): identify log
"""
self.report(identifier)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import yaml
import glob
import json
from pathlib import Path
from functools import reduce
import cv2
import numpy as np
import math
import paddle
from paddle.inference import Config
from paddle.inference import create_predictor
import sys
# add deploy path of PaddleDetection to sys.path
parent_path = os.path.abspath(os.path.join(__file__, '..'))
sys.path.insert(0, parent_path)
from benchmark_utils import PaddleInferBenchmark
from preprocess import preprocess, Resize, NormalizeImage, Permute, Pad, decode_image
from visualize import visualize_box_mask
from utils import argsparser, Timer, get_current_memory_mb, multiclass_nms, coco_clsid2catid
# Global dictionary
SUPPORT_MODELS = {
'YOLO', 'PPYOLOE', 'YOLOX', 'YOLOF', 'YOLOv5', 'RTMDet', 'YOLOv6', 'YOLOv7', 'YOLOv8', 'DETR'
}
def bench_log(detector, img_list, model_info, batch_size=1, name=None):
mems = {
'cpu_rss_mb': detector.cpu_mem / len(img_list),
'gpu_rss_mb': detector.gpu_mem / len(img_list),
'gpu_util': detector.gpu_util * 100 / len(img_list)
}
perf_info = detector.det_times.report(average=True)
data_info = {
'batch_size': batch_size,
'shape': "dynamic_shape",
'data_num': perf_info['img_num']
}
log = PaddleInferBenchmark(detector.config, model_info, data_info,
perf_info, mems)
log(name)
class Detector(object):
"""
Args:
pred_config (object): config of model, defined by `Config(model_dir)`
model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml
device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU
run_mode (str): mode of running (paddle/trt_fp32/trt_fp16)
batch_size (int): batch size used in inference
trt_min_shape (int): min shape for dynamic shape in trt
trt_max_shape (int): max shape for dynamic shape in trt
trt_opt_shape (int): opt shape for dynamic shape in trt
trt_calib_mode (bool): If the model is produced by TRT offline quantitative
calibration, trt_calib_mode needs to be set to True
cpu_threads (int): cpu threads
enable_mkldnn (bool): whether to open MKLDNN
enable_mkldnn_bfloat16 (bool): whether to turn on mkldnn bfloat16
output_dir (str): The path of output
threshold (float): The threshold of score for visualization
delete_shuffle_pass (bool): whether to remove shuffle_channel_detect_pass in TensorRT.
Used by action model.
"""
def __init__(self,
model_dir,
device='CPU',
run_mode='paddle',
batch_size=1,
trt_min_shape=1,
trt_max_shape=1280,
trt_opt_shape=640,
trt_calib_mode=False,
cpu_threads=1,
enable_mkldnn=False,
enable_mkldnn_bfloat16=False,
output_dir='output',
threshold=0.5,
delete_shuffle_pass=False):
self.pred_config = self.set_config(model_dir)
self.predictor, self.config = load_predictor(
model_dir,
self.pred_config.arch,
run_mode=run_mode,
batch_size=batch_size,
min_subgraph_size=self.pred_config.min_subgraph_size,
device=device,
use_dynamic_shape=self.pred_config.use_dynamic_shape,
trt_min_shape=trt_min_shape,
trt_max_shape=trt_max_shape,
trt_opt_shape=trt_opt_shape,
trt_calib_mode=trt_calib_mode,
cpu_threads=cpu_threads,
enable_mkldnn=enable_mkldnn,
enable_mkldnn_bfloat16=enable_mkldnn_bfloat16,
delete_shuffle_pass=delete_shuffle_pass)
self.det_times = Timer()
self.cpu_mem, self.gpu_mem, self.gpu_util = 0, 0, 0
self.batch_size = batch_size
self.output_dir = output_dir
self.threshold = threshold
def set_config(self, model_dir):
return PredictConfig(model_dir)
def preprocess(self, image_list):
preprocess_ops = []
for op_info in self.pred_config.preprocess_infos:
new_op_info = op_info.copy()
op_type = new_op_info.pop('type')
preprocess_ops.append(eval(op_type)(**new_op_info))
input_im_lst = []
input_im_info_lst = []
for im_path in image_list:
im, im_info = preprocess(im_path, preprocess_ops)
input_im_lst.append(im)
input_im_info_lst.append(im_info)
inputs = create_inputs(input_im_lst, input_im_info_lst)
input_names = self.predictor.get_input_names()
for i in range(len(input_names)):
input_tensor = self.predictor.get_input_handle(input_names[i])
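            # some exported models name the image input 'x' rather than 'image'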
if input_names[i] == 'x':
input_tensor.copy_from_cpu(inputs['image'])
else:
input_tensor.copy_from_cpu(inputs[input_names[i]])
return inputs
def postprocess(self, inputs, result):
# postprocess output of predictor
np_boxes_num = result['boxes_num']
assert isinstance(np_boxes_num, np.ndarray), \
'`np_boxes_num` should be a `numpy.ndarray`'
result = {k: v for k, v in result.items() if v is not None}
return result
def filter_box(self, result, threshold):
np_boxes_num = result['boxes_num']
boxes = result['boxes']
start_idx = 0
filter_boxes = []
filter_num = []
for i in range(len(np_boxes_num)):
boxes_num = np_boxes_num[i]
boxes_i = boxes[start_idx:start_idx + boxes_num, :]
idx = boxes_i[:, 1] > threshold
filter_boxes_i = boxes_i[idx, :]
filter_boxes.append(filter_boxes_i)
filter_num.append(filter_boxes_i.shape[0])
start_idx += boxes_num
boxes = np.concatenate(filter_boxes)
filter_num = np.array(filter_num)
filter_res = {'boxes': boxes, 'boxes_num': filter_num}
return filter_res
def predict(self, repeats=1, run_benchmark=False):
'''
Args:
            repeats (int): number of times to repeat prediction
        Returns:
            result (dict): include 'boxes': np.ndarray: shape: [N, 6], N: number of boxes,
                           matrix element: [class, score, x_min, y_min, x_max, y_max]
                           MaskRCNN's result includes 'masks': np.ndarray:
                           shape: [N, im_h, im_w]
'''
# model prediction
np_boxes_num, np_boxes, np_masks = np.array([0]), None, None
if run_benchmark:
for i in range(repeats):
self.predictor.run()
paddle.device.cuda.synchronize()
result = dict(
boxes=np_boxes, masks=np_masks, boxes_num=np_boxes_num)
return result
for i in range(repeats):
self.predictor.run()
output_names = self.predictor.get_output_names()
boxes_tensor = self.predictor.get_output_handle(output_names[0])
np_boxes = boxes_tensor.copy_to_cpu()
if len(output_names) == 1:
                # some exported models do not expose the 'bbox_num' tensor
np_boxes_num = np.array([len(np_boxes)])
else:
boxes_num = self.predictor.get_output_handle(output_names[1])
np_boxes_num = boxes_num.copy_to_cpu()
if self.pred_config.mask:
masks_tensor = self.predictor.get_output_handle(output_names[2])
np_masks = masks_tensor.copy_to_cpu()
result = dict(boxes=np_boxes, masks=np_masks, boxes_num=np_boxes_num)
return result
def merge_batch_result(self, batch_result):
if len(batch_result) == 1:
return batch_result[0]
res_key = batch_result[0].keys()
results = {k: [] for k in res_key}
for res in batch_result:
for k, v in res.items():
results[k].append(v)
for k, v in results.items():
if k not in ['masks', 'segm']:
results[k] = np.concatenate(v)
return results
def get_timer(self):
return self.det_times
def predict_image_slice(self,
img_list,
slice_size=[640, 640],
overlap_ratio=[0.25, 0.25],
combine_method='nms',
match_threshold=0.6,
match_metric='ios',
run_benchmark=False,
repeats=1,
visual=True,
save_results=False):
        # slice inference only supports batch_size=1
results = []
try:
import sahi
from sahi.slicing import slice_image
except Exception as e:
print(
                'sahi not found, please install sahi. '
                'For example: `pip install sahi`; see https://github.com/obss/sahi.'
)
raise e
num_classes = len(self.pred_config.labels)
for i in range(len(img_list)):
ori_image = img_list[i]
slice_image_result = sahi.slicing.slice_image(
image=ori_image,
slice_height=slice_size[0],
slice_width=slice_size[1],
overlap_height_ratio=overlap_ratio[0],
overlap_width_ratio=overlap_ratio[1])
sub_img_num = len(slice_image_result)
merged_bboxs = []
            print('slice to {} sub_samples.'.format(sub_img_num))
batch_image_list = [
slice_image_result.images[_ind] for _ind in range(sub_img_num)
]
if run_benchmark:
# preprocess
inputs = self.preprocess(batch_image_list) # warmup
self.det_times.preprocess_time_s.start()
inputs = self.preprocess(batch_image_list)
self.det_times.preprocess_time_s.end()
# model prediction
result = self.predict(repeats=50, run_benchmark=True) # warmup
self.det_times.inference_time_s.start()
result = self.predict(repeats=repeats, run_benchmark=True)
self.det_times.inference_time_s.end(repeats=repeats)
# postprocess
result_warmup = self.postprocess(inputs, result) # warmup
self.det_times.postprocess_time_s.start()
result = self.postprocess(inputs, result)
self.det_times.postprocess_time_s.end()
self.det_times.img_num += 1
cm, gm, gu = get_current_memory_mb()
self.cpu_mem += cm
self.gpu_mem += gm
self.gpu_util += gu
else:
# preprocess
self.det_times.preprocess_time_s.start()
inputs = self.preprocess(batch_image_list)
self.det_times.preprocess_time_s.end()
# model prediction
self.det_times.inference_time_s.start()
result = self.predict()
self.det_times.inference_time_s.end()
# postprocess
self.det_times.postprocess_time_s.start()
result = self.postprocess(inputs, result)
self.det_times.postprocess_time_s.end()
self.det_times.img_num += 1
st, ed = 0, result['boxes_num'][0] # start_index, end_index
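            # shift each slice's boxes back into original-image coordinates
            # using the slice's top-left starting pixel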
for _ind in range(sub_img_num):
boxes_num = result['boxes_num'][_ind]
ed = st + boxes_num
shift_amount = slice_image_result.starting_pixels[_ind]
result['boxes'][st:ed][:, 2:4] = result['boxes'][
st:ed][:, 2:4] + shift_amount
result['boxes'][st:ed][:, 4:6] = result['boxes'][
st:ed][:, 4:6] + shift_amount
merged_bboxs.append(result['boxes'][st:ed])
st = ed
merged_results = {'boxes': []}
if combine_method == 'nms':
final_boxes = multiclass_nms(
np.concatenate(merged_bboxs), num_classes, match_threshold,
match_metric)
merged_results['boxes'] = np.concatenate(final_boxes)
elif combine_method == 'concat':
merged_results['boxes'] = np.concatenate(merged_bboxs)
else:
raise ValueError(
"Now only support 'nms' or 'concat' to fuse detection results."
)
merged_results['boxes_num'] = np.array(
[len(merged_results['boxes'])], dtype=np.int32)
if visual:
visualize(
[ori_image], # should be list
merged_results,
self.pred_config.labels,
output_dir=self.output_dir,
threshold=self.threshold)
results.append(merged_results)
print('Test iter {}'.format(i))
results = self.merge_batch_result(results)
if save_results:
Path(self.output_dir).mkdir(exist_ok=True)
self.save_coco_results(
img_list, results, use_coco_category=FLAGS.use_coco_category)
return results
def predict_image(self,
image_list,
run_benchmark=False,
repeats=1,
visual=True,
save_results=False):
batch_loop_cnt = math.ceil(float(len(image_list)) / self.batch_size)
results = []
for i in range(batch_loop_cnt):
start_index = i * self.batch_size
end_index = min((i + 1) * self.batch_size, len(image_list))
batch_image_list = image_list[start_index:end_index]
if run_benchmark:
# preprocess
inputs = self.preprocess(batch_image_list) # warmup
self.det_times.preprocess_time_s.start()
inputs = self.preprocess(batch_image_list)
self.det_times.preprocess_time_s.end()
# model prediction
result = self.predict(repeats=50, run_benchmark=True) # warmup
self.det_times.inference_time_s.start()
result = self.predict(repeats=repeats, run_benchmark=True)
self.det_times.inference_time_s.end(repeats=repeats)
# postprocess
result_warmup = self.postprocess(inputs, result) # warmup
self.det_times.postprocess_time_s.start()
result = self.postprocess(inputs, result)
self.det_times.postprocess_time_s.end()
self.det_times.img_num += len(batch_image_list)
cm, gm, gu = get_current_memory_mb()
self.cpu_mem += cm
self.gpu_mem += gm
self.gpu_util += gu
else:
# preprocess
self.det_times.preprocess_time_s.start()
inputs = self.preprocess(batch_image_list)
self.det_times.preprocess_time_s.end()
# model prediction
self.det_times.inference_time_s.start()
result = self.predict()
self.det_times.inference_time_s.end()
# postprocess
self.det_times.postprocess_time_s.start()
result = self.postprocess(inputs, result)
self.det_times.postprocess_time_s.end()
self.det_times.img_num += len(batch_image_list)
if visual:
visualize(
batch_image_list,
result,
self.pred_config.labels,
output_dir=self.output_dir,
threshold=self.threshold)
results.append(result)
print('Test iter {}'.format(i))
results = self.merge_batch_result(results)
if save_results:
Path(self.output_dir).mkdir(exist_ok=True)
self.save_coco_results(
image_list, results, use_coco_category=FLAGS.use_coco_category)
return results
def predict_video(self, video_file, camera_id):
video_out_name = 'output.mp4'
if camera_id != -1:
capture = cv2.VideoCapture(camera_id)
else:
capture = cv2.VideoCapture(video_file)
video_out_name = os.path.split(video_file)[-1]
        # Get video info: resolution, fps, frame count
width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(capture.get(cv2.CAP_PROP_FPS))
frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
print("fps: %d, frame_count: %d" % (fps, frame_count))
if not os.path.exists(self.output_dir):
os.makedirs(self.output_dir)
out_path = os.path.join(self.output_dir, video_out_name)
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height))
index = 1
        while True:
ret, frame = capture.read()
if not ret:
break
print('detect frame: %d' % (index))
index += 1
results = self.predict_image([frame[:, :, ::-1]], visual=False)
im = visualize_box_mask(
frame,
results,
self.pred_config.labels,
threshold=self.threshold)
im = np.array(im)
writer.write(im)
if camera_id != -1:
cv2.imshow('Mask Detection', im)
if cv2.waitKey(1) & 0xFF == ord('q'):
break
writer.release()
def save_coco_results(self, image_list, results, use_coco_category=False):
bbox_results = []
mask_results = []
idx = 0
print("Start saving coco json files...")
for i, box_num in enumerate(results['boxes_num']):
file_name = os.path.split(image_list[i])[-1]
if use_coco_category:
img_id = int(os.path.splitext(file_name)[0])
else:
img_id = i
if 'boxes' in results:
boxes = results['boxes'][idx:idx + box_num].tolist()
bbox_results.extend([{
'image_id': img_id,
'category_id': coco_clsid2catid[int(box[0])] \
if use_coco_category else int(box[0]),
'file_name': file_name,
'bbox': [box[2], box[3], box[4] - box[2],
box[5] - box[3]], # xyxy -> xywh
'score': box[1]} for box in boxes])
if 'masks' in results:
import pycocotools.mask as mask_util
boxes = results['boxes'][idx:idx + box_num].tolist()
masks = results['masks'][i][:box_num].astype(np.uint8)
seg_res = []
for box, mask in zip(boxes, masks):
rle = mask_util.encode(
np.array(
mask[:, :, None], dtype=np.uint8, order="F"))[0]
if 'counts' in rle:
rle['counts'] = rle['counts'].decode("utf8")
seg_res.append({
'image_id': img_id,
'category_id': coco_clsid2catid[int(box[0])] \
if use_coco_category else int(box[0]),
'file_name': file_name,
'segmentation': rle,
'score': box[1]})
mask_results.extend(seg_res)
idx += box_num
if bbox_results:
bbox_file = os.path.join(self.output_dir, "bbox.json")
with open(bbox_file, 'w') as f:
json.dump(bbox_results, f)
print(f"The bbox result is saved to {bbox_file}")
if mask_results:
mask_file = os.path.join(self.output_dir, "mask.json")
with open(mask_file, 'w') as f:
json.dump(mask_results, f)
print(f"The mask result is saved to {mask_file}")
def create_inputs(imgs, im_info):
"""generate input for different model type
Args:
imgs (list(numpy)): list of images (np.ndarray)
im_info (list(dict)): list of image info
Returns:
inputs (dict): input of model
"""
inputs = {}
im_shape = []
scale_factor = []
if len(imgs) == 1:
inputs['image'] = np.array((imgs[0], )).astype('float32')
inputs['im_shape'] = np.array(
(im_info[0]['im_shape'], )).astype('float32')
inputs['scale_factor'] = np.array(
(im_info[0]['scale_factor'], )).astype('float32')
return inputs
for e in im_info:
im_shape.append(np.array((e['im_shape'], )).astype('float32'))
scale_factor.append(np.array((e['scale_factor'], )).astype('float32'))
inputs['im_shape'] = np.concatenate(im_shape, axis=0)
inputs['scale_factor'] = np.concatenate(scale_factor, axis=0)
imgs_shape = [[e.shape[1], e.shape[2]] for e in imgs]
max_shape_h = max([e[0] for e in imgs_shape])
max_shape_w = max([e[1] for e in imgs_shape])
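    # pad every image in the batch to the max height/width so they can be stacked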
padding_imgs = []
for img in imgs:
im_c, im_h, im_w = img.shape[:]
padding_im = np.zeros(
(im_c, max_shape_h, max_shape_w), dtype=np.float32)
padding_im[:, :im_h, :im_w] = img
padding_imgs.append(padding_im)
inputs['image'] = np.stack(padding_imgs, axis=0)
return inputs
class PredictConfig():
"""set config of preprocess, postprocess and visualize
Args:
model_dir (str): root path of model.yml
"""
def __init__(self, model_dir):
# parsing Yaml config for Preprocess
deploy_file = os.path.join(model_dir, 'infer_cfg.yml')
with open(deploy_file) as f:
yml_conf = yaml.safe_load(f)
self.check_model(yml_conf)
self.arch = yml_conf['arch']
self.preprocess_infos = yml_conf['Preprocess']
self.min_subgraph_size = yml_conf['min_subgraph_size']
self.labels = yml_conf['label_list']
self.mask = False
self.use_dynamic_shape = yml_conf['use_dynamic_shape']
if 'mask' in yml_conf:
self.mask = yml_conf['mask']
self.tracker = None
if 'tracker' in yml_conf:
self.tracker = yml_conf['tracker']
if 'NMS' in yml_conf:
self.nms = yml_conf['NMS']
if 'fpn_stride' in yml_conf:
self.fpn_stride = yml_conf['fpn_stride']
if self.arch == 'RCNN' and yml_conf.get('export_onnx', False):
print(
'The RCNN export model is used for ONNX and it only supports batch_size = 1'
)
self.print_config()
def check_model(self, yml_conf):
"""
Raises:
ValueError: loaded model not in supported model type
"""
for support_model in SUPPORT_MODELS:
if support_model in yml_conf['arch']:
return True
raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf[
'arch'], SUPPORT_MODELS))
def print_config(self):
print('----------- Model Configuration -----------')
print('%s: %s' % ('Model Arch', self.arch))
print('%s: ' % ('Transform Order'))
for op_info in self.preprocess_infos:
print('--%s: %s' % ('transform op', op_info['type']))
print('--------------------------------------------')
def load_predictor(model_dir,
arch,
run_mode='paddle',
batch_size=1,
device='CPU',
min_subgraph_size=3,
use_dynamic_shape=False,
trt_min_shape=1,
trt_max_shape=1280,
trt_opt_shape=640,
trt_calib_mode=False,
cpu_threads=1,
enable_mkldnn=False,
enable_mkldnn_bfloat16=False,
delete_shuffle_pass=False):
"""set AnalysisConfig, generate AnalysisPredictor
Args:
model_dir (str): root path of __model__ and __params__
device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU
run_mode (str): mode of running(paddle/trt_fp32/trt_fp16/trt_int8)
use_dynamic_shape (bool): use dynamic shape or not
trt_min_shape (int): min shape for dynamic shape in trt
trt_max_shape (int): max shape for dynamic shape in trt
trt_opt_shape (int): opt shape for dynamic shape in trt
        trt_calib_mode (bool): if the model is produced by TRT offline quantization
            calibration, trt_calib_mode needs to be set to True
delete_shuffle_pass (bool): whether to remove shuffle_channel_detect_pass in TensorRT.
Used by action model.
Returns:
predictor (PaddlePredictor): AnalysisPredictor
Raises:
ValueError: predict by TensorRT need device == 'GPU'.
"""
if device != 'GPU' and run_mode != 'paddle':
raise ValueError(
"Predict by TensorRT mode: {}, expect device=='GPU', but device == {}"
.format(run_mode, device))
infer_model = os.path.join(model_dir, 'model.pdmodel')
infer_params = os.path.join(model_dir, 'model.pdiparams')
if not os.path.exists(infer_model):
infer_model = os.path.join(model_dir, 'inference.pdmodel')
infer_params = os.path.join(model_dir, 'inference.pdiparams')
if not os.path.exists(infer_model):
raise ValueError(
"Cannot find any inference model in dir: {},".format(model_dir))
config = Config(infer_model, infer_params)
if device == 'GPU':
        # initial GPU memory (MB), device ID
config.enable_use_gpu(200, 0)
# optimize graph and fuse op
config.switch_ir_optim(True)
elif device == 'XPU':
if config.lite_engine_enabled():
config.enable_lite_engine()
config.enable_xpu(10 * 1024 * 1024)
elif device == 'NPU':
if config.lite_engine_enabled():
config.enable_lite_engine()
config.enable_custom_device('npu')
else:
config.disable_gpu()
config.set_cpu_math_library_num_threads(cpu_threads)
if enable_mkldnn:
try:
# cache 10 different shapes for mkldnn to avoid memory leak
config.set_mkldnn_cache_capacity(10)
config.enable_mkldnn()
if enable_mkldnn_bfloat16:
config.enable_mkldnn_bfloat16()
except Exception as e:
print(
"The current environment does not support `mkldnn`, so disable mkldnn."
)
precision_map = {
'trt_int8': Config.Precision.Int8,
'trt_fp32': Config.Precision.Float32,
'trt_fp16': Config.Precision.Half
}
if run_mode in precision_map.keys():
config.enable_tensorrt_engine(
workspace_size=(1 << 25) * batch_size,
max_batch_size=batch_size,
min_subgraph_size=min_subgraph_size,
precision_mode=precision_map[run_mode],
use_static=False,
use_calib_mode=trt_calib_mode)
if FLAGS.collect_trt_shape_info:
config.collect_shape_range_info(FLAGS.tuned_trt_shape_file)
elif os.path.exists(FLAGS.tuned_trt_shape_file):
print(f'Use dynamic shape file: '
f'{FLAGS.tuned_trt_shape_file} for TRT...')
config.enable_tuned_tensorrt_dynamic_shape(
FLAGS.tuned_trt_shape_file, True)
if use_dynamic_shape:
min_input_shape = {
'image': [batch_size, 3, trt_min_shape, trt_min_shape],
'scale_factor': [batch_size, 2]
}
max_input_shape = {
'image': [batch_size, 3, trt_max_shape, trt_max_shape],
'scale_factor': [batch_size, 2]
}
opt_input_shape = {
'image': [batch_size, 3, trt_opt_shape, trt_opt_shape],
'scale_factor': [batch_size, 2]
}
config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape,
opt_input_shape)
print('trt set dynamic shape done!')
# disable print log when predict
config.disable_glog_info()
# enable shared memory
config.enable_memory_optim()
# disable feed, fetch OP, needed by zero_copy_run
config.switch_use_feed_fetch_ops(False)
if delete_shuffle_pass:
config.delete_pass("shuffle_channel_detect_pass")
predictor = create_predictor(config)
return predictor, config
def get_test_images(infer_dir, infer_img):
"""
Get image path list in TEST mode
"""
assert infer_img is not None or infer_dir is not None, \
"--image_file or --image_dir should be set"
assert infer_img is None or os.path.isfile(infer_img), \
"{} is not a file".format(infer_img)
assert infer_dir is None or os.path.isdir(infer_dir), \
"{} is not a directory".format(infer_dir)
# infer_img has a higher priority
if infer_img and os.path.isfile(infer_img):
return [infer_img]
images = set()
infer_dir = os.path.abspath(infer_dir)
assert os.path.isdir(infer_dir), \
"infer_dir {} is not a directory".format(infer_dir)
exts = ['jpg', 'jpeg', 'png', 'bmp']
exts += [ext.upper() for ext in exts]
for ext in exts:
images.update(glob.glob('{}/*.{}'.format(infer_dir, ext)))
images = list(images)
assert len(images) > 0, "no image found in {}".format(infer_dir)
print("Found {} inference images in total.".format(len(images)))
return images
def visualize(image_list, result, labels, output_dir='output/', threshold=0.5):
# visualize the predict result
start_idx = 0
for idx, image_file in enumerate(image_list):
im_bboxes_num = result['boxes_num'][idx]
im_results = {}
if 'boxes' in result:
im_results['boxes'] = result['boxes'][start_idx:start_idx +
im_bboxes_num, :]
if 'masks' in result:
im_results['masks'] = result['masks'][start_idx:start_idx +
im_bboxes_num, :]
if 'segm' in result:
im_results['segm'] = result['segm'][start_idx:start_idx +
im_bboxes_num, :]
if 'label' in result:
im_results['label'] = result['label'][start_idx:start_idx +
im_bboxes_num]
if 'score' in result:
im_results['score'] = result['score'][start_idx:start_idx +
im_bboxes_num]
start_idx += im_bboxes_num
im = visualize_box_mask(
image_file, im_results, labels, threshold=threshold)
img_name = os.path.split(image_file)[-1]
if not os.path.exists(output_dir):
os.makedirs(output_dir)
out_path = os.path.join(output_dir, img_name)
im.save(out_path, quality=95)
print("save result to: " + out_path)
def print_arguments(args):
print('----------- Running Arguments -----------')
for arg, value in sorted(vars(args).items()):
print('%s: %s' % (arg, value))
print('------------------------------------------')
def main():
deploy_file = os.path.join(FLAGS.model_dir, 'infer_cfg.yml')
with open(deploy_file) as f:
yml_conf = yaml.safe_load(f)
arch = yml_conf['arch']
detector_func = 'Detector'
detector = eval(detector_func)(
FLAGS.model_dir,
device=FLAGS.device,
run_mode=FLAGS.run_mode,
batch_size=FLAGS.batch_size,
trt_min_shape=FLAGS.trt_min_shape,
trt_max_shape=FLAGS.trt_max_shape,
trt_opt_shape=FLAGS.trt_opt_shape,
trt_calib_mode=FLAGS.trt_calib_mode,
cpu_threads=FLAGS.cpu_threads,
enable_mkldnn=FLAGS.enable_mkldnn,
enable_mkldnn_bfloat16=FLAGS.enable_mkldnn_bfloat16,
threshold=FLAGS.threshold,
output_dir=FLAGS.output_dir)
# predict from video file or camera video stream
if FLAGS.video_file is not None or FLAGS.camera_id != -1:
detector.predict_video(FLAGS.video_file, FLAGS.camera_id)
else:
# predict from image
if FLAGS.image_dir is None and FLAGS.image_file is not None:
assert FLAGS.batch_size == 1, "batch_size should be 1, when image_file is not None"
img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file)
if FLAGS.slice_infer:
detector.predict_image_slice(
img_list,
FLAGS.slice_size,
FLAGS.overlap_ratio,
FLAGS.combine_method,
FLAGS.match_threshold,
FLAGS.match_metric,
visual=FLAGS.save_images,
save_results=FLAGS.save_results)
else:
detector.predict_image(
img_list,
FLAGS.run_benchmark,
repeats=100,
visual=FLAGS.save_images,
save_results=FLAGS.save_results)
if not FLAGS.run_benchmark:
detector.det_times.info(average=True)
else:
mode = FLAGS.run_mode
model_dir = FLAGS.model_dir
model_info = {
'model_name': model_dir.strip('/').split('/')[-1],
'precision': mode.split('_')[-1]
}
bench_log(detector, img_list, model_info, name='DET')
if __name__ == '__main__':
paddle.enable_static()
parser = argsparser()
FLAGS = parser.parse_args()
print_arguments(FLAGS)
FLAGS.device = FLAGS.device.upper()
assert FLAGS.device in ['CPU', 'GPU', 'XPU', 'NPU'
], "device should be CPU, GPU, XPU or NPU"
assert not FLAGS.use_gpu, "use_gpu has been deprecated, please use --device"
assert not (
FLAGS.enable_mkldnn == False and FLAGS.enable_mkldnn_bfloat16 == True
), 'To enable mkldnn bfloat, please turn on both enable_mkldnn and enable_mkldnn_bfloat16'
main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import cv2
import numpy as np
from PIL import Image
def decode_image(im_file, im_info):
"""read rgb image
Args:
im_file (str|np.ndarray): input can be image path or np.ndarray
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
if isinstance(im_file, str):
with open(im_file, 'rb') as f:
im_read = f.read()
data = np.frombuffer(im_read, dtype='uint8')
im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode
im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
else:
im = im_file
im_info['im_shape'] = np.array(im.shape[:2], dtype=np.float32)
im_info['scale_factor'] = np.array([1., 1.], dtype=np.float32)
return im, im_info
class Resize(object):
"""resize image by target_size and max_size
Args:
target_size (int): the target size of image
        keep_ratio (bool): whether to keep the aspect ratio, default True
interp (int): method of resize
"""
def __init__(self, target_size, keep_ratio=True, interp=cv2.INTER_LINEAR):
if isinstance(target_size, int):
target_size = [target_size, target_size]
self.target_size = target_size
self.keep_ratio = keep_ratio
self.interp = interp
def __call__(self, im, im_info):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
assert len(self.target_size) == 2
assert self.target_size[0] > 0 and self.target_size[1] > 0
im_channel = im.shape[2]
im_scale_y, im_scale_x = self.generate_scale(im)
im = cv2.resize(
im,
None,
None,
fx=im_scale_x,
fy=im_scale_y,
interpolation=self.interp)
im_info['im_shape'] = np.array(im.shape[:2]).astype('float32')
im_info['scale_factor'] = np.array(
[im_scale_y, im_scale_x]).astype('float32')
return im, im_info
def generate_scale(self, im):
"""
Args:
im (np.ndarray): image (np.ndarray)
Returns:
im_scale_x: the resize ratio of X
im_scale_y: the resize ratio of Y
"""
origin_shape = im.shape[:2]
im_c = im.shape[2]
if self.keep_ratio:
im_size_min = np.min(origin_shape)
im_size_max = np.max(origin_shape)
target_size_min = np.min(self.target_size)
target_size_max = np.max(self.target_size)
im_scale = float(target_size_min) / float(im_size_min)
if np.round(im_scale * im_size_max) > target_size_max:
im_scale = float(target_size_max) / float(im_size_max)
im_scale_x = im_scale
im_scale_y = im_scale
else:
resize_h, resize_w = self.target_size
im_scale_y = resize_h / float(origin_shape[0])
im_scale_x = resize_w / float(origin_shape[1])
return im_scale_y, im_scale_x
class NormalizeImage(object):
"""normalize image
Args:
mean (list): im - mean
std (list): im / std
        is_scale (bool): whether to scale the image by 1/255
norm_type (str): type in ['mean_std', 'none']
"""
def __init__(self, mean, std, is_scale=True, norm_type='mean_std'):
self.mean = mean
self.std = std
self.is_scale = is_scale
self.norm_type = norm_type
def __call__(self, im, im_info):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
im = im.astype(np.float32, copy=False)
if self.is_scale:
scale = 1.0 / 255.0
im *= scale
if self.norm_type == 'mean_std':
mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
std = np.array(self.std)[np.newaxis, np.newaxis, :]
im -= mean
im /= std
return im, im_info
class Permute(object):
"""permute image
Args:
to_bgr (bool): whether convert RGB to BGR
channel_first (bool): whether convert HWC to CHW
"""
def __init__(self, ):
super(Permute, self).__init__()
def __call__(self, im, im_info):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
im = im.transpose((2, 0, 1)).copy()
return im, im_info
class PadStride(object):
""" padding image for model with FPN, instead PadBatch(pad_to_stride) in original config
Args:
stride (bool): model with FPN need image shape % stride == 0
"""
def __init__(self, stride=0):
self.coarsest_stride = stride
def __call__(self, im, im_info):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
coarsest_stride = self.coarsest_stride
if coarsest_stride <= 0:
return im, im_info
im_c, im_h, im_w = im.shape
pad_h = int(np.ceil(float(im_h) / coarsest_stride) * coarsest_stride)
pad_w = int(np.ceil(float(im_w) / coarsest_stride) * coarsest_stride)
padding_im = np.zeros((im_c, pad_h, pad_w), dtype=np.float32)
padding_im[:, :im_h, :im_w] = im
return padding_im, im_info
class LetterBoxResize(object):
def __init__(self, target_size):
"""
        Letterbox resize: resize the image to the target size while keeping the
        aspect ratio, then pad the borders to fill the remaining area.
Args:
target_size (int|list): image target size.
"""
super(LetterBoxResize, self).__init__()
if isinstance(target_size, int):
target_size = [target_size, target_size]
self.target_size = target_size
def letterbox(self, img, height, width, color=(127.5, 127.5, 127.5)):
# letterbox: resize a rectangular image to a padded rectangular
shape = img.shape[:2] # [height, width]
ratio_h = float(height) / shape[0]
ratio_w = float(width) / shape[1]
ratio = min(ratio_h, ratio_w)
new_shape = (round(shape[1] * ratio),
round(shape[0] * ratio)) # [width, height]
padw = (width - new_shape[0]) / 2
padh = (height - new_shape[1]) / 2
top, bottom = round(padh - 0.1), round(padh + 0.1)
left, right = round(padw - 0.1), round(padw + 0.1)
img = cv2.resize(
img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border
img = cv2.copyMakeBorder(
img, top, bottom, left, right, cv2.BORDER_CONSTANT,
value=color) # padded rectangular
return img, ratio, padw, padh
def __call__(self, im, im_info):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
assert len(self.target_size) == 2
assert self.target_size[0] > 0 and self.target_size[1] > 0
height, width = self.target_size
h, w = im.shape[:2]
im, ratio, padw, padh = self.letterbox(im, height=height, width=width)
new_shape = [round(h * ratio), round(w * ratio)]
im_info['im_shape'] = np.array(new_shape, dtype=np.float32)
im_info['scale_factor'] = np.array([ratio, ratio], dtype=np.float32)
return im, im_info
class Pad(object):
def __init__(self, size, fill_value=[114.0, 114.0, 114.0]):
"""
Pad image to a specified size.
Args:
size (list[int]): image target size
fill_value (list[float]): rgb value of pad area, default (114.0, 114.0, 114.0)
"""
super(Pad, self).__init__()
if isinstance(size, int):
size = [size, size]
self.size = size
self.fill_value = fill_value
def __call__(self, im, im_info):
im_h, im_w = im.shape[:2]
h, w = self.size
if h == im_h and w == im_w:
im = im.astype(np.float32)
return im, im_info
canvas = np.ones((h, w, 3), dtype=np.float32)
canvas *= np.array(self.fill_value, dtype=np.float32)
canvas[0:im_h, 0:im_w, :] = im.astype(np.float32)
im = canvas
return im, im_info
def preprocess(im, preprocess_ops):
# process image by preprocess_ops
im_info = {
'scale_factor': np.array(
[1., 1.], dtype=np.float32),
'im_shape': None,
}
im, im_info = decode_image(im, im_info)
for operator in preprocess_ops:
im, im_info = operator(im, im_info)
return im, im_info
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import os
import ast
import argparse
import numpy as np
def argsparser():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--model_dir",
type=str,
default=None,
help=("Directory include:'model.pdiparams', 'model.pdmodel', "
"'infer_cfg.yml', created by tools/export_model.py."),
required=True)
parser.add_argument(
"--image_file", type=str, default=None, help="Path of image file.")
parser.add_argument(
"--image_dir",
type=str,
default=None,
help="Dir of image file, `image_file` has a higher priority.")
parser.add_argument(
"--batch_size", type=int, default=1, help="batch_size for inference.")
parser.add_argument(
"--video_file",
type=str,
default=None,
help="Path of video file, `video_file` or `camera_id` has a highest priority."
)
parser.add_argument(
"--camera_id",
type=int,
default=-1,
help="device id of camera to predict.")
parser.add_argument(
"--threshold", type=float, default=0.5, help="Threshold of score.")
parser.add_argument(
"--output_dir",
type=str,
default="output",
help="Directory of output visualization files.")
parser.add_argument(
"--run_mode",
type=str,
default='paddle',
help="mode of running(paddle/trt_fp32/trt_fp16/trt_int8)")
parser.add_argument(
"--device",
type=str,
default='cpu',
help="Choose the device you want to run, it can be: CPU/GPU/XPU/NPU, default is CPU."
)
parser.add_argument(
"--use_gpu",
type=ast.literal_eval,
default=False,
help="Deprecated, please use `--device`.")
parser.add_argument(
"--run_benchmark",
type=ast.literal_eval,
default=False,
help="Whether to predict a image_file repeatedly for benchmark")
parser.add_argument(
"--enable_mkldnn",
type=ast.literal_eval,
default=False,
help="Whether use mkldnn with CPU.")
parser.add_argument(
"--enable_mkldnn_bfloat16",
type=ast.literal_eval,
default=False,
help="Whether use mkldnn bfloat16 inference with CPU.")
parser.add_argument(
"--cpu_threads", type=int, default=1, help="Num of threads with CPU.")
parser.add_argument(
"--trt_min_shape", type=int, default=1, help="min_shape for TensorRT.")
parser.add_argument(
"--trt_max_shape",
type=int,
default=1280,
help="max_shape for TensorRT.")
parser.add_argument(
"--trt_opt_shape",
type=int,
default=640,
help="opt_shape for TensorRT.")
parser.add_argument(
"--trt_calib_mode",
type=bool,
default=False,
help="If the model is produced by TRT offline quantitative "
"calibration, trt_calib_mode need to set True.")
parser.add_argument(
'--save_images',
type=ast.literal_eval,
default=True,
help='Save visualization image results.')
parser.add_argument(
"--save_results",
action='store_true',
default=False,
help="Whether save detection result to file using coco format")
parser.add_argument(
'--use_coco_category',
action='store_true',
default=False,
help='Whether to use the coco format dictionary `clsid2catid`')
parser.add_argument(
"--slice_infer",
action='store_true',
help="Whether to slice the image and merge the inference results for small object detection."
)
parser.add_argument(
'--slice_size',
nargs='+',
type=int,
default=[640, 640],
help="Height of the sliced image.")
parser.add_argument(
"--overlap_ratio",
nargs='+',
type=float,
default=[0.25, 0.25],
help="Overlap height ratio of the sliced image.")
parser.add_argument(
"--combine_method",
type=str,
default='nms',
help="Combine method of the sliced images' detection results, choose in ['nms', 'nmm', 'concat']."
)
parser.add_argument(
"--match_threshold",
type=float,
default=0.6,
help="Combine method matching threshold.")
parser.add_argument(
"--match_metric",
type=str,
default='ios',
help="Combine method matching metric, choose in ['iou', 'ios'].")
parser.add_argument(
"--collect_trt_shape_info",
action='store_true',
default=False,
help="Whether to collect dynamic shape before using tensorrt.")
parser.add_argument(
"--tuned_trt_shape_file",
type=str,
default="shape_range_info.pbtxt",
help="Path of a dynamic shape file for tensorrt.")
return parser
class Times(object):
def __init__(self):
self.time = 0.
# start time
self.st = 0.
# end time
self.et = 0.
def start(self):
self.st = time.time()
def end(self, repeats=1, accumulative=True):
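        # average the elapsed wall time over `repeats` runs; accumulate across
        # calls unless accumulative=False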
self.et = time.time()
if accumulative:
self.time += (self.et - self.st) / repeats
else:
self.time = (self.et - self.st) / repeats
def reset(self):
self.time = 0.
self.st = 0.
self.et = 0.
def value(self):
return round(self.time, 4)
class Timer(Times):
def __init__(self, with_tracker=False):
super(Timer, self).__init__()
self.with_tracker = with_tracker
self.preprocess_time_s = Times()
self.inference_time_s = Times()
self.postprocess_time_s = Times()
self.tracking_time_s = Times()
self.img_num = 0
def info(self, average=False):
pre_time = self.preprocess_time_s.value()
infer_time = self.inference_time_s.value()
post_time = self.postprocess_time_s.value()
track_time = self.tracking_time_s.value()
total_time = pre_time + infer_time + post_time
if self.with_tracker:
total_time = total_time + track_time
total_time = round(total_time, 4)
print("------------------ Inference Time Info ----------------------")
print("total_time(ms): {}, img_num: {}".format(total_time * 1000,
self.img_num))
preprocess_time = round(pre_time / max(1, self.img_num),
4) if average else pre_time
postprocess_time = round(post_time / max(1, self.img_num),
4) if average else post_time
inference_time = round(infer_time / max(1, self.img_num),
4) if average else infer_time
tracking_time = round(track_time / max(1, self.img_num),
4) if average else track_time
average_latency = total_time / max(1, self.img_num)
qps = 0
if total_time > 0:
qps = 1 / average_latency
print("average latency time(ms): {:.2f}, QPS: {:2f}".format(
average_latency * 1000, qps))
if self.with_tracker:
print(
"preprocess_time(ms): {:.2f}, inference_time(ms): {:.2f}, postprocess_time(ms): {:.2f}, tracking_time(ms): {:.2f}".
format(preprocess_time * 1000, inference_time * 1000,
postprocess_time * 1000, tracking_time * 1000))
else:
print(
"preprocess_time(ms): {:.2f}, inference_time(ms): {:.2f}, postprocess_time(ms): {:.2f}".
format(preprocess_time * 1000, inference_time * 1000,
postprocess_time * 1000))
def report(self, average=False):
dic = {}
pre_time = self.preprocess_time_s.value()
infer_time = self.inference_time_s.value()
post_time = self.postprocess_time_s.value()
track_time = self.tracking_time_s.value()
dic['preprocess_time_s'] = round(pre_time / max(1, self.img_num),
4) if average else pre_time
dic['inference_time_s'] = round(infer_time / max(1, self.img_num),
4) if average else infer_time
dic['postprocess_time_s'] = round(post_time / max(1, self.img_num),
4) if average else post_time
dic['img_num'] = self.img_num
total_time = pre_time + infer_time + post_time
if self.with_tracker:
dic['tracking_time_s'] = round(track_time / max(1, self.img_num),
4) if average else track_time
total_time = total_time + track_time
dic['total_time_s'] = round(total_time, 4)
return dic
def get_current_memory_mb():
"""
It is used to Obtain the memory usage of the CPU and GPU during the running of the program.
And this function Current program is time-consuming.
"""
import pynvml
import psutil
import GPUtil
gpu_id = int(os.environ.get('CUDA_VISIBLE_DEVICES', 0))
pid = os.getpid()
p = psutil.Process(pid)
info = p.memory_full_info()
cpu_mem = info.uss / 1024. / 1024.
gpu_mem = 0
gpu_percent = 0
gpus = GPUtil.getGPUs()
if gpu_id is not None and len(gpus) > 0:
gpu_percent = gpus[gpu_id].load
pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
gpu_mem = meminfo.used / 1024. / 1024.
return round(cpu_mem, 4), round(gpu_mem, 4), round(gpu_percent, 4)
def multiclass_nms(bboxs, num_classes, match_threshold=0.6, match_metric='iou'):
final_boxes = []
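    # run NMS independently for each class, then re-attach the class id as column 0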
for c in range(num_classes):
idxs = bboxs[:, 0] == c
if np.count_nonzero(idxs) == 0: continue
r = nms(bboxs[idxs, 1:], match_threshold, match_metric)
final_boxes.append(np.concatenate([np.full((r.shape[0], 1), c), r], 1))
return final_boxes
def nms(dets, match_threshold=0.6, match_metric='iou'):
""" Apply NMS to avoid detecting too many overlapping bounding boxes.
Args:
dets: shape [N, 5], [score, x1, y1, x2, y2]
match_metric: 'iou' or 'ios'
match_threshold: overlap thresh for match metric.
"""
if dets.shape[0] == 0:
return dets[[], :]
scores = dets[:, 0]
x1 = dets[:, 1]
y1 = dets[:, 2]
x2 = dets[:, 3]
y2 = dets[:, 4]
areas = (x2 - x1 + 1) * (y2 - y1 + 1)
order = scores.argsort()[::-1]
ndets = dets.shape[0]
suppressed = np.zeros((ndets), dtype=np.int32)
for _i in range(ndets):
i = order[_i]
if suppressed[i] == 1:
continue
ix1 = x1[i]
iy1 = y1[i]
ix2 = x2[i]
iy2 = y2[i]
iarea = areas[i]
for _j in range(_i + 1, ndets):
j = order[_j]
if suppressed[j] == 1:
continue
xx1 = max(ix1, x1[j])
yy1 = max(iy1, y1[j])
xx2 = min(ix2, x2[j])
yy2 = min(iy2, y2[j])
w = max(0.0, xx2 - xx1 + 1)
h = max(0.0, yy2 - yy1 + 1)
inter = w * h
if match_metric == 'iou':
union = iarea + areas[j] - inter
match_value = inter / union
elif match_metric == 'ios':
smaller = min(iarea, areas[j])
match_value = inter / smaller
else:
                raise ValueError("match_metric should be 'iou' or 'ios'")
if match_value >= match_threshold:
suppressed[j] = 1
keep = np.where(suppressed == 0)[0]
dets = dets[keep, :]
return dets
coco_clsid2catid = {
0: 1,
1: 2,
2: 3,
3: 4,
4: 5,
5: 6,
6: 7,
7: 8,
8: 9,
9: 10,
10: 11,
11: 13,
12: 14,
13: 15,
14: 16,
15: 17,
16: 18,
17: 19,
18: 20,
19: 21,
20: 22,
21: 23,
22: 24,
23: 25,
24: 27,
25: 28,
26: 31,
27: 32,
28: 33,
29: 34,
30: 35,
31: 36,
32: 37,
33: 38,
34: 39,
35: 40,
36: 41,
37: 42,
38: 43,
39: 44,
40: 46,
41: 47,
42: 48,
43: 49,
44: 50,
45: 51,
46: 52,
47: 53,
48: 54,
49: 55,
50: 56,
51: 57,
52: 58,
53: 59,
54: 60,
55: 61,
56: 62,
57: 63,
58: 64,
59: 65,
60: 67,
61: 70,
62: 72,
63: 73,
64: 74,
65: 75,
66: 76,
67: 77,
68: 78,
69: 79,
70: 80,
71: 81,
72: 82,
73: 84,
74: 85,
75: 86,
76: 87,
77: 88,
78: 89,
79: 90
}
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
import os
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import math
def visualize_box_mask(im, results, labels, threshold=0.5):
"""
Args:
im (str/np.ndarray): path of image/np.ndarray read by cv2
        results (dict): include 'boxes': np.ndarray: shape: [N, 6], N: number of boxes,
                        matrix element: [class, score, x_min, y_min, x_max, y_max]
                        MaskRCNN's results include 'masks': np.ndarray:
                        shape: [N, im_h, im_w]
labels (list): labels:['class1', ..., 'classn']
threshold (float): Threshold of score.
Returns:
im (PIL.Image.Image): visualized image
"""
if isinstance(im, str):
im = Image.open(im).convert('RGB')
elif isinstance(im, np.ndarray):
im = Image.fromarray(im)
if 'masks' in results and 'boxes' in results and len(results['boxes']) > 0:
im = draw_mask(
im, results['boxes'], results['masks'], labels, threshold=threshold)
if 'boxes' in results and len(results['boxes']) > 0:
im = draw_box(im, results['boxes'], labels, threshold=threshold)
if 'segm' in results:
im = draw_segm(
im,
results['segm'],
results['label'],
results['score'],
labels,
threshold=threshold)
return im
def get_color_map_list(num_classes):
"""
Args:
num_classes (int): number of class
Returns:
color_map (list): RGB color list
"""
color_map = num_classes * [0, 0, 0]
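    # VOC-style palette: spread the bits of each label id across the high bits
    # of the R, G, B channels so that nearby ids map to distinct colors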
for i in range(0, num_classes):
j = 0
lab = i
while lab:
color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j))
color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j))
color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j))
j += 1
lab >>= 3
color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)]
return color_map
def draw_mask(im, np_boxes, np_masks, labels, threshold=0.5):
"""
Args:
im (PIL.Image.Image): PIL image
np_boxes (np.ndarray): shape:[N,6], N: number of box,
                               matrix element: [class, score, x_min, y_min, x_max, y_max]
np_masks (np.ndarray): shape:[N, im_h, im_w]
labels (list): labels:['class1', ..., 'classn']
threshold (float): threshold of mask
Returns:
im (PIL.Image.Image): visualized image
"""
color_list = get_color_map_list(len(labels))
w_ratio = 0.4
alpha = 0.7
im = np.array(im).astype('float32')
clsid2color = {}
expect_boxes = (np_boxes[:, 1] > threshold) & (np_boxes[:, 0] > -1)
np_boxes = np_boxes[expect_boxes, :]
np_masks = np_masks[expect_boxes, :, :]
im_h, im_w = im.shape[:2]
np_masks = np_masks[:, :im_h, :im_w]
for i in range(len(np_masks)):
clsid, score = int(np_boxes[i][0]), np_boxes[i][1]
mask = np_masks[i]
if clsid not in clsid2color:
clsid2color[clsid] = color_list[clsid]
color_mask = clsid2color[clsid]
for c in range(3):
color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255
idx = np.nonzero(mask)
color_mask = np.array(color_mask)
im[idx[0], idx[1], :] *= 1.0 - alpha
im[idx[0], idx[1], :] += alpha * color_mask
return Image.fromarray(im.astype('uint8'))
def draw_box(im, np_boxes, labels, threshold=0.5):
"""
Args:
im (PIL.Image.Image): PIL image
np_boxes (np.ndarray): shape:[N,6], N: number of box,
                               matrix element: [class, score, x_min, y_min, x_max, y_max]
labels (list): labels:['class1', ..., 'classn']
threshold (float): threshold of box
Returns:
im (PIL.Image.Image): visualized image
"""
draw_thickness = min(im.size) // 320
draw = ImageDraw.Draw(im)
clsid2color = {}
color_list = get_color_map_list(len(labels))
expect_boxes = (np_boxes[:, 1] > threshold) & (np_boxes[:, 0] > -1)
np_boxes = np_boxes[expect_boxes, :]
for dt in np_boxes:
clsid, bbox, score = int(dt[0]), dt[2:], dt[1]
if clsid not in clsid2color:
clsid2color[clsid] = color_list[clsid]
color = tuple(clsid2color[clsid])
if len(bbox) == 4:
xmin, ymin, xmax, ymax = bbox
print('class_id:{:d}, confidence:{:.4f}, left_top:[{:.2f},{:.2f}],'
'right_bottom:[{:.2f},{:.2f}]'.format(
int(clsid), score, xmin, ymin, xmax, ymax))
# draw bbox
draw.line(
[(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin),
(xmin, ymin)],
width=draw_thickness,
fill=color)
elif len(bbox) == 8:
x1, y1, x2, y2, x3, y3, x4, y4 = bbox
draw.line(
[(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)],
width=2,
fill=color)
xmin = min(x1, x2, x3, x4)
ymin = min(y1, y2, y3, y4)
# draw label
text = "{} {:.4f}".format(labels[clsid], score)
tw, th = draw.textsize(text)
draw.rectangle(
[(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color)
draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255))
return im
def draw_segm(im,
np_segms,
np_label,
np_score,
labels,
threshold=0.5,
alpha=0.7):
"""
Draw segmentation on image
"""
mask_color_id = 0
w_ratio = .4
color_list = get_color_map_list(len(labels))
im = np.array(im).astype('float32')
clsid2color = {}
np_segms = np_segms.astype(np.uint8)
for i in range(np_segms.shape[0]):
mask, score, clsid = np_segms[i], np_score[i], np_label[i]
if score < threshold:
continue
if clsid not in clsid2color:
clsid2color[clsid] = color_list[clsid]
color_mask = clsid2color[clsid]
for c in range(3):
color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255
idx = np.nonzero(mask)
color_mask = np.array(color_mask)
idx0 = np.minimum(idx[0], im.shape[0] - 1)
idx1 = np.minimum(idx[1], im.shape[1] - 1)
im[idx0, idx1, :] *= 1.0 - alpha
im[idx0, idx1, :] += alpha * color_mask
sum_x = np.sum(mask, axis=0)
x = np.where(sum_x > 0.5)[0]
sum_y = np.sum(mask, axis=1)
y = np.where(sum_y > 0.5)[0]
x0, x1, y0, y1 = x[0], x[-1], y[0], y[-1]
cv2.rectangle(im, (x0, y0), (x1, y1),
tuple(color_mask.astype('int32').tolist()), 1)
bbox_text = '%s %.2f' % (labels[clsid], score)
t_size = cv2.getTextSize(bbox_text, 0, 0.3, thickness=1)[0]
cv2.rectangle(im, (x0, y0), (x0 + t_size[0], y0 - t_size[1] - 3),
tuple(color_mask.astype('int32').tolist()), -1)
cv2.putText(
im,
bbox_text, (x0, y0 - 2),
cv2.FONT_HERSHEY_SIMPLEX,
0.3, (0, 0, 0),
1,
lineType=cv2.LINE_AA)
return Image.fromarray(im.astype('uint8'))
# Server-Side Inference Deployment
Models trained with `PaddleDetection` can be deployed on the server side with [Serving](https://github.com/PaddlePaddle/Serving).
This tutorial deploys a model trained on the COCO dataset with the `configs/yolov3/yolov3_darknet53_270e_coco.yml` configuration.
The pretrained weights are [yolov3_darknet53_270e_coco.pdparams](https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams).
## 1. Validate the Model
```
python tools/infer.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml --infer_img=demo/000000014439.jpg -o use_gpu=True weights=https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams
```
## 2. Install Paddle Serving
Install it by following the instructions in [PaddleServing](https://github.com/PaddlePaddle/Serving/tree/v0.7.0) (version >= 0.7.0).
## 3. Export the Model
Training in PaddleDetection keeps both the forward network parameters and the optimizer-related parameters, while deployment only needs the forward parameters. For details see [Exporting Models](https://github.com/PaddlePaddle/PaddleDetection/blob/develop/deploy/EXPORT_MODEL.md).
```
python tools/export_model.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams --export_serving_model=True
```
The command above creates a `yolov3_darknet53_270e_coco` folder under `output_inference/`:
```
output_inference
│ ├── yolov3_darknet53_270e_coco
│ │ ├── infer_cfg.yml
│ │ ├── model.pdiparams
│ │ ├── model.pdiparams.info
│ │ ├── model.pdmodel
│ │ ├── serving_client
│ │ │ ├── serving_client_conf.prototxt
│ │ │ ├── serving_client_conf.stream.prototxt
│ │ ├── serving_server
│ │ │ ├── __model__
│ │ │ ├── __params__
│ │ │ ├── serving_server_conf.prototxt
│ │ │ ├── serving_server_conf.stream.prototxt
│ │ │ ├── ...
```
`serving_client_conf.prototxt` under the `serving_client` folder describes the model's inputs and outputs in detail.
The content of `serving_client_conf.prototxt` is:
```
feed_var {
name: "im_shape"
alias_name: "im_shape"
is_lod_tensor: false
feed_type: 1
shape: 2
}
feed_var {
name: "image"
alias_name: "image"
is_lod_tensor: false
feed_type: 1
shape: 3
shape: 608
shape: 608
}
feed_var {
name: "scale_factor"
alias_name: "scale_factor"
is_lod_tensor: false
feed_type: 1
shape: 2
}
fetch_var {
name: "multiclass_nms3_0.tmp_0"
alias_name: "multiclass_nms3_0.tmp_0"
is_lod_tensor: true
fetch_type: 1
shape: -1
}
fetch_var {
name: "multiclass_nms3_0.tmp_2"
alias_name: "multiclass_nms3_0.tmp_2"
is_lod_tensor: false
fetch_type: 2
}
```
## 4. Start the PaddleServing Service
```
cd output_inference/yolov3_darknet53_270e_coco/
# GPU
python -m paddle_serving_server.serve --model serving_server --port 9393 --gpu_ids 0
# CPU
python -m paddle_serving_server.serve --model serving_server --port 9393
```
## 5. Test the Deployed Service
Prepare a `label_list.txt` file; an example `label_list.txt` contains:
```
person
bicycle
car
motorcycle
airplane
bus
train
truck
boat
traffic light
fire hydrant
stop sign
parking meter
bench
bird
cat
dog
horse
sheep
cow
elephant
bear
zebra
giraffe
backpack
umbrella
handbag
tie
suitcase
frisbee
skis
snowboard
sports ball
kite
baseball bat
baseball glove
skateboard
surfboard
tennis racket
bottle
wine glass
cup
fork
knife
spoon
bowl
banana
apple
sandwich
orange
broccoli
carrot
hot dog
pizza
donut
cake
chair
couch
potted plant
bed
dining table
toilet
tv
laptop
mouse
remote
keyboard
cell phone
microwave
oven
toaster
sink
refrigerator
book
clock
vase
scissors
teddy bear
hair drier
toothbrush
```
Set the `prototxt` file path to `serving_client/serving_client_conf.prototxt`.
Set `fetch` to `fetch=["multiclass_nms3_0.tmp_0"]`.
Run the test:
```
# enter the directory
cd output_inference/yolov3_darknet53_270e_coco/
# the test code test_client.py automatically creates an output folder and writes bbox.json and 000000014439.jpg into it
python ../../deploy/serving/test_client.py ../../deploy/serving/label_list.txt ../../demo/000000014439.jpg
```
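For reference, the sketch below shows the bare minimum such a client does: load the prototxt above, connect, and call `predict` with the three feed vars. This is an illustrative sketch only, assuming `paddle-serving-client` is installed and the service from step 4 is listening on `127.0.0.1:9393`; the preprocessing is deliberately simplified (plain resize plus 1/255 scaling), so for exact results use the shipped `test_client.py`.
```python
# Minimal sketch only: assumes the service from step 4 is running locally.
import cv2
import numpy as np
from paddle_serving_client import Client

client = Client()
client.load_client_config("serving_client/serving_client_conf.prototxt")
client.connect(["127.0.0.1:9393"])

im = cv2.imread("../../demo/000000014439.jpg")
h, w = im.shape[:2]
im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
im = cv2.resize(im, (608, 608)).astype("float32") / 255.0
im = im.transpose((2, 0, 1))  # HWC -> CHW, matching the 3x608x608 feed_var

fetch_map = client.predict(
    feed={
        "image": im,
        "im_shape": np.array([608.0, 608.0], dtype="float32"),
        "scale_factor": np.array([608.0 / h, 608.0 / w], dtype="float32"),
    },
    fetch=["multiclass_nms3_0.tmp_0"],
    batch=False)
print(fetch_map["multiclass_nms3_0.tmp_0"])
```
Each row of the fetched array follows the `[class_id, score, x_min, y_min, x_max, y_max]` layout used throughout this repository.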
# C++ Serving Inference Deployment
## 1. Introduction
Paddle Serving is PaddlePaddle's open-source framework for service-oriented deployment. It provides two stacks:
C++ Serving, which pursues maximum performance, and Python Pipeline, which favors convenient secondary development.
Both aim to give deep learning developers and enterprises high-performance, flexible, easy-to-use, industrial-grade online inference services.
For more about Paddle Serving, see the [official Paddle Serving repo](https://github.com/PaddlePaddle/Serving).
This document walks through deploying a model (yolov3_darknet53_270e_coco as the example) as a service with the C++ Serving stack.
## 2. C++ Serving Deployment
### 2.1 Sample Program Layout
The sample deployment programs live under `deploy/serving/cpp`:
```shell
deploy/
├── serving/
│   ├── python/                    # Python serving sample programs
│   │   ├──config.yml              # server-side inference configuration
│   │   ├──pipeline_http_client.py # client code
│   │   ├──postprocess_ops.py      # user-defined postprocessing code
│   │   ├──preprocess_ops.py       # user-defined preprocessing code
│   │   ├──README.md               # documentation
│   │   ├──web_service.py          # server code
│   ├── cpp/                       # C++ serving sample programs
│   │   ├──preprocess/             # custom C++ OPs
│   │   ├──build_server.sh         # C++ Serving build script
│   │   ├──serving_client.py       # client code
│ │ └── ...
│ └── ...
└── ...
```
### 2.2 Environment Setup
Install the latest versions of the Paddle Serving packages:
paddle-serving-client, paddle-serving-server (CPU or GPU build, pick one), paddle-serving-app, and the matching paddlepaddle (CPU or GPU build).
```commandline
pip install paddle-serving-client
# pip install paddle-serving-server # CPU
pip install paddle-serving-server-gpu # GPU build; defaults to CUDA 10.2 + TensorRT 6, specify the matching version tag for other environments
pip install paddle-serving-app
# pip install paddlepaddle # CPU
pip install paddlepaddle-gpu
```
You may need a pip mirror inside China (for example the Baidu mirror: append `-i https://mirror.baidu.com/pypi/simple` to the pip commands) to speed up downloads.
For Paddle Serving Server wheels built for other runtime environments, see the [download page](https://github.com/PaddlePaddle/Serving/blob/v0.7.0/doc/Latest_Packages_CN.md).
For other PaddlePaddle builds, see the [official site](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html).
### 2.3 Export the Model for Serving
For the export steps, see the [PaddleDetection model export tutorial](../../EXPORT_MODEL.md).
Exporting a serving model additionally requires the `--export_serving_model True` flag, for example:
```commandline
python tools/export_model.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml \
--export_serving_model True \
-o weights=https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams
```
### 2.4 Build C++ Serving & Start the Inference Service
You can build everything with the one-step build script `deploy/serving/cpp/build_server.sh`:
```commandline
bash deploy/serving/cpp/build_server.sh
```
Once the build, installation, and model export above are complete, start the inference service with:
```commandline
python -m paddle_serving_server.serve --model output_inference/yolov3_darknet53_270e_coco/serving_server --op yolov3_darknet53_270e_coco --port 9997 &
```
To develop custom OPs, see the [documentation](https://github.com/PaddlePaddle/Serving/blob/v0.8.3/doc/C%2B%2B_Serving/2%2B_model.md).
### 2.5 Run the Client
Once the inference service is up, query it from the client with:
```commandline
python deploy/serving/python/serving_client.py --serving_client output_inference/yolov3_darknet53_270e_coco/serving_client --image_file demo/000000014439.jpg --http_port 9997
```
# Docker image used:
# registry.baidubce.com/paddlepaddle/paddle:latest-dev-cuda10.1-cudnn7-gcc82
# Building the Serving server:
# the client and app packages can use the release builds directly;
# the server must be rebuilt because custom OPs have been added
apt-get update
apt install -y libcurl4-openssl-dev libbz2-dev
wget https://paddle-serving.bj.bcebos.com/others/centos_ssl.tar && tar xf centos_ssl.tar && rm -rf centos_ssl.tar && mv libcrypto.so.1.0.2k /usr/lib/libcrypto.so.1.0.2k && mv libssl.so.1.0.2k /usr/lib/libssl.so.1.0.2k && ln -sf /usr/lib/libcrypto.so.1.0.2k /usr/lib/libcrypto.so.10 && ln -sf /usr/lib/libssl.so.1.0.2k /usr/lib/libssl.so.10 && ln -sf /usr/lib/libcrypto.so.10 /usr/lib/libcrypto.so && ln -sf /usr/lib/libssl.so.10 /usr/lib/libssl.so
# install Go dependencies
rm -rf /usr/local/go
wget -qO- https://paddle-ci.cdn.bcebos.com/go1.17.2.linux-amd64.tar.gz | tar -xz -C /usr/local
export GOROOT=/usr/local/go
export GOPATH=/root/gopath
export PATH=$PATH:$GOPATH/bin:$GOROOT/bin
go env -w GO111MODULE=on
go env -w GOPROXY=https://goproxy.cn,direct
go install github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway@v1.15.2
go install github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger@v1.15.2
go install github.com/golang/protobuf/protoc-gen-go@v1.4.3
go install google.golang.org/grpc@v1.33.0
go env -w GO111MODULE=auto
# download the OpenCV library
wget https://paddle-qa.bj.bcebos.com/PaddleServing/opencv3.tar.gz && tar -xvf opencv3.tar.gz && rm -rf opencv3.tar.gz
export OPENCV_DIR=$PWD/opencv3
# clone Serving
git clone https://github.com/PaddlePaddle/Serving.git -b develop --depth=1
cd Serving
export Serving_repo_path=$PWD
git submodule update --init --recursive
python -m pip install -r python/requirements.txt
# set env
export PYTHON_INCLUDE_DIR=$(python -c "from distutils.sysconfig import get_python_inc; print(get_python_inc())")
export PYTHON_LIBRARIES=$(python -c "import distutils.sysconfig as sysconfig; print(sysconfig.get_config_var('LIBDIR'))")
export PYTHON_EXECUTABLE=`which python`
export CUDA_PATH='/usr/local/cuda'
export CUDNN_LIBRARY='/usr/local/cuda/lib64/'
export CUDA_CUDART_LIBRARY='/usr/local/cuda/lib64/'
export TENSORRT_LIBRARY_PATH='/usr/local/TensorRT6-cuda10.1-cudnn7/targets/x86_64-linux-gnu/'
# copy the custom OP sources
\cp ../deploy/serving/cpp/preprocess/*.h ${Serving_repo_path}/core/general-server/op
\cp ../deploy/serving/cpp/preprocess/*.cpp ${Serving_repo_path}/core/general-server/op
# build the server and export SERVING_BIN
mkdir server-build-gpu-opencv && cd server-build-gpu-opencv
cmake -DPYTHON_INCLUDE_DIR=$PYTHON_INCLUDE_DIR \
-DPYTHON_LIBRARIES=$PYTHON_LIBRARIES \
-DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
-DCUDA_TOOLKIT_ROOT_DIR=${CUDA_PATH} \
-DCUDNN_LIBRARY=${CUDNN_LIBRARY} \
-DCUDA_CUDART_LIBRARY=${CUDA_CUDART_LIBRARY} \
-DTENSORRT_ROOT=${TENSORRT_LIBRARY_PATH} \
-DOPENCV_DIR=${OPENCV_DIR} \
-DWITH_OPENCV=ON \
-DSERVER=ON \
-DWITH_GPU=ON ..
make -j32
python -m pip install python/dist/paddle*
export SERVING_BIN=$PWD/core/general-server/serving
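# paddle_serving_server picks up the binary pointed to by SERVING_BIN,
# so the freshly built server (with the custom OPs) is the one that runs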
cd ../../
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "core/general-server/op/mask_rcnn_r50_fpn_1x_coco.h"
#include "core/predictor/framework/infer.h"
#include "core/predictor/framework/memory.h"
#include "core/predictor/framework/resource.h"
#include "core/util/include/timer.h"
#include <algorithm>
#include <iostream>
#include <memory>
#include <sstream>
namespace baidu {
namespace paddle_serving {
namespace serving {
using baidu::paddle_serving::Timer;
using baidu::paddle_serving::predictor::InferManager;
using baidu::paddle_serving::predictor::MempoolWrapper;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::Response;
using baidu::paddle_serving::predictor::general_model::Tensor;
int mask_rcnn_r50_fpn_1x_coco::inference() {
VLOG(2) << "Going to run inference";
const std::vector<std::string> pre_node_names = pre_names();
if (pre_node_names.size() != 1) {
LOG(ERROR) << "This op(" << op_name()
<< ") can only have one predecessor op, but received "
<< pre_node_names.size();
return -1;
}
const std::string pre_name = pre_node_names[0];
const GeneralBlob *input_blob = get_depend_argument<GeneralBlob>(pre_name);
if (!input_blob) {
LOG(ERROR) << "input_blob is nullptr,error";
return -1;
}
uint64_t log_id = input_blob->GetLogId();
VLOG(2) << "(logid=" << log_id << ") Get precedent op name: " << pre_name;
GeneralBlob *output_blob = mutable_data<GeneralBlob>();
if (!output_blob) {
LOG(ERROR) << "output_blob is nullptr,error";
return -1;
}
output_blob->SetLogId(log_id);
if (!input_blob) {
LOG(ERROR) << "(logid=" << log_id
<< ") Failed mutable depended argument, op:" << pre_name;
return -1;
}
const TensorVector *in = &input_blob->tensor_vector;
TensorVector *out = &output_blob->tensor_vector;
int batch_size = input_blob->_batch_size;
output_blob->_batch_size = batch_size;
VLOG(2) << "(logid=" << log_id << ") infer batch size: " << batch_size;
Timer timeline;
int64_t start = timeline.TimeStampUS();
timeline.Start();
  // the input is a single string-typed tensor holding the base64-encoded image
char *total_input_ptr = static_cast<char *>(in->at(0).data.data());
std::string base64str = total_input_ptr;
cv::Mat img = Base2Mat(base64str);
cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
// preprocess
Resize(&img, scale_factor_h, scale_factor_w, im_shape_h, im_shape_w);
Normalize(&img, mean_, scale_, is_scale_);
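  // Pad H/W up to multiples of 32 (assumption: the ResNet50-FPN backbone
  // needs stride-32-aligned inputs).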
PadStride(&img, 32);
int input_shape_h = img.rows;
int input_shape_w = img.cols;
std::vector<float> input(1 * 3 * input_shape_h * input_shape_w, 0.0f);
Permute(img, input.data());
// create real_in
TensorVector *real_in = new TensorVector();
if (!real_in) {
LOG(ERROR) << "real_in is nullptr,error";
return -1;
}
int in_num = 0;
size_t databuf_size = 0;
void *databuf_data = NULL;
char *databuf_char = NULL;
// im_shape
std::vector<float> im_shape{static_cast<float>(im_shape_h),
static_cast<float>(im_shape_w)};
databuf_size = 2 * sizeof(float);
databuf_data = MempoolWrapper::instance().malloc(databuf_size);
if (!databuf_data) {
LOG(ERROR) << "Malloc failed, size: " << databuf_size;
return -1;
}
memcpy(databuf_data, im_shape.data(), databuf_size);
databuf_char = reinterpret_cast<char *>(databuf_data);
paddle::PaddleBuf paddleBuf_0(databuf_char, databuf_size);
paddle::PaddleTensor tensor_in_0;
tensor_in_0.name = "im_shape";
tensor_in_0.dtype = paddle::PaddleDType::FLOAT32;
tensor_in_0.shape = {1, 2};
tensor_in_0.lod = in->at(0).lod;
tensor_in_0.data = paddleBuf_0;
real_in->push_back(tensor_in_0);
// image
in_num = 1 * 3 * input_shape_h * input_shape_w;
databuf_size = in_num * sizeof(float);
databuf_data = MempoolWrapper::instance().malloc(databuf_size);
if (!databuf_data) {
LOG(ERROR) << "Malloc failed, size: " << databuf_size;
return -1;
}
memcpy(databuf_data, input.data(), databuf_size);
databuf_char = reinterpret_cast<char *>(databuf_data);
paddle::PaddleBuf paddleBuf_1(databuf_char, databuf_size);
paddle::PaddleTensor tensor_in_1;
tensor_in_1.name = "image";
tensor_in_1.dtype = paddle::PaddleDType::FLOAT32;
tensor_in_1.shape = {1, 3, input_shape_h, input_shape_w};
tensor_in_1.lod = in->at(0).lod;
tensor_in_1.data = paddleBuf_1;
real_in->push_back(tensor_in_1);
// scale_factor
std::vector<float> scale_factor{scale_factor_h, scale_factor_w};
databuf_size = 2 * sizeof(float);
databuf_data = MempoolWrapper::instance().malloc(databuf_size);
if (!databuf_data) {
LOG(ERROR) << "Malloc failed, size: " << databuf_size;
return -1;
}
memcpy(databuf_data, scale_factor.data(), databuf_size);
databuf_char = reinterpret_cast<char *>(databuf_data);
paddle::PaddleBuf paddleBuf_2(databuf_char, databuf_size);
paddle::PaddleTensor tensor_in_2;
tensor_in_2.name = "scale_factor";
tensor_in_2.dtype = paddle::PaddleDType::FLOAT32;
tensor_in_2.shape = {1, 2};
tensor_in_2.lod = in->at(0).lod;
tensor_in_2.data = paddleBuf_2;
real_in->push_back(tensor_in_2);
if (InferManager::instance().infer(engine_name().c_str(), real_in, out,
batch_size)) {
LOG(ERROR) << "(logid=" << log_id
<< ") Failed do infer in fluid model: " << engine_name().c_str();
return -1;
}
int64_t end = timeline.TimeStampUS();
CopyBlobInfo(input_blob, output_blob);
AddBlobInfo(output_blob, start);
AddBlobInfo(output_blob, end);
return 0;
}
void mask_rcnn_r50_fpn_1x_coco::Resize(cv::Mat *img, float &scale_factor_h,
float &scale_factor_w, int &im_shape_h,
int &im_shape_w) {
  // keep_ratio: use the smaller of the two candidate scales so neither side overshoots its target
int im_size_max = std::max(img->rows, img->cols);
int im_size_min = std::min(img->rows, img->cols);
int target_size_max = std::max(im_shape_h, im_shape_w);
int target_size_min = std::min(im_shape_h, im_shape_w);
float scale_min =
static_cast<float>(target_size_min) / static_cast<float>(im_size_min);
float scale_max =
static_cast<float>(target_size_max) / static_cast<float>(im_size_max);
float scale_ratio = std::min(scale_min, scale_max);
// scale_factor
scale_factor_h = scale_ratio;
scale_factor_w = scale_ratio;
// Resize
  cv::resize(*img, *img, cv::Size(), scale_ratio, scale_ratio, cv::INTER_CUBIC);
im_shape_h = img->rows;
im_shape_w = img->cols;
}
void mask_rcnn_r50_fpn_1x_coco::Normalize(cv::Mat *img,
const std::vector<float> &mean,
const std::vector<float> &scale,
const bool is_scale) {
// Normalize
double e = 1.0;
if (is_scale) {
e /= 255.0;
}
(*img).convertTo(*img, CV_32FC3, e);
for (int h = 0; h < img->rows; h++) {
for (int w = 0; w < img->cols; w++) {
img->at<cv::Vec3f>(h, w)[0] =
(img->at<cv::Vec3f>(h, w)[0] - mean[0]) / scale[0];
img->at<cv::Vec3f>(h, w)[1] =
(img->at<cv::Vec3f>(h, w)[1] - mean[1]) / scale[1];
img->at<cv::Vec3f>(h, w)[2] =
(img->at<cv::Vec3f>(h, w)[2] - mean[2]) / scale[2];
}
}
}
void mask_rcnn_r50_fpn_1x_coco::PadStride(cv::Mat *img, int stride_) {
  // PadStride: zero-pad the bottom/right so H and W round up to multiples of stride_
if (stride_ <= 0)
return;
int rh = img->rows;
int rw = img->cols;
int nh = (rh / stride_) * stride_ + (rh % stride_ != 0) * stride_;
int nw = (rw / stride_) * stride_ + (rw % stride_ != 0) * stride_;
cv::copyMakeBorder(*img, *img, 0, nh - rh, 0, nw - rw, cv::BORDER_CONSTANT,
cv::Scalar(0));
}
void mask_rcnn_r50_fpn_1x_coco::Permute(const cv::Mat &img, float *data) {
// Permute
int rh = img.rows;
int rw = img.cols;
int rc = img.channels();
for (int i = 0; i < rc; ++i) {
cv::extractChannel(img, cv::Mat(rh, rw, CV_32FC1, data + i * rh * rw), i);
}
}
cv::Mat mask_rcnn_r50_fpn_1x_coco::Base2Mat(std::string &base64_data) {
cv::Mat img;
std::string s_mat;
s_mat = base64Decode(base64_data.data(), base64_data.size());
std::vector<char> base64_img(s_mat.begin(), s_mat.end());
  img = cv::imdecode(base64_img, cv::IMREAD_COLOR); // decode to a 3-channel BGR cv::Mat
return img;
}
std::string mask_rcnn_r50_fpn_1x_coco::base64Decode(const char *Data,
int DataByte) {
const char DecodeTable[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
62, // '+'
0, 0, 0,
63, // '/'
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, // '0'-'9'
0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, // 'A'-'Z'
0, 0, 0, 0, 0, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, // 'a'-'z'
};
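  // NOTE: the table covers ASCII values up to 'z' only; the input is assumed
  // to be well-formed base64 (4-char groups, optional '=' padding) and is not
  // validated.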
std::string strDecode;
int nValue;
int i = 0;
while (i < DataByte) {
if (*Data != '\r' && *Data != '\n') {
nValue = DecodeTable[*Data++] << 18;
nValue += DecodeTable[*Data++] << 12;
strDecode += (nValue & 0x00FF0000) >> 16;
if (*Data != '=') {
nValue += DecodeTable[*Data++] << 6;
strDecode += (nValue & 0x0000FF00) >> 8;
if (*Data != '=') {
nValue += DecodeTable[*Data++];
strDecode += nValue & 0x000000FF;
}
}
i += 4;
    } else {  // carriage return / line feed: skip
Data++;
i++;
}
}
return strDecode;
}
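// DEFINE_OP registers the op below under its class name; the service selects
// it via the --op flag shown in the serve command earlier.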
DEFINE_OP(mask_rcnn_r50_fpn_1x_coco);
} // namespace serving
} // namespace paddle_serving
} // namespace baidu
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "core/general-server/general_model_service.pb.h"
#include "core/general-server/op/general_infer_helper.h"
#include "paddle_inference_api.h" // NOLINT
#include <string>
#include <vector>
#include "opencv2/core.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/imgproc.hpp"
#include <chrono>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <vector>
#include <cstring>
#include <fstream>
#include <numeric>
namespace baidu {
namespace paddle_serving {
namespace serving {
class mask_rcnn_r50_fpn_1x_coco
: public baidu::paddle_serving::predictor::OpWithChannel<GeneralBlob> {
public:
typedef std::vector<paddle::PaddleTensor> TensorVector;
DECLARE_OP(mask_rcnn_r50_fpn_1x_coco);
int inference();
private:
// preprocess
std::vector<float> mean_ = {0.485f, 0.456f, 0.406f};
std::vector<float> scale_ = {0.229f, 0.224f, 0.225f};
bool is_scale_ = true;
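  // keep-ratio resize targets (assumption: the standard Mask R-CNN setting of
  // short side 800 / long side 1333)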
int im_shape_h = 1333;
int im_shape_w = 800;
float scale_factor_h = 1.0f;
float scale_factor_w = 1.0f;
void Resize(cv::Mat *img, float &scale_factor_h, float &scale_factor_w,
int &im_shape_h, int &im_shape_w);
void Normalize(cv::Mat *img, const std::vector<float> &mean,
const std::vector<float> &scale, const bool is_scale);
void PadStride(cv::Mat *img, int stride_ = -1);
void Permute(const cv::Mat &img, float *data);
  // image decoding helpers
cv::Mat Base2Mat(std::string &base64_data);
std::string base64Decode(const char *Data, int DataByte);
};
} // namespace serving
} // namespace paddle_serving
} // namespace baidu
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "core/general-server/op/picodet_lcnet_1_5x_416_coco.h"
#include "core/predictor/framework/infer.h"
#include "core/predictor/framework/memory.h"
#include "core/predictor/framework/resource.h"
#include "core/util/include/timer.h"
#include <algorithm>
#include <iostream>
#include <memory>
#include <sstream>
namespace baidu {
namespace paddle_serving {
namespace serving {
using baidu::paddle_serving::Timer;
using baidu::paddle_serving::predictor::InferManager;
using baidu::paddle_serving::predictor::MempoolWrapper;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::Response;
using baidu::paddle_serving::predictor::general_model::Tensor;
int picodet_lcnet_1_5x_416_coco::inference() {
VLOG(2) << "Going to run inference";
const std::vector<std::string> pre_node_names = pre_names();
if (pre_node_names.size() != 1) {
LOG(ERROR) << "This op(" << op_name()
<< ") can only have one predecessor op, but received "
<< pre_node_names.size();
return -1;
}
const std::string pre_name = pre_node_names[0];
const GeneralBlob *input_blob = get_depend_argument<GeneralBlob>(pre_name);
if (!input_blob) {
LOG(ERROR) << "input_blob is nullptr,error";
return -1;
}
uint64_t log_id = input_blob->GetLogId();
VLOG(2) << "(logid=" << log_id << ") Get precedent op name: " << pre_name;
GeneralBlob *output_blob = mutable_data<GeneralBlob>();
if (!output_blob) {
LOG(ERROR) << "output_blob is nullptr,error";
return -1;
}
output_blob->SetLogId(log_id);
if (!input_blob) {
LOG(ERROR) << "(logid=" << log_id
<< ") Failed mutable depended argument, op:" << pre_name;
return -1;
}
const TensorVector *in = &input_blob->tensor_vector;
TensorVector *out = &output_blob->tensor_vector;
int batch_size = input_blob->_batch_size;
output_blob->_batch_size = batch_size;
VLOG(2) << "(logid=" << log_id << ") infer batch size: " << batch_size;
Timer timeline;
int64_t start = timeline.TimeStampUS();
timeline.Start();
  // the input is a single string-typed tensor holding the base64-encoded image
char *total_input_ptr = static_cast<char *>(in->at(0).data.data());
std::string base64str = total_input_ptr;
cv::Mat img = Base2Mat(base64str);
cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
// preprocess
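  // PicoDet uses a fixed 416x416 input: the image is warped to that size
  // (aspect ratio not preserved) and scale_factor records the per-axis ratios.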
std::vector<float> input(1 * 3 * im_shape_h * im_shape_w, 0.0f);
preprocess_det(img, input.data(), scale_factor_h, scale_factor_w, im_shape_h,
im_shape_w, mean_, scale_, is_scale_);
// create real_in
TensorVector *real_in = new TensorVector();
if (!real_in) {
LOG(ERROR) << "real_in is nullptr,error";
return -1;
}
int in_num = 0;
size_t databuf_size = 0;
void *databuf_data = NULL;
char *databuf_char = NULL;
// image
in_num = 1 * 3 * im_shape_h * im_shape_w;
databuf_size = in_num * sizeof(float);
databuf_data = MempoolWrapper::instance().malloc(databuf_size);
if (!databuf_data) {
LOG(ERROR) << "Malloc failed, size: " << databuf_size;
return -1;
}
memcpy(databuf_data, input.data(), databuf_size);
databuf_char = reinterpret_cast<char *>(databuf_data);
paddle::PaddleBuf paddleBuf(databuf_char, databuf_size);
paddle::PaddleTensor tensor_in;
tensor_in.name = "image";
tensor_in.dtype = paddle::PaddleDType::FLOAT32;
tensor_in.shape = {1, 3, im_shape_h, im_shape_w};
tensor_in.lod = in->at(0).lod;
tensor_in.data = paddleBuf;
real_in->push_back(tensor_in);
// scale_factor
std::vector<float> scale_factor{scale_factor_h, scale_factor_w};
databuf_size = 2 * sizeof(float);
databuf_data = MempoolWrapper::instance().malloc(databuf_size);
if (!databuf_data) {
LOG(ERROR) << "Malloc failed, size: " << databuf_size;
return -1;
}
memcpy(databuf_data, scale_factor.data(), databuf_size);
databuf_char = reinterpret_cast<char *>(databuf_data);
paddle::PaddleBuf paddleBuf_2(databuf_char, databuf_size);
paddle::PaddleTensor tensor_in_2;
tensor_in_2.name = "scale_factor";
tensor_in_2.dtype = paddle::PaddleDType::FLOAT32;
tensor_in_2.shape = {1, 2};
tensor_in_2.lod = in->at(0).lod;
tensor_in_2.data = paddleBuf_2;
real_in->push_back(tensor_in_2);
if (InferManager::instance().infer(engine_name().c_str(), real_in, out,
batch_size)) {
LOG(ERROR) << "(logid=" << log_id
<< ") Failed do infer in fluid model: " << engine_name().c_str();
return -1;
}
int64_t end = timeline.TimeStampUS();
CopyBlobInfo(input_blob, output_blob);
AddBlobInfo(output_blob, start);
AddBlobInfo(output_blob, end);
return 0;
}
void picodet_lcnet_1_5x_416_coco::preprocess_det(
const cv::Mat &img, float *data, float &scale_factor_h,
float &scale_factor_w, int im_shape_h, int im_shape_w,
const std::vector<float> &mean, const std::vector<float> &scale,
const bool is_scale) {
// scale_factor
scale_factor_h =
static_cast<float>(im_shape_h) / static_cast<float>(img.rows);
scale_factor_w =
static_cast<float>(im_shape_w) / static_cast<float>(img.cols);
// Resize
cv::Mat resize_img;
  cv::resize(img, resize_img, cv::Size(im_shape_w, im_shape_h), 0, 0,
             cv::INTER_CUBIC);
// Normalize
double e = 1.0;
if (is_scale) {
e /= 255.0;
}
cv::Mat img_fp;
(resize_img).convertTo(img_fp, CV_32FC3, e);
for (int h = 0; h < im_shape_h; h++) {
for (int w = 0; w < im_shape_w; w++) {
img_fp.at<cv::Vec3f>(h, w)[0] =
(img_fp.at<cv::Vec3f>(h, w)[0] - mean[0]) / scale[0];
img_fp.at<cv::Vec3f>(h, w)[1] =
(img_fp.at<cv::Vec3f>(h, w)[1] - mean[1]) / scale[1];
img_fp.at<cv::Vec3f>(h, w)[2] =
(img_fp.at<cv::Vec3f>(h, w)[2] - mean[2]) / scale[2];
}
}
// Permute
int rh = img_fp.rows;
int rw = img_fp.cols;
int rc = img_fp.channels();
for (int i = 0; i < rc; ++i) {
cv::extractChannel(img_fp, cv::Mat(rh, rw, CV_32FC1, data + i * rh * rw),
i);
}
}
cv::Mat picodet_lcnet_1_5x_416_coco::Base2Mat(std::string &base64_data) {
cv::Mat img;
std::string s_mat;
s_mat = base64Decode(base64_data.data(), base64_data.size());
std::vector<char> base64_img(s_mat.begin(), s_mat.end());
  img = cv::imdecode(base64_img, cv::IMREAD_COLOR); // decode to a 3-channel BGR cv::Mat
return img;
}
std::string picodet_lcnet_1_5x_416_coco::base64Decode(const char *Data,
int DataByte) {
const char DecodeTable[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
62, // '+'
0, 0, 0,
63, // '/'
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, // '0'-'9'
0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, // 'A'-'Z'
0, 0, 0, 0, 0, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, // 'a'-'z'
};
std::string strDecode;
int nValue;
int i = 0;
while (i < DataByte) {
if (*Data != '\r' && *Data != '\n') {
nValue = DecodeTable[*Data++] << 18;
nValue += DecodeTable[*Data++] << 12;
strDecode += (nValue & 0x00FF0000) >> 16;
if (*Data != '=') {
nValue += DecodeTable[*Data++] << 6;
strDecode += (nValue & 0x0000FF00) >> 8;
if (*Data != '=') {
nValue += DecodeTable[*Data++];
strDecode += nValue & 0x000000FF;
}
}
i += 4;
    } else {  // carriage return / line feed: skip
Data++;
i++;
}
}
return strDecode;
}
DEFINE_OP(picodet_lcnet_1_5x_416_coco);
} // namespace serving
} // namespace paddle_serving
} // namespace baidu
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "core/general-server/general_model_service.pb.h"
#include "core/general-server/op/general_infer_helper.h"
#include "paddle_inference_api.h" // NOLINT
#include <string>
#include <vector>
#include "opencv2/core.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/imgproc.hpp"
#include <chrono>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <vector>
#include <cstring>
#include <fstream>
#include <numeric>
namespace baidu {
namespace paddle_serving {
namespace serving {
class picodet_lcnet_1_5x_416_coco
: public baidu::paddle_serving::predictor::OpWithChannel<GeneralBlob> {
public:
typedef std::vector<paddle::PaddleTensor> TensorVector;
DECLARE_OP(picodet_lcnet_1_5x_416_coco);
int inference();
private:
// preprocess
std::vector<float> mean_ = {0.485f, 0.456f, 0.406f};
std::vector<float> scale_ = {0.229f, 0.224f, 0.225f};
bool is_scale_ = true;
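  // fixed network input size (PicoDet-LCNet 1.5x, 416x416)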
int im_shape_h = 416;
int im_shape_w = 416;
float scale_factor_h = 1.0f;
float scale_factor_w = 1.0f;
void preprocess_det(const cv::Mat &img, float *data, float &scale_factor_h,
float &scale_factor_w, int im_shape_h, int im_shape_w,
const std::vector<float> &mean,
const std::vector<float> &scale, const bool is_scale);
  // image decoding helpers
cv::Mat Base2Mat(std::string &base64_data);
std::string base64Decode(const char *Data, int DataByte);
};
} // namespace serving
} // namespace paddle_serving
} // namespace baidu
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "core/general-server/op/ppyolo_mbv3_large_coco.h"
#include "core/predictor/framework/infer.h"
#include "core/predictor/framework/memory.h"
#include "core/predictor/framework/resource.h"
#include "core/util/include/timer.h"
#include <algorithm>
#include <iostream>
#include <memory>
#include <sstream>
namespace baidu {
namespace paddle_serving {
namespace serving {
using baidu::paddle_serving::Timer;
using baidu::paddle_serving::predictor::InferManager;
using baidu::paddle_serving::predictor::MempoolWrapper;
using baidu::paddle_serving::predictor::PaddleGeneralModelConfig;
using baidu::paddle_serving::predictor::general_model::Request;
using baidu::paddle_serving::predictor::general_model::Response;
using baidu::paddle_serving::predictor::general_model::Tensor;
int ppyolo_mbv3_large_coco::inference() {
VLOG(2) << "Going to run inference";
const std::vector<std::string> pre_node_names = pre_names();
if (pre_node_names.size() != 1) {
LOG(ERROR) << "This op(" << op_name()
<< ") can only have one predecessor op, but received "
<< pre_node_names.size();
return -1;
}
const std::string pre_name = pre_node_names[0];
const GeneralBlob *input_blob = get_depend_argument<GeneralBlob>(pre_name);
if (!input_blob) {
LOG(ERROR) << "input_blob is nullptr,error";
return -1;
}
uint64_t log_id = input_blob->GetLogId();
VLOG(2) << "(logid=" << log_id << ") Get precedent op name: " << pre_name;
GeneralBlob *output_blob = mutable_data<GeneralBlob>();
if (!output_blob) {
LOG(ERROR) << "output_blob is nullptr,error";
return -1;
}
output_blob->SetLogId(log_id);
if (!input_blob) {
LOG(ERROR) << "(logid=" << log_id
<< ") Failed mutable depended argument, op:" << pre_name;
return -1;
}
const TensorVector *in = &input_blob->tensor_vector;
TensorVector *out = &output_blob->tensor_vector;
int batch_size = input_blob->_batch_size;
output_blob->_batch_size = batch_size;
VLOG(2) << "(logid=" << log_id << ") infer batch size: " << batch_size;
Timer timeline;
int64_t start = timeline.TimeStampUS();
timeline.Start();
  // the input is a single string-typed tensor holding the base64-encoded image
char *total_input_ptr = static_cast<char *>(in->at(0).data.data());
std::string base64str = total_input_ptr;
cv::Mat img = Base2Mat(base64str);
cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
// preprocess
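  // As with PicoDet, the image is warped to a fixed im_shape_h x im_shape_w;
  // PP-YOLO additionally consumes the im_shape tensor built below.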
std::vector<float> input(1 * 3 * im_shape_h * im_shape_w, 0.0f);
preprocess_det(img, input.data(), scale_factor_h, scale_factor_w, im_shape_h,
im_shape_w, mean_, scale_, is_scale_);
// create real_in
TensorVector *real_in = new TensorVector();
if (!real_in) {
LOG(ERROR) << "real_in is nullptr,error";
return -1;
}
int in_num = 0;
size_t databuf_size = 0;
void *databuf_data = NULL;
char *databuf_char = NULL;
// im_shape
std::vector<float> im_shape{static_cast<float>(im_shape_h),
static_cast<float>(im_shape_w)};
databuf_size = 2 * sizeof(float);
databuf_data = MempoolWrapper::instance().malloc(databuf_size);
if (!databuf_data) {
LOG(ERROR) << "Malloc failed, size: " << databuf_size;
return -1;
}
memcpy(databuf_data, im_shape.data(), databuf_size);
databuf_char = reinterpret_cast<char *>(databuf_data);
paddle::PaddleBuf paddleBuf_0(databuf_char, databuf_size);
paddle::PaddleTensor tensor_in_0;
tensor_in_0.name = "im_shape";
tensor_in_0.dtype = paddle::PaddleDType::FLOAT32;
tensor_in_0.shape = {1, 2};
tensor_in_0.lod = in->at(0).lod;
tensor_in_0.data = paddleBuf_0;
real_in->push_back(tensor_in_0);
// image
in_num = 1 * 3 * im_shape_h * im_shape_w;
databuf_size = in_num * sizeof(float);
databuf_data = MempoolWrapper::instance().malloc(databuf_size);
if (!databuf_data) {
LOG(ERROR) << "Malloc failed, size: " << databuf_size;
return -1;
}
memcpy(databuf_data, input.data(), databuf_size);
databuf_char = reinterpret_cast<char *>(databuf_data);
paddle::PaddleBuf paddleBuf_1(databuf_char, databuf_size);
paddle::PaddleTensor tensor_in_1;
tensor_in_1.name = "image";
tensor_in_1.dtype = paddle::PaddleDType::FLOAT32;
tensor_in_1.shape = {1, 3, im_shape_h, im_shape_w};
tensor_in_1.lod = in->at(0).lod;
tensor_in_1.data = paddleBuf_1;
real_in->push_back(tensor_in_1);
// scale_factor
std::vector<float> scale_factor{scale_factor_h, scale_factor_w};
databuf_size = 2 * sizeof(float);
databuf_data = MempoolWrapper::instance().malloc(databuf_size);
if (!databuf_data) {
LOG(ERROR) << "Malloc failed, size: " << databuf_size;
return -1;
}
memcpy(databuf_data, scale_factor.data(), databuf_size);
databuf_char = reinterpret_cast<char *>(databuf_data);
paddle::PaddleBuf paddleBuf_2(databuf_char, databuf_size);
paddle::PaddleTensor tensor_in_2;
tensor_in_2.name = "scale_factor";
tensor_in_2.dtype = paddle::PaddleDType::FLOAT32;
tensor_in_2.shape = {1, 2};
tensor_in_2.lod = in->at(0).lod;
tensor_in_2.data = paddleBuf_2;
real_in->push_back(tensor_in_2);
if (InferManager::instance().infer(engine_name().c_str(), real_in, out,
batch_size)) {
LOG(ERROR) << "(logid=" << log_id
<< ") Failed do infer in fluid model: " << engine_name().c_str();
return -1;
}
int64_t end = timeline.TimeStampUS();
CopyBlobInfo(input_blob, output_blob);
AddBlobInfo(output_blob, start);
AddBlobInfo(output_blob, end);
return 0;
}
void ppyolo_mbv3_large_coco::preprocess_det(const cv::Mat &img, float *data,
float &scale_factor_h,
float &scale_factor_w,
int im_shape_h, int im_shape_w,
const std::vector<float> &mean,
const std::vector<float> &scale,
const bool is_scale) {
// scale_factor
scale_factor_h =
static_cast<float>(im_shape_h) / static_cast<float>(img.rows);
scale_factor_w =
static_cast<float>(im_shape_w) / static_cast<float>(img.cols);
// Resize
cv::Mat resize_img;
  cv::resize(img, resize_img, cv::Size(im_shape_w, im_shape_h), 0, 0,
             cv::INTER_CUBIC);
// Normalize
double e = 1.0;
if (is_scale) {
e /= 255.0;
}
cv::Mat img_fp;
(resize_img).convertTo(img_fp, CV_32FC3, e);
for (int h = 0; h < im_shape_h; h++) {
for (int w = 0; w < im_shape_w; w++) {
img_fp.at<cv::Vec3f>(h, w)[0] =
(img_fp.at<cv::Vec3f>(h, w)[0] - mean[0]) / scale[0];
img_fp.at<cv::Vec3f>(h, w)[1] =
(img_fp.at<cv::Vec3f>(h, w)[1] - mean[1]) / scale[1];
img_fp.at<cv::Vec3f>(h, w)[2] =
(img_fp.at<cv::Vec3f>(h, w)[2] - mean[2]) / scale[2];
}
}
// Permute
int rh = img_fp.rows;
int rw = img_fp.cols;
int rc = img_fp.channels();
for (int i = 0; i < rc; ++i) {
cv::extractChannel(img_fp, cv::Mat(rh, rw, CV_32FC1, data + i * rh * rw),
i);
}
}
cv::Mat ppyolo_mbv3_large_coco::Base2Mat(std::string &base64_data) {
cv::Mat img;
std::string s_mat;
s_mat = base64Decode(base64_data.data(), base64_data.size());
std::vector<char> base64_img(s_mat.begin(), s_mat.end());
  img = cv::imdecode(base64_img, cv::IMREAD_COLOR); // decode to a 3-channel BGR cv::Mat
return img;
}
std::string ppyolo_mbv3_large_coco::base64Decode(const char *Data,
int DataByte) {
const char DecodeTable[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
62, // '+'
0, 0, 0,
63, // '/'
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, // '0'-'9'
0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, // 'A'-'Z'
0, 0, 0, 0, 0, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, // 'a'-'z'
};
std::string strDecode;
int nValue;
int i = 0;
while (i < DataByte) {
if (*Data != '\r' && *Data != '\n') {
nValue = DecodeTable[*Data++] << 18;
nValue += DecodeTable[*Data++] << 12;
strDecode += (nValue & 0x00FF0000) >> 16;
if (*Data != '=') {
nValue += DecodeTable[*Data++] << 6;
strDecode += (nValue & 0x0000FF00) >> 8;
if (*Data != '=') {
nValue += DecodeTable[*Data++];
strDecode += nValue & 0x000000FF;
}
}
i += 4;
    } else {  // carriage return / line feed: skip
Data++;
i++;
}
}
return strDecode;
}
DEFINE_OP(ppyolo_mbv3_large_coco);
} // namespace serving
} // namespace paddle_serving
} // namespace baidu