Commit fccfdfa5 authored by dlyrm

update code

parent dcc7bf4f
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <ctime>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include "Interpreter.hpp"
#include "ImageProcess.hpp"
#include "MNNDefine.h"
#include "Tensor.hpp"
#include "keypoint_postprocess.h"
using namespace MNN;
namespace PaddleDetection {
// Object KeyPoint Result
struct KeyPointResult {
// Keypoints: shape(N x 3); N: number of Joints; 3: x,y,conf
std::vector<float> keypoints;
int num_joints = -1;
};
// Visualize KeyPoint results
cv::Mat VisualizeKptsResult(const cv::Mat& img,
const std::vector<KeyPointResult>& results,
const std::vector<int>& colormap,
float threshold = 0.2);
class KeyPointDetector {
public:
explicit KeyPointDetector(const std::string& model_path,
int num_thread = 4,
int input_height = 256,
int input_width = 192,
float score_threshold = 0.3,
const int batch_size = 1,
bool use_dark = true) {
printf("config path: %s",
model_path.substr(0, model_path.find_last_of('/') + 1).c_str());
use_dark_ = use_dark;
in_w = input_width;
in_h = input_height;
threshold_ = score_threshold;
KeyPointDet_interpreter = std::shared_ptr<MNN::Interpreter>(
MNN::Interpreter::createFromFile(model_path.c_str()));
MNN::ScheduleConfig config;
config.type = MNN_FORWARD_CPU;
    /* numThread sets the thread count for CPU inference; for GPU backends the
       same field is interpreted as gpuMode. */
config.numThread = num_thread;
    // If the requested backend type is unavailable, fall back to CPU.
config.backupType = MNN_FORWARD_CPU;
BackendConfig backendConfig;
    backendConfig.precision =
        static_cast<MNN::BackendConfig::PrecisionMode>(1);  // 1 == Precision_High
config.backendConfig = &backendConfig;
KeyPointDet_session = KeyPointDet_interpreter->createSession(config);
input_tensor =
KeyPointDet_interpreter->getSessionInput(KeyPointDet_session, nullptr);
}
~KeyPointDetector() {
KeyPointDet_interpreter->releaseModel();
KeyPointDet_interpreter->releaseSession(KeyPointDet_session);
}
// Load Paddle inference model
  void LoadModel(std::string model_file, int num_threads);
// Run predictor
void Predict(const std::vector<cv::Mat> imgs,
std::vector<std::vector<float>>& center,
std::vector<std::vector<float>>& scale,
std::vector<KeyPointResult>* result = nullptr);
bool use_dark() { return this->use_dark_; }
  inline float get_threshold() { return threshold_; }
// const float mean_vals[3] = { 103.53f, 116.28f, 123.675f };
// const float norm_vals[3] = { 0.017429f, 0.017507f, 0.017125f };
const float mean_vals[3] = {0.f, 0.f, 0.f};
const float norm_vals[3] = {1.f, 1.f, 1.f};
int in_w = 128;
int in_h = 256;
private:
// Postprocess result
void Postprocess(std::vector<float>& output,
std::vector<int>& output_shape,
std::vector<int>& idxout,
std::vector<int>& idx_shape,
std::vector<KeyPointResult>* result,
std::vector<std::vector<float>>& center,
std::vector<std::vector<float>>& scale);
std::vector<float> output_data_;
std::vector<int> idx_data_;
float threshold_;
bool use_dark_;
std::shared_ptr<MNN::Interpreter> KeyPointDet_interpreter;
MNN::Session* KeyPointDet_session = nullptr;
MNN::Tensor* input_tensor = nullptr;
};
} // namespace PaddleDetection
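// Minimal usage sketch, assuming a TinyPose MNN model at
// "../weight/tinypose256.mnn" as in main() below (paths, crop values, and
// variable names here are illustrative only):
//
//   PaddleDetection::KeyPointDetector det("../weight/tinypose256.mnn",
//                                         /*num_thread=*/4,
//                                         /*input_height=*/256,
//                                         /*input_width=*/192);
//   std::vector<cv::Mat> crops = {person_crop};            // person patches from a detector
//   std::vector<std::vector<float>> centers = {{cx, cy}};  // crop centers in the full image
//   std::vector<std::vector<float>> scales = {{w, h}};     // crop sizes in the full image
//   std::vector<PaddleDetection::KeyPointResult> out;
//   det.Predict(crops, centers, scales, &out);  // out[i].keypoints: x, y, conf per joint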
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "keypoint_postprocess.h"
#define PI 3.1415926535
#define HALF_CIRCLE_DEGREE 180
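// Completes the three-point correspondence used by cv::getAffineTransform:
// returns a + rot90(a - b), i.e. the vector (a - b) rotated by 90 degrees and
// added to a.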
cv::Point2f get_3rd_point(cv::Point2f& a, cv::Point2f& b) {
cv::Point2f direct{a.x - b.x, a.y - b.y};
return cv::Point2f(a.x - direct.y, a.y + direct.x);
}
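// Rotate the point (src_point_x, src_point_y) by rot_rad radians about the origin.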
std::vector<float> get_dir(float src_point_x,
float src_point_y,
float rot_rad) {
float sn = sin(rot_rad);
float cs = cos(rot_rad);
std::vector<float> src_result{0.0, 0.0};
src_result[0] = src_point_x * cs - src_point_y * sn;
src_result[1] = src_point_x * sn + src_point_y * cs;
return src_result;
}
void affine_tranform(
float pt_x, float pt_y, cv::Mat& trans, std::vector<float>& preds, int p) {
double new1[3] = {pt_x, pt_y, 1.0};
cv::Mat new_pt(3, 1, trans.type(), new1);
cv::Mat w = trans * new_pt;
preds[p * 3 + 1] = static_cast<float>(w.at<double>(0, 0));
preds[p * 3 + 2] = static_cast<float>(w.at<double>(1, 0));
}
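// Build the 2x3 affine matrix mapping a rotated box (center, scale, rot) onto
// an output_size patch, or the inverse mapping when inv != 0. Three point
// pairs are formed: the center, a point offset along the rotated axis, and the
// perpendicular third point from get_3rd_point.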
void get_affine_transform(std::vector<float>& center,
std::vector<float>& scale,
float rot,
std::vector<int>& output_size,
cv::Mat& trans,
int inv) {
float src_w = scale[0];
float dst_w = static_cast<float>(output_size[0]);
float dst_h = static_cast<float>(output_size[1]);
float rot_rad = rot * PI / HALF_CIRCLE_DEGREE;
std::vector<float> src_dir = get_dir(-0.5 * src_w, 0, rot_rad);
std::vector<float> dst_dir{static_cast<float>(-0.5) * dst_w, 0.0};
cv::Point2f srcPoint2f[3], dstPoint2f[3];
srcPoint2f[0] = cv::Point2f(center[0], center[1]);
srcPoint2f[1] = cv::Point2f(center[0] + src_dir[0], center[1] + src_dir[1]);
srcPoint2f[2] = get_3rd_point(srcPoint2f[0], srcPoint2f[1]);
dstPoint2f[0] = cv::Point2f(dst_w * 0.5, dst_h * 0.5);
dstPoint2f[1] =
cv::Point2f(dst_w * 0.5 + dst_dir[0], dst_h * 0.5 + dst_dir[1]);
dstPoint2f[2] = get_3rd_point(dstPoint2f[0], dstPoint2f[1]);
if (inv == 0) {
trans = cv::getAffineTransform(srcPoint2f, dstPoint2f);
} else {
trans = cv::getAffineTransform(dstPoint2f, srcPoint2f);
}
}
void transform_preds(std::vector<float>& coords,
std::vector<float>& center,
std::vector<float>& scale,
std::vector<int>& output_size,
std::vector<int>& dim,
std::vector<float>& target_coords) {
cv::Mat trans(2, 3, CV_64FC1);
get_affine_transform(center, scale, 0, output_size, trans, 1);
for (int p = 0; p < dim[1]; ++p) {
affine_tranform(coords[p * 2], coords[p * 2 + 1], trans, target_coords, p);
}
}
// Only valid for batch size == 1
void get_max_preds(std::vector<float>& heatmap,
std::vector<int>& dim,
std::vector<float>& preds,
std::vector<float>& maxvals,
int batchid,
int joint_idx) {
int num_joints = dim[1];
int width = dim[3];
std::vector<int> idx;
idx.resize(num_joints * 2);
for (int j = 0; j < dim[1]; j++) {
float* index = &(
heatmap[batchid * num_joints * dim[2] * dim[3] + j * dim[2] * dim[3]]);
float* end = index + dim[2] * dim[3];
float* max_dis = std::max_element(index, end);
auto max_id = std::distance(index, max_dis);
maxvals[j] = *max_dis;
if (*max_dis > 0) {
preds[j * 2] = static_cast<float>(max_id % width);
preds[j * 2 + 1] = static_cast<float>(max_id / width);
}
}
}
void dark_parse(std::vector<float>& heatmap,
std::vector<int>& dim,
std::vector<float>& coords,
int px,
int py,
int index,
int ch) {
  /* DARK postprocessing, Zhang et al., "Distribution-Aware Coordinate
  Representation for Human Pose Estimation" (CVPR 2020).
  1) offset = -hessian.inv() * derivative
  2) dx = (heatmap[x+1] - heatmap[x-1]) / 2
  3) dxx = (dx[x+1] - dx[x-1]) / 2
  4) derivative = Mat([dx, dy])
  5) hessian = Mat([[dxx, dxy], [dxy, dyy]])
  */
std::vector<float>::const_iterator first1 = heatmap.begin() + index;
std::vector<float>::const_iterator last1 =
heatmap.begin() + index + dim[2] * dim[3];
std::vector<float> heatmap_ch(first1, last1);
cv::Mat heatmap_mat = cv::Mat(heatmap_ch).reshape(0, dim[2]);
heatmap_mat.convertTo(heatmap_mat, CV_32FC1);
cv::GaussianBlur(heatmap_mat, heatmap_mat, cv::Size(3, 3), 0, 0);
heatmap_mat = heatmap_mat.reshape(1, 1);
heatmap_ch = std::vector<float>(heatmap_mat.reshape(1, 1));
float epsilon = 1e-10;
  // Sample log-heatmap values around the target location.
float xy = log(fmax(heatmap_ch[py * dim[3] + px], epsilon));
float xr = log(fmax(heatmap_ch[py * dim[3] + px + 1], epsilon));
float xl = log(fmax(heatmap_ch[py * dim[3] + px - 1], epsilon));
float xr2 = log(fmax(heatmap_ch[py * dim[3] + px + 2], epsilon));
float xl2 = log(fmax(heatmap_ch[py * dim[3] + px - 2], epsilon));
float yu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px], epsilon));
float yd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px], epsilon));
float yu2 = log(fmax(heatmap_ch[(py + 2) * dim[3] + px], epsilon));
float yd2 = log(fmax(heatmap_ch[(py - 2) * dim[3] + px], epsilon));
float xryu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px + 1], epsilon));
float xryd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px + 1], epsilon));
float xlyu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px - 1], epsilon));
float xlyd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px - 1], epsilon));
  // Compute first (dx, dy) and second (dxx, dxy, dyy) derivatives from the samples.
float dx = 0.5 * (xr - xl);
float dy = 0.5 * (yu - yd);
float dxx = 0.25 * (xr2 - 2 * xy + xl2);
float dxy = 0.25 * (xryu - xryd - xlyu + xlyd);
float dyy = 0.25 * (yu2 - 2 * xy + yd2);
  // Finally derive the sub-pixel offset from the derivative and the hessian.
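  // In closed form: offset = -H^{-1} * D with D = [dx, dy]^T and
  // H = [[dxx, dxy], [dxy, dyy]], where
  //   H^{-1} = 1 / (dxx*dyy - dxy^2) * [[dyy, -dxy], [-dxy, dxx]].
  // This is defined only when det(H) = dxx*dyy - dxy^2 is non-zero, hence the
  // guard below.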
if (dxx * dyy - dxy * dxy != 0) {
    float M[2][2] = {{dxx, dxy}, {dxy, dyy}};
    float D[2] = {dx, dy};
    cv::Mat hessian(2, 2, CV_32F, M);
    cv::Mat derivative(2, 1, CV_32F, D);
    cv::Mat offset = -hessian.inv() * derivative;
coords[ch * 2] += offset.at<float>(0, 0);
coords[ch * 2 + 1] += offset.at<float>(1, 0);
}
}
void get_final_preds(std::vector<float>& heatmap,
std::vector<int>& dim,
std::vector<int>& idxout,
std::vector<int>& idxdim,
std::vector<float>& center,
std::vector<float> scale,
std::vector<float>& preds,
int batchid,
bool DARK) {
std::vector<float> coords;
coords.resize(dim[1] * 2);
int heatmap_height = dim[2];
int heatmap_width = dim[3];
for (int j = 0; j < dim[1]; ++j) {
int index = (batchid * dim[1] + j) * dim[2] * dim[3];
int idx = idxout[batchid * dim[1] + j];
preds[j * 3] = heatmap[index + idx];
coords[j * 2] = idx % heatmap_width;
coords[j * 2 + 1] = idx / heatmap_width;
int px = int(coords[j * 2] + 0.5);
int py = int(coords[j * 2 + 1] + 0.5);
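    // Refine the integer argmax to sub-pixel accuracy: DARK fits a local
    // Taylor expansion of the log-heatmap; the fallback shifts a quarter pixel
    // toward the higher-valued neighbor.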
if (DARK && px > 1 && px < heatmap_width - 2 && py > 1 &&
py < heatmap_height - 2) {
dark_parse(heatmap, dim, coords, px, py, index, j);
} else {
if (px > 0 && px < heatmap_width - 1) {
float diff_x = heatmap[index + py * dim[3] + px + 1] -
heatmap[index + py * dim[3] + px - 1];
        coords[j * 2] += diff_x > 0 ? 0.25 : -0.25;
}
if (py > 0 && py < heatmap_height - 1) {
float diff_y = heatmap[index + (py + 1) * dim[3] + px] -
heatmap[index + (py - 1) * dim[3] + px];
        coords[j * 2 + 1] += diff_y > 0 ? 0.25 : -0.25;
}
}
}
std::vector<int> img_size{heatmap_width, heatmap_height};
transform_preds(coords, center, scale, img_size, dim, preds);
}
void CropImg(cv::Mat& img,
cv::Mat& crop_img,
std::vector<int>& area,
std::vector<float>& center,
std::vector<float>& scale,
float expandratio) {
int crop_x1 = std::max(0, area[0]);
int crop_y1 = std::max(0, area[1]);
int crop_x2 = std::min(img.cols - 1, area[2]);
int crop_y2 = std::min(img.rows - 1, area[3]);
int center_x = (crop_x1 + crop_x2) / 2.;
int center_y = (crop_y1 + crop_y2) / 2.;
int half_h = (crop_y2 - crop_y1) / 2.;
int half_w = (crop_x2 - crop_x1) / 2.;
if (half_h * 3 > half_w * 4) {
half_w = static_cast<int>(half_h * 0.75);
} else {
half_h = static_cast<int>(half_w * 4 / 3);
}
crop_x1 =
std::max(0, center_x - static_cast<int>(half_w * (1 + expandratio)));
crop_y1 =
std::max(0, center_y - static_cast<int>(half_h * (1 + expandratio)));
crop_x2 = std::min(img.cols - 1,
static_cast<int>(center_x + half_w * (1 + expandratio)));
crop_y2 = std::min(img.rows - 1,
static_cast<int>(center_y + half_h * (1 + expandratio)));
crop_img =
img(cv::Range(crop_y1, crop_y2 + 1), cv::Range(crop_x1, crop_x2 + 1));
center.clear();
center.emplace_back((crop_x1 + crop_x2) / 2);
center.emplace_back((crop_y1 + crop_y2) / 2);
scale.clear();
scale.emplace_back((crop_x2 - crop_x1));
scale.emplace_back((crop_y2 - crop_y1));
}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <vector>
std::vector<float> get_3rd_point(std::vector<float>& a, std::vector<float>& b);
std::vector<float> get_dir(float src_point_x, float src_point_y, float rot_rad);
void affine_tranform(
    float pt_x, float pt_y, cv::Mat& trans, std::vector<float>& preds, int p);
void get_affine_transform(std::vector<float>& center,
                          std::vector<float>& scale,
                          float rot,
                          std::vector<int>& output_size,
                          cv::Mat& trans,
                          int inv);
void transform_preds(std::vector<float>& coords,
std::vector<float>& center,
std::vector<float>& scale,
std::vector<int>& output_size,
std::vector<int>& dim,
std::vector<float>& target_coords);
void box_to_center_scale(std::vector<int>& box,
int width,
int height,
std::vector<float>& center,
std::vector<float>& scale);
void get_max_preds(std::vector<float>& heatmap,
std::vector<int>& dim,
std::vector<float>& preds,
std::vector<float>& maxvals,
int batchid,
int joint_idx);
void get_final_preds(std::vector<float>& heatmap,
std::vector<int>& dim,
std::vector<int>& idxout,
std::vector<int>& idxdim,
std::vector<float>& center,
std::vector<float> scale,
std::vector<float>& preds,
int batchid,
bool DARK = true);
void CropImg(cv::Mat& img,
cv::Mat& crop_img,
std::vector<int>& area,
std::vector<float>& center,
std::vector<float>& scale,
float expandratio = 0.25);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_mnn
#include <iostream>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include "keypoint_detector.h"
#include "picodet_mnn.h"
#define __SAVE_RESULT__  // if defined, save drawn results to ../results;
                         // otherwise show them in a window
using namespace PaddleDetection;
struct object_rect {
int x;
int y;
int width;
int height;
};
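// Letterbox resize: scale src into dst_size while preserving aspect ratio, pad
// the remainder with black, and record in effect_area the region actually
// covered by the image (used later to map boxes back to source coordinates).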
int resize_uniform(cv::Mat& src,
cv::Mat& dst,
cv::Size dst_size,
object_rect& effect_area) {
int w = src.cols;
int h = src.rows;
int dst_w = dst_size.width;
int dst_h = dst_size.height;
dst = cv::Mat(cv::Size(dst_w, dst_h), CV_8UC3, cv::Scalar(0));
float ratio_src = w * 1.0 / h;
float ratio_dst = dst_w * 1.0 / dst_h;
int tmp_w = 0;
int tmp_h = 0;
if (ratio_src > ratio_dst) {
tmp_w = dst_w;
tmp_h = floor((dst_w * 1.0 / w) * h);
} else if (ratio_src < ratio_dst) {
tmp_h = dst_h;
tmp_w = floor((dst_h * 1.0 / h) * w);
} else {
cv::resize(src, dst, dst_size);
effect_area.x = 0;
effect_area.y = 0;
effect_area.width = dst_w;
effect_area.height = dst_h;
return 0;
}
cv::Mat tmp;
cv::resize(src, tmp, cv::Size(tmp_w, tmp_h));
if (tmp_w != dst_w) {
int index_w = floor((dst_w - tmp_w) / 2.0);
for (int i = 0; i < dst_h; i++) {
memcpy(dst.data + i * dst_w * 3 + index_w * 3,
tmp.data + i * tmp_w * 3,
tmp_w * 3);
}
effect_area.x = index_w;
effect_area.y = 0;
effect_area.width = tmp_w;
effect_area.height = tmp_h;
} else if (tmp_h != dst_h) {
int index_h = floor((dst_h - tmp_h) / 2.0);
memcpy(dst.data + index_h * dst_w * 3, tmp.data, tmp_w * tmp_h * 3);
effect_area.x = 0;
effect_area.y = index_h;
effect_area.width = tmp_w;
effect_area.height = tmp_h;
} else {
printf("error\n");
}
return 0;
}
const int color_list[80][3] = {
{216, 82, 24}, {236, 176, 31}, {125, 46, 141}, {118, 171, 47},
{76, 189, 237}, {238, 19, 46}, {76, 76, 76}, {153, 153, 153},
{255, 0, 0}, {255, 127, 0}, {190, 190, 0}, {0, 255, 0},
{0, 0, 255}, {170, 0, 255}, {84, 84, 0}, {84, 170, 0},
{84, 255, 0}, {170, 84, 0}, {170, 170, 0}, {170, 255, 0},
{255, 84, 0}, {255, 170, 0}, {255, 255, 0}, {0, 84, 127},
{0, 170, 127}, {0, 255, 127}, {84, 0, 127}, {84, 84, 127},
{84, 170, 127}, {84, 255, 127}, {170, 0, 127}, {170, 84, 127},
{170, 170, 127}, {170, 255, 127}, {255, 0, 127}, {255, 84, 127},
{255, 170, 127}, {255, 255, 127}, {0, 84, 255}, {0, 170, 255},
{0, 255, 255}, {84, 0, 255}, {84, 84, 255}, {84, 170, 255},
{84, 255, 255}, {170, 0, 255}, {170, 84, 255}, {170, 170, 255},
{170, 255, 255}, {255, 0, 255}, {255, 84, 255}, {255, 170, 255},
{42, 0, 0}, {84, 0, 0}, {127, 0, 0}, {170, 0, 0},
{212, 0, 0}, {255, 0, 0}, {0, 42, 0}, {0, 84, 0},
{0, 127, 0}, {0, 170, 0}, {0, 212, 0}, {0, 255, 0},
{0, 0, 42}, {0, 0, 84}, {0, 0, 127}, {0, 0, 170},
{0, 0, 212}, {0, 0, 255}, {0, 0, 0}, {36, 36, 36},
{72, 72, 72}, {109, 109, 109}, {145, 145, 145}, {182, 182, 182},
{218, 218, 218}, {0, 113, 188}, {80, 182, 188}, {127, 127, 0},
};
void draw_bboxes(const cv::Mat& bgr,
const std::vector<BoxInfo>& bboxes,
object_rect effect_roi,
std::string save_path = "None") {
static const char* class_names[] = {
"person", "bicycle", "car",
"motorcycle", "airplane", "bus",
"train", "truck", "boat",
"traffic light", "fire hydrant", "stop sign",
"parking meter", "bench", "bird",
"cat", "dog", "horse",
"sheep", "cow", "elephant",
"bear", "zebra", "giraffe",
"backpack", "umbrella", "handbag",
"tie", "suitcase", "frisbee",
"skis", "snowboard", "sports ball",
"kite", "baseball bat", "baseball glove",
"skateboard", "surfboard", "tennis racket",
"bottle", "wine glass", "cup",
"fork", "knife", "spoon",
"bowl", "banana", "apple",
"sandwich", "orange", "broccoli",
"carrot", "hot dog", "pizza",
"donut", "cake", "chair",
"couch", "potted plant", "bed",
"dining table", "toilet", "tv",
"laptop", "mouse", "remote",
"keyboard", "cell phone", "microwave",
"oven", "toaster", "sink",
"refrigerator", "book", "clock",
"vase", "scissors", "teddy bear",
"hair drier", "toothbrush"};
cv::Mat image = bgr.clone();
int src_w = image.cols;
int src_h = image.rows;
int dst_w = effect_roi.width;
int dst_h = effect_roi.height;
float width_ratio = (float)src_w / (float)dst_w;
float height_ratio = (float)src_h / (float)dst_h;
for (size_t i = 0; i < bboxes.size(); i++) {
const BoxInfo& bbox = bboxes[i];
cv::Scalar color = cv::Scalar(color_list[bbox.label][0],
color_list[bbox.label][1],
color_list[bbox.label][2]);
cv::rectangle(image,
cv::Rect(cv::Point((bbox.x1 - effect_roi.x) * width_ratio,
(bbox.y1 - effect_roi.y) * height_ratio),
cv::Point((bbox.x2 - effect_roi.x) * width_ratio,
(bbox.y2 - effect_roi.y) * height_ratio)),
color);
char text[256];
sprintf(text, "%s %.1f%%", class_names[bbox.label], bbox.score * 100);
int baseLine = 0;
cv::Size label_size =
cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine);
int x = (bbox.x1 - effect_roi.x) * width_ratio;
int y =
(bbox.y1 - effect_roi.y) * height_ratio - label_size.height - baseLine;
if (y < 0) y = 0;
if (x + label_size.width > image.cols) x = image.cols - label_size.width;
cv::rectangle(
image,
cv::Rect(cv::Point(x, y),
cv::Size(label_size.width, label_size.height + baseLine)),
color,
-1);
cv::putText(image,
text,
cv::Point(x, y + label_size.height),
cv::FONT_HERSHEY_SIMPLEX,
0.4,
cv::Scalar(255, 255, 255));
}
if (save_path == "None") {
cv::imshow("image", image);
} else {
cv::imwrite(save_path, image);
std::cout << save_path << std::endl;
}
}
std::vector<BoxInfo> coordsback(const cv::Mat image,
const object_rect effect_roi,
const std::vector<BoxInfo>& bboxes) {
int src_w = image.cols;
int src_h = image.rows;
int dst_w = effect_roi.width;
int dst_h = effect_roi.height;
float width_ratio = (float)src_w / (float)dst_w;
float height_ratio = (float)src_h / (float)dst_h;
std::vector<BoxInfo> bboxes_oimg;
for (int i = 0; i < bboxes.size(); i++) {
auto bbox = bboxes[i];
bbox.x1 = (bbox.x1 - effect_roi.x) * width_ratio;
bbox.y1 = (bbox.y1 - effect_roi.y) * height_ratio;
bbox.x2 = (bbox.x2 - effect_roi.x) * width_ratio;
bbox.y2 = (bbox.y2 - effect_roi.y) * height_ratio;
bboxes_oimg.emplace_back(bbox);
}
return bboxes_oimg;
}
void image_infer_kpts(KeyPointDetector* kpts_detector,
cv::Mat image,
const object_rect effect_roi,
const std::vector<BoxInfo>& results,
std::string img_name = "kpts_vis",
bool save_img = true) {
std::vector<cv::Mat> cropimgs;
std::vector<std::vector<float>> center_bs;
std::vector<std::vector<float>> scale_bs;
std::vector<KeyPointResult> kpts_results;
auto results_oimg = coordsback(image, effect_roi, results);
for (int i = 0; i < results_oimg.size(); i++) {
auto rect = results_oimg[i];
if (rect.label == 0) {
cv::Mat cropimg;
std::vector<float> center, scale;
std::vector<int> area = {static_cast<int>(rect.x1),
static_cast<int>(rect.y1),
static_cast<int>(rect.x2),
static_cast<int>(rect.y2)};
CropImg(image, cropimg, area, center, scale);
// cv::imwrite("./test_crop_"+std::to_string(i)+".jpg", cropimg);
cropimgs.emplace_back(cropimg);
center_bs.emplace_back(center);
scale_bs.emplace_back(scale);
}
if (cropimgs.size() == 1 ||
(cropimgs.size() > 0 && i == results_oimg.size() - 1)) {
kpts_detector->Predict(cropimgs, center_bs, scale_bs, &kpts_results);
cropimgs.clear();
center_bs.clear();
scale_bs.clear();
}
}
std::vector<int> compression_params;
compression_params.push_back(cv::IMWRITE_JPEG_QUALITY);
compression_params.push_back(95);
std::string kpts_savepath =
"keypoint_" + img_name.substr(img_name.find_last_of('/') + 1);
cv::Mat kpts_vis_img =
VisualizeKptsResult(image, kpts_results, {0, 255, 0}, 0.3);
if (save_img) {
cv::imwrite(kpts_savepath, kpts_vis_img, compression_params);
printf("Visualized output saved as %s\n", kpts_savepath.c_str());
} else {
cv::imshow("image", kpts_vis_img);
}
}
int image_demo(PicoDet& detector,
KeyPointDetector* kpts_detector,
const char* imagepath) {
std::vector<cv::String> filenames;
cv::glob(imagepath, filenames, false);
for (auto img_name : filenames) {
cv::Mat image = cv::imread(img_name);
if (image.empty()) {
fprintf(stderr, "cv::imread %s failed\n", img_name.c_str());
return -1;
}
object_rect effect_roi;
cv::Mat resized_img;
resize_uniform(image, resized_img, cv::Size(320, 320), effect_roi);
std::vector<BoxInfo> results;
detector.detect(resized_img, results);
if (kpts_detector) {
image_infer_kpts(kpts_detector, image, effect_roi, results, img_name);
}
}
return 0;
}
int webcam_demo(PicoDet& detector,
KeyPointDetector* kpts_detector,
int cam_id) {
cv::Mat image;
cv::VideoCapture cap(cam_id);
while (true) {
cap >> image;
object_rect effect_roi;
cv::Mat resized_img;
resize_uniform(image, resized_img, cv::Size(320, 320), effect_roi);
std::vector<BoxInfo> results;
detector.detect(resized_img, results);
if (kpts_detector) {
image_infer_kpts(kpts_detector, image, effect_roi, results, "", false);
}
}
return 0;
}
int video_demo(PicoDet& detector,
KeyPointDetector* kpts_detector,
const char* path) {
cv::Mat image;
cv::VideoCapture cap(path);
while (true) {
cap >> image;
object_rect effect_roi;
cv::Mat resized_img;
resize_uniform(image, resized_img, cv::Size(320, 320), effect_roi);
std::vector<BoxInfo> results;
detector.detect(resized_img, results);
if (kpts_detector) {
image_infer_kpts(kpts_detector, image, effect_roi, results, "", false);
}
}
return 0;
}
int benchmark(KeyPointDetector* kpts_detector) {
int loop_num = 100;
int warm_up = 8;
double time_min = DBL_MAX;
double time_max = -DBL_MAX;
double time_avg = 0;
cv::Mat image(256, 192, CV_8UC3, cv::Scalar(1, 1, 1));
std::vector<float> center = {128, 96};
std::vector<float> scale = {256, 192};
std::vector<cv::Mat> cropimgs = {image};
std::vector<std::vector<float>> center_bs = {center};
std::vector<std::vector<float>> scale_bs = {scale};
std::vector<KeyPointResult> kpts_results;
for (int i = 0; i < warm_up + loop_num; i++) {
auto start = std::chrono::steady_clock::now();
std::vector<BoxInfo> results;
kpts_detector->Predict(cropimgs, center_bs, scale_bs, &kpts_results);
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> elapsed = end - start;
double time = elapsed.count();
if (i >= warm_up) {
time_min = (std::min)(time_min, time);
time_max = (std::max)(time_max, time);
time_avg += time;
}
}
time_avg /= loop_num;
fprintf(stderr,
"%20s min = %7.2f max = %7.2f avg = %7.2f\n",
"tinypose",
time_min,
time_max,
time_avg);
return 0;
}
int main(int argc, char** argv) {
if (argc != 3) {
fprintf(stderr,
"usage: %s [mode] [path]. \n For webcam mode=0, path is cam id; \n "
"For image demo, mode=1, path=xxx/xxx/*.jpg; \n For video, mode=2; "
"\n For benchmark, mode=3 path=0.\n",
argv[0]);
return -1;
}
PicoDet detector =
PicoDet("../weight/picodet_m_416.mnn", 416, 416, 4, 0.45, 0.3);
KeyPointDetector* kpts_detector =
new KeyPointDetector("../weight/tinypose256.mnn", 4, 256, 192);
int mode = atoi(argv[1]);
switch (mode) {
case 0: {
int cam_id = atoi(argv[2]);
webcam_demo(detector, kpts_detector, cam_id);
break;
}
case 1: {
const char* images = argv[2];
image_demo(detector, kpts_detector, images);
break;
}
case 2: {
const char* path = argv[2];
video_demo(detector, kpts_detector, path);
break;
}
case 3: {
benchmark(kpts_detector);
break;
}
default: {
fprintf(stderr,
"usage: %s [mode] [path]. \n For webcam mode=0, path is cam id; "
"\n For image demo, mode=1, path=xxx/xxx/*.jpg; \n For video, "
"mode=2; \n For benchmark, mode=3 path=0.\n",
argv[0]);
break;
}
}
delete kpts_detector;
kpts_detector = nullptr;
}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_mnn
#include "picodet_mnn.h"
using namespace std;
PicoDet::PicoDet(const std::string &mnn_path,
int input_width,
int input_length,
int num_thread_,
float score_threshold_,
float nms_threshold_) {
num_thread = num_thread_;
in_w = input_width;
in_h = input_length;
score_threshold = score_threshold_;
nms_threshold = nms_threshold_;
PicoDet_interpreter = std::shared_ptr<MNN::Interpreter>(
MNN::Interpreter::createFromFile(mnn_path.c_str()));
MNN::ScheduleConfig config;
config.numThread = num_thread;
MNN::BackendConfig backendConfig;
  backendConfig.precision =
      (MNN::BackendConfig::PrecisionMode)2;  // 2 == Precision_Low
config.backendConfig = &backendConfig;
PicoDet_session = PicoDet_interpreter->createSession(config);
input_tensor = PicoDet_interpreter->getSessionInput(PicoDet_session, nullptr);
}
PicoDet::~PicoDet() {
PicoDet_interpreter->releaseModel();
PicoDet_interpreter->releaseSession(PicoDet_session);
}
int PicoDet::detect(cv::Mat &raw_image, std::vector<BoxInfo> &result_list) {
if (raw_image.empty()) {
std::cout << "image is empty ,please check!" << std::endl;
return -1;
}
image_h = raw_image.rows;
image_w = raw_image.cols;
cv::Mat image;
cv::resize(raw_image, image, cv::Size(in_w, in_h));
PicoDet_interpreter->resizeTensor(input_tensor, {1, 3, in_h, in_w});
PicoDet_interpreter->resizeSession(PicoDet_session);
std::shared_ptr<MNN::CV::ImageProcess> pretreat(MNN::CV::ImageProcess::create(
MNN::CV::BGR, MNN::CV::BGR, mean_vals, 3, norm_vals, 3));
pretreat->convert(image.data, in_w, in_h, image.step[0], input_tensor);
auto start = chrono::steady_clock::now();
// run network
PicoDet_interpreter->runSession(PicoDet_session);
// get output data
std::vector<std::vector<BoxInfo>> results;
results.resize(num_class);
for (const auto &head_info : heads_info) {
MNN::Tensor *tensor_scores = PicoDet_interpreter->getSessionOutput(
PicoDet_session, head_info.cls_layer.c_str());
MNN::Tensor *tensor_boxes = PicoDet_interpreter->getSessionOutput(
PicoDet_session, head_info.dis_layer.c_str());
MNN::Tensor tensor_scores_host(tensor_scores,
tensor_scores->getDimensionType());
tensor_scores->copyToHostTensor(&tensor_scores_host);
MNN::Tensor tensor_boxes_host(tensor_boxes,
tensor_boxes->getDimensionType());
tensor_boxes->copyToHostTensor(&tensor_boxes_host);
decode_infer(&tensor_scores_host,
&tensor_boxes_host,
head_info.stride,
score_threshold,
results);
}
auto end = chrono::steady_clock::now();
chrono::duration<double> elapsed = end - start;
cout << "inference time:" << elapsed.count() << " s, ";
for (int i = 0; i < (int)results.size(); i++) {
nms(results[i], nms_threshold);
for (auto box : results[i]) {
box.x1 = box.x1 / in_w * image_w;
box.x2 = box.x2 / in_w * image_w;
box.y1 = box.y1 / in_h * image_h;
box.y2 = box.y2 / in_h * image_h;
result_list.push_back(box);
}
}
cout << "detect " << result_list.size() << " objects." << std::endl;
;
return 0;
}
void PicoDet::decode_infer(MNN::Tensor *cls_pred,
MNN::Tensor *dis_pred,
int stride,
float threshold,
std::vector<std::vector<BoxInfo>> &results) {
int feature_h = in_h / stride;
int feature_w = in_w / stride;
for (int idx = 0; idx < feature_h * feature_w; idx++) {
const float *scores = cls_pred->host<float>() + (idx * num_class);
int row = idx / feature_w;
int col = idx % feature_w;
float score = 0;
int cur_label = 0;
for (int label = 0; label < num_class; label++) {
if (scores[label] > score) {
score = scores[label];
cur_label = label;
}
}
if (score > threshold) {
const float *bbox_pred =
dis_pred->host<float>() + (idx * 4 * (reg_max + 1));
results[cur_label].push_back(
disPred2Bbox(bbox_pred, cur_label, score, col, row, stride));
}
}
}
BoxInfo PicoDet::disPred2Bbox(
const float *&dfl_det, int label, float score, int x, int y, int stride) {
float ct_x = (x + 0.5) * stride;
float ct_y = (y + 0.5) * stride;
std::vector<float> dis_pred;
dis_pred.resize(4);
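  // Distribution Focal Loss (DFL) decoding: each of the four box distances is
  // predicted as a discrete distribution over {0, ..., reg_max}; the decoded
  // distance is the softmax expectation scaled by the stride:
  //   dis = stride * sum_j j * softmax(logits)_j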
for (int i = 0; i < 4; i++) {
float dis = 0;
float *dis_after_sm = new float[reg_max + 1];
activation_function_softmax(
dfl_det + i * (reg_max + 1), dis_after_sm, reg_max + 1);
for (int j = 0; j < reg_max + 1; j++) {
dis += j * dis_after_sm[j];
}
dis *= stride;
dis_pred[i] = dis;
delete[] dis_after_sm;
}
float xmin = (std::max)(ct_x - dis_pred[0], .0f);
float ymin = (std::max)(ct_y - dis_pred[1], .0f);
float xmax = (std::min)(ct_x + dis_pred[2], (float)in_w);
float ymax = (std::min)(ct_y + dis_pred[3], (float)in_h);
return BoxInfo{xmin, ymin, xmax, ymax, score, label};
}
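// Greedy IoU-based NMS: sort boxes by score, keep the highest-scoring box, and
// drop any later box whose IoU with a kept box is >= NMS_THRESH (the +1 terms
// treat coordinates as inclusive pixel indices).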
void PicoDet::nms(std::vector<BoxInfo> &input_boxes, float NMS_THRESH) {
std::sort(input_boxes.begin(), input_boxes.end(), [](BoxInfo a, BoxInfo b) {
return a.score > b.score;
});
std::vector<float> vArea(input_boxes.size());
for (int i = 0; i < int(input_boxes.size()); ++i) {
vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1) *
(input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1);
}
for (int i = 0; i < int(input_boxes.size()); ++i) {
for (int j = i + 1; j < int(input_boxes.size());) {
float xx1 = (std::max)(input_boxes[i].x1, input_boxes[j].x1);
float yy1 = (std::max)(input_boxes[i].y1, input_boxes[j].y1);
float xx2 = (std::min)(input_boxes[i].x2, input_boxes[j].x2);
float yy2 = (std::min)(input_boxes[i].y2, input_boxes[j].y2);
float w = (std::max)(float(0), xx2 - xx1 + 1);
float h = (std::max)(float(0), yy2 - yy1 + 1);
float inter = w * h;
float ovr = inter / (vArea[i] + vArea[j] - inter);
if (ovr >= NMS_THRESH) {
input_boxes.erase(input_boxes.begin() + j);
vArea.erase(vArea.begin() + j);
} else {
j++;
}
}
}
}
string PicoDet::get_label_str(int label) { return labels[label]; }
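// Schraudolph-style fast exp: stores 2^23 * (x / ln 2 + 126.93490512) directly
// into the bit pattern of an IEEE-754 float, so v.f approximates e^x
// (1.4426950409 = 1 / ln 2; 126.93490512 = 127 minus a bias-correction term).
// Accurate only to a few percent, which is enough for the softmax below.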
inline float fast_exp(float x) {
union {
uint32_t i;
float f;
} v{};
v.i = (1 << 23) * (1.4426950409 * x + 126.93490512f);
return v.f;
}
inline float sigmoid(float x) { return 1.0f / (1.0f + fast_exp(-x)); }
template <typename _Tp>
int activation_function_softmax(const _Tp *src, _Tp *dst, int length) {
const _Tp alpha = *std::max_element(src, src + length);
_Tp denominator{0};
for (int i = 0; i < length; ++i) {
dst[i] = fast_exp(src[i] - alpha);
denominator += dst[i];
}
for (int i = 0; i < length; ++i) {
dst[i] /= denominator;
}
return 0;
}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_mnn
#ifndef __PicoDet_H__
#define __PicoDet_H__
#pragma once
#include "Interpreter.hpp"
#include <algorithm>
#include <chrono>
#include <iostream>
#include <memory>
#include <opencv2/opencv.hpp>
#include <string>
#include <vector>
#include "ImageProcess.hpp"
#include "MNNDefine.h"
#include "Tensor.hpp"
typedef struct HeadInfo_ {
std::string cls_layer;
std::string dis_layer;
int stride;
} HeadInfo;
typedef struct BoxInfo_ {
float x1;
float y1;
float x2;
float y2;
float score;
int label;
} BoxInfo;
class PicoDet {
public:
PicoDet(const std::string &mnn_path,
int input_width,
int input_length,
int num_thread_ = 4,
float score_threshold_ = 0.5,
float nms_threshold_ = 0.3);
~PicoDet();
int detect(cv::Mat &img, std::vector<BoxInfo> &result_list);
std::string get_label_str(int label);
private:
void decode_infer(MNN::Tensor *cls_pred,
MNN::Tensor *dis_pred,
int stride,
float threshold,
std::vector<std::vector<BoxInfo>> &results);
BoxInfo disPred2Bbox(
const float *&dfl_det, int label, float score, int x, int y, int stride);
void nms(std::vector<BoxInfo> &input_boxes, float NMS_THRESH);
private:
std::shared_ptr<MNN::Interpreter> PicoDet_interpreter;
MNN::Session *PicoDet_session = nullptr;
MNN::Tensor *input_tensor = nullptr;
int num_thread;
int image_w;
int image_h;
int in_w = 320;
int in_h = 320;
float score_threshold;
float nms_threshold;
const float mean_vals[3] = {103.53f, 116.28f, 123.675f};
const float norm_vals[3] = {0.017429f, 0.017507f, 0.017125f};
const int num_class = 80;
const int reg_max = 7;
std::vector<HeadInfo> heads_info{
// cls_pred|dis_pred|stride
{"save_infer_model/scale_0.tmp_1", "save_infer_model/scale_4.tmp_1", 8},
{"save_infer_model/scale_1.tmp_1", "save_infer_model/scale_5.tmp_1", 16},
{"save_infer_model/scale_2.tmp_1", "save_infer_model/scale_6.tmp_1", 32},
{"save_infer_model/scale_3.tmp_1", "save_infer_model/scale_7.tmp_1", 64},
};
std::vector<std::string> labels{
"person", "bicycle", "car",
"motorcycle", "airplane", "bus",
"train", "truck", "boat",
"traffic light", "fire hydrant", "stop sign",
"parking meter", "bench", "bird",
"cat", "dog", "horse",
"sheep", "cow", "elephant",
"bear", "zebra", "giraffe",
"backpack", "umbrella", "handbag",
"tie", "suitcase", "frisbee",
"skis", "snowboard", "sports ball",
"kite", "baseball bat", "baseball glove",
"skateboard", "surfboard", "tennis racket",
"bottle", "wine glass", "cup",
"fork", "knife", "spoon",
"bowl", "banana", "apple",
"sandwich", "orange", "broccoli",
"carrot", "hot dog", "pizza",
"donut", "cake", "chair",
"couch", "potted plant", "bed",
"dining table", "toilet", "tv",
"laptop", "mouse", "remote",
"keyboard", "cell phone", "microwave",
"oven", "toaster", "sink",
"refrigerator", "book", "clock",
"vase", "scissors", "teddy bear",
"hair drier", "toothbrush"};
};
template <typename _Tp>
int activation_function_softmax(const _Tp *src, _Tp *dst, int length);
inline float fast_exp(float x);
inline float sigmoid(float x);
#endif
cmake_minimum_required(VERSION 3.9)
set(CMAKE_CXX_STANDARD 17)
project(picodet_demo)
find_package(OpenMP REQUIRED)
if(OPENMP_FOUND)
message("OPENMP FOUND")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
endif()
# find_package(OpenCV REQUIRED)
find_package(OpenCV REQUIRED PATHS "/path/to/opencv-3.4.16_gcc8.2_ffmpeg")
# find_package(ncnn REQUIRED)
find_package(ncnn REQUIRED PATHS "/path/to/ncnn/build/install/lib/cmake/ncnn")
if(NOT TARGET ncnn)
message(WARNING "ncnn NOT FOUND! Please set ncnn_DIR environment variable")
else()
message("ncnn FOUND ")
endif()
include_directories(
${OpenCV_INCLUDE_DIRS}
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_BINARY_DIR}
)
add_executable(picodet_demo main.cpp picodet.cpp)
target_link_libraries(
picodet_demo
ncnn
${OpenCV_LIBS}
)
# PicoDet NCNN Demo
This demo's inference code runs on [Tencent's NCNN framework](https://github.com/Tencent/ncnn).
# Step 1: Build
## Windows
### Step1.
Download and Install Visual Studio from https://visualstudio.microsoft.com/vs/community/
### Step2.
Download and install OpenCV from https://github.com/opencv/opencv/releases
For convenience, if your environment is gcc 8.2 on x86, you can download the prebuilt library directly:
```shell
wget https://paddledet.bj.bcebos.com/data/opencv-3.4.16_gcc8.2_ffmpeg.tar.gz
tar -xf opencv-3.4.16_gcc8.2_ffmpeg.tar.gz
```
### Step3 (optional).
Download and install Vulkan SDK from https://vulkan.lunarg.com/sdk/home
### Step4: Build NCNN
``` shell script
git clone --recursive https://github.com/Tencent/ncnn.git
```
Build NCNN following this tutorial: [Build for Windows x64 using VS2017](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-windows-x64-using-visual-studio-community-2017)
### Step5.
Add `ncnn_DIR` = `YOUR_NCNN_PATH/build/install/lib/cmake/ncnn` to your system environment variables.
Build project: Open x64 Native Tools Command Prompt for VS 2019 or 2017
``` cmd
cd <this-folder>
mkdir build
cd build
cmake ..
msbuild picodet_demo.vcxproj /p:configuration=release /p:platform=x64
```
## Linux
### Step1.
Build and install OpenCV from https://github.com/opencv/opencv
### Step2 (optional).
Download Vulkan SDK from https://vulkan.lunarg.com/sdk/home
### Step3: Build NCNN
Clone NCNN repository
``` shell script
git clone --recursive https://github.com/Tencent/ncnn.git
```
Build NCNN following this tutorial: [Build for Linux / NVIDIA Jetson / Raspberry Pi](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-linux)
### Step4: Build the executable
``` shell script
cd <this-folder>
mkdir build
cd build
cmake ..
make
```
# Run demo
- Prepare the model
```shell
modelName=picodet_s_320_coco_lcnet
# Export the inference model
python tools/export_model.py \
-c configs/picodet/${modelName}.yml \
-o weights=${modelName}.pdparams \
--output_dir=inference_model
# Convert to ONNX
paddle2onnx --model_dir inference_model/${modelName} \
--model_filename model.pdmodel \
--params_filename model.pdiparams \
--opset_version 11 \
--save_file ${modelName}.onnx
# Simplify the model
python -m onnxsim ${modelName}.onnx ${modelName}_processed.onnx
# Convert the model to NCNN format:
# run onnx2ncnn from the ncnn tools to generate the .param and .bin files.
```
Alternatively, convert to NCNN with the online tool [https://convertmodel.com](https://convertmodel.com/).
For a quick test, download directly: [picodet_s_320_coco_lcnet-opt.bin](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_s_320_coco_lcnet-opt.bin) / [picodet_s_320_coco_lcnet-opt.param](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_s_320_coco_lcnet-opt.param) (without post-processing).
**Note:** with post-processing included, NCNN inference currently produces NaN, so use the demo without post-processing for now; a demo with post-processing is being upgraded and will be released soon.
## Run
First, create a directory for the prediction results:
```shell
cp -r ../demo_onnxruntime/imgs .
cd build
mkdir ../results
```
- Run inference on a single image
``` shell
./picodet_demo 0 ../picodet_s_320_coco_lcnet.bin ../picodet_s_320_coco_lcnet.param 320 320 ../imgs/dog.jpg 0
```
See `main.cpp` for details on each parameter.
- Speed benchmark
``` shell
./picodet_demo 1 ../picodet_s_320_lcnet.bin ../picodet_s_320_lcnet.param 320 320 0
```
## FAQ
- Prediction results are inaccurate:
First confirm that the model input shape matches and that the model output names are aligned. For the enhanced PicoDet model without post-processing, the output names are:
```shell
# classification branch | detection branch
{"transpose_0.tmp_0", "transpose_1.tmp_0"},
{"transpose_2.tmp_0", "transpose_3.tmp_0"},
{"transpose_4.tmp_0", "transpose_5.tmp_0"},
{"transpose_6.tmp_0", "transpose_7.tmp_0"},
```
Use [netron](https://netron.app) to inspect the exact names, then update the corresponding `non_postprocess_heads_info` array in `picodet_mnn.hpp`.
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_ncnn
#include "picodet.h"
#include <benchmark.h>
#include <iostream>
#include <net.h>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#define __SAVE_RESULT__  // if defined, save drawn results to ../results;
                         // otherwise show them in a window
struct object_rect {
int x;
int y;
int width;
int height;
};
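// Generate a distinct color per class id using the PASCAL VOC colormap scheme:
// the bits of the label index are spread across the R/G/B channels from the
// most significant bit downward.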
std::vector<int> GenerateColorMap(int num_class) {
auto colormap = std::vector<int>(3 * num_class, 0);
for (int i = 0; i < num_class; ++i) {
int j = 0;
int lab = i;
while (lab) {
colormap[i * 3] |= (((lab >> 0) & 1) << (7 - j));
colormap[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j));
colormap[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j));
++j;
lab >>= 3;
}
}
return colormap;
}
void draw_bboxes(const cv::Mat &im, const std::vector<BoxInfo> &bboxes,
std::string save_path = "None") {
static const char *class_names[] = {
"person", "bicycle", "car",
"motorcycle", "airplane", "bus",
"train", "truck", "boat",
"traffic light", "fire hydrant", "stop sign",
"parking meter", "bench", "bird",
"cat", "dog", "horse",
"sheep", "cow", "elephant",
"bear", "zebra", "giraffe",
"backpack", "umbrella", "handbag",
"tie", "suitcase", "frisbee",
"skis", "snowboard", "sports ball",
"kite", "baseball bat", "baseball glove",
"skateboard", "surfboard", "tennis racket",
"bottle", "wine glass", "cup",
"fork", "knife", "spoon",
"bowl", "banana", "apple",
"sandwich", "orange", "broccoli",
"carrot", "hot dog", "pizza",
"donut", "cake", "chair",
"couch", "potted plant", "bed",
"dining table", "toilet", "tv",
"laptop", "mouse", "remote",
"keyboard", "cell phone", "microwave",
"oven", "toaster", "sink",
"refrigerator", "book", "clock",
"vase", "scissors", "teddy bear",
"hair drier", "toothbrush"};
cv::Mat image = im.clone();
int src_w = image.cols;
int src_h = image.rows;
int thickness = 2;
  auto colormap = GenerateColorMap(sizeof(class_names) / sizeof(class_names[0]));
for (size_t i = 0; i < bboxes.size(); i++) {
const BoxInfo &bbox = bboxes[i];
std::cout << bbox.x1 << ". " << bbox.y1 << ". " << bbox.x2 << ". "
<< bbox.y2 << ". " << std::endl;
int c1 = colormap[3 * bbox.label + 0];
int c2 = colormap[3 * bbox.label + 1];
int c3 = colormap[3 * bbox.label + 2];
cv::Scalar color = cv::Scalar(c1, c2, c3);
// cv::Scalar color = cv::Scalar(0, 0, 255);
cv::rectangle(image, cv::Rect(cv::Point(bbox.x1, bbox.y1),
cv::Point(bbox.x2, bbox.y2)),
color, 1);
char text[256];
sprintf(text, "%s %.1f%%", class_names[bbox.label], bbox.score * 100);
int baseLine = 0;
cv::Size label_size =
cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine);
int x = bbox.x1;
int y = bbox.y1 - label_size.height - baseLine;
if (y < 0)
y = 0;
if (x + label_size.width > image.cols)
x = image.cols - label_size.width;
cv::rectangle(image, cv::Rect(cv::Point(x, y),
cv::Size(label_size.width,
label_size.height + baseLine)),
color, -1);
cv::putText(image, text, cv::Point(x, y + label_size.height),
cv::FONT_HERSHEY_SIMPLEX, 0.4, cv::Scalar(255, 255, 255), 1);
}
if (save_path == "None") {
cv::imshow("image", image);
} else {
cv::imwrite(save_path, image);
std::cout << "Result save in: " << save_path << std::endl;
}
}
int image_demo(PicoDet &detector, const char *imagepath,
int has_postprocess = 0) {
std::vector<cv::String> filenames;
cv::glob(imagepath, filenames, false);
  bool is_postprocess = has_postprocess > 0;
for (auto img_name : filenames) {
cv::Mat image = cv::imread(img_name, cv::IMREAD_COLOR);
if (image.empty()) {
fprintf(stderr, "cv::imread %s failed\n", img_name.c_str());
return -1;
}
std::vector<BoxInfo> results;
detector.detect(image, results, is_postprocess);
std::cout << "detect done." << std::endl;
#ifdef __SAVE_RESULT__
std::string save_path = img_name;
draw_bboxes(image, results, save_path.replace(3, 4, "results"));
#else
draw_bboxes(image, results);
cv::waitKey(0);
#endif
}
return 0;
}
int benchmark(PicoDet &detector, int width, int height,
int has_postprocess = 0) {
int loop_num = 100;
int warm_up = 8;
double time_min = DBL_MAX;
double time_max = -DBL_MAX;
double time_avg = 0;
  cv::Mat image(height, width, CV_8UC3, cv::Scalar(1, 1, 1));
  bool is_postprocess = has_postprocess > 0;
for (int i = 0; i < warm_up + loop_num; i++) {
double start = ncnn::get_current_time();
std::vector<BoxInfo> results;
detector.detect(image, results, is_postprocess);
double end = ncnn::get_current_time();
double time = end - start;
if (i >= warm_up) {
time_min = (std::min)(time_min, time);
time_max = (std::max)(time_max, time);
time_avg += time;
}
}
time_avg /= loop_num;
fprintf(stderr, "%20s min = %7.2f max = %7.2f avg = %7.2f\n", "picodet",
time_min, time_max, time_avg);
return 0;
}
int main(int argc, char **argv) {
  if (argc < 4) {
    std::cout << "usage: ./picodet_demo mode(0:image,1:benchmark) bin_model "
                 "param_model [width height] image_path [has_postprocess]"
              << std::endl;
    return -1;
  }
  int mode = atoi(argv[1]);
  char *bin_model_path = argv[2];
  char *param_model_path = argv[3];
  int height = 320;
  int width = 320;
  if (argc >= 6) {
    height = atoi(argv[4]);
    width = atoi(argv[5]);
  }
PicoDet detector =
PicoDet(param_model_path, bin_model_path, width, height, true, 0.45, 0.3);
  if (mode == 1) {
    benchmark(detector, width, height, argc > 6 ? atoi(argv[6]) : 0);
  } else {
    if (argc < 7) {
      std::cout << "Must set image file, such as ./picodet_demo 0 "
                   "../picodet_s_320_lcnet.bin ../picodet_s_320_lcnet.param "
                   "320 320 img.jpg"
                << std::endl;
      return -1;
    }
    const char *images = argv[6];
    image_demo(detector, images, argc > 7 ? atoi(argv[7]) : 0);
  }
}
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_ncnn
#include "picodet.h"
#include <benchmark.h>
#include <iostream>
inline float fast_exp(float x) {
union {
uint32_t i;
float f;
} v{};
v.i = (1 << 23) * (1.4426950409 * x + 126.93490512f);
return v.f;
}
inline float sigmoid(float x) { return 1.0f / (1.0f + fast_exp(-x)); }
template <typename _Tp>
int activation_function_softmax(const _Tp *src, _Tp *dst, int length) {
const _Tp alpha = *std::max_element(src, src + length);
_Tp denominator{0};
for (int i = 0; i < length; ++i) {
dst[i] = fast_exp(src[i] - alpha);
denominator += dst[i];
}
for (int i = 0; i < length; ++i) {
dst[i] /= denominator;
}
return 0;
}
bool PicoDet::hasGPU = false;
PicoDet *PicoDet::detector = nullptr;
PicoDet::PicoDet(const char *param, const char *bin, int input_width,
                 int input_height, bool useGPU, float score_threshold_ = 0.5,
                 float nms_threshold_ = 0.3) {
this->Net = new ncnn::Net();
#if NCNN_VULKAN
this->hasGPU = ncnn::get_gpu_count() > 0;
#endif
this->Net->opt.use_vulkan_compute = this->hasGPU && useGPU;
this->Net->opt.use_fp16_arithmetic = true;
this->Net->load_param(param);
this->Net->load_model(bin);
this->in_w = input_width;
this->in_h = input_hight;
this->score_threshold = score_threshold_;
this->nms_threshold = nms_threshold_;
}
PicoDet::~PicoDet() { delete this->Net; }
void PicoDet::preprocess(cv::Mat &image, ncnn::Mat &in) {
// cv::resize(image, image, cv::Size(this->in_w, this->in_h), 0.f, 0.f);
int img_w = image.cols;
int img_h = image.rows;
in = ncnn::Mat::from_pixels_resize(image.data, ncnn::Mat::PIXEL_BGR, img_w,
img_h, this->in_w, this->in_h);
const float mean_vals[3] = {103.53f, 116.28f, 123.675f};
const float norm_vals[3] = {0.017429f, 0.017507f, 0.017125f};
in.substract_mean_normalize(mean_vals, norm_vals);
}
int PicoDet::detect(cv::Mat image, std::vector<BoxInfo> &result_list,
bool has_postprocess) {
ncnn::Mat input;
preprocess(image, input);
auto ex = this->Net->create_extractor();
ex.set_light_mode(false);
ex.set_num_threads(4);
#if NCNN_VULKAN
ex.set_vulkan_compute(this->hasGPU);
#endif
ex.input("image", input); // picodet
this->image_h = image.rows;
this->image_w = image.cols;
std::vector<std::vector<BoxInfo>> results;
results.resize(this->num_class);
if (has_postprocess) {
ncnn::Mat dis_pred;
ncnn::Mat cls_pred;
ex.extract(this->nms_heads_info[0].c_str(), dis_pred);
ex.extract(this->nms_heads_info[1].c_str(), cls_pred);
std::cout << dis_pred.h << " " << dis_pred.w << std::endl;
std::cout << cls_pred.h << " " << cls_pred.w << std::endl;
this->nms_boxes(cls_pred, dis_pred, this->score_threshold, results);
} else {
for (const auto &head_info : this->non_postprocess_heads_info) {
ncnn::Mat dis_pred;
ncnn::Mat cls_pred;
ex.extract(head_info.dis_layer.c_str(), dis_pred);
ex.extract(head_info.cls_layer.c_str(), cls_pred);
this->decode_infer(cls_pred, dis_pred, head_info.stride,
this->score_threshold, results);
}
}
for (int i = 0; i < (int)results.size(); i++) {
this->nms(results[i], this->nms_threshold);
for (auto box : results[i]) {
box.x1 = box.x1 / this->in_w * this->image_w;
box.x2 = box.x2 / this->in_w * this->image_w;
box.y1 = box.y1 / this->in_h * this->image_h;
box.y2 = box.y2 / this->in_h * this->image_h;
result_list.push_back(box);
}
}
return 0;
}
void PicoDet::nms_boxes(ncnn::Mat &cls_pred, ncnn::Mat &dis_pred,
float score_threshold,
std::vector<std::vector<BoxInfo>> &result_list) {
BoxInfo bbox;
int i, j;
for (i = 0; i < dis_pred.h; i++) {
bbox.x1 = dis_pred.row(i)[0];
bbox.y1 = dis_pred.row(i)[1];
bbox.x2 = dis_pred.row(i)[2];
bbox.y2 = dis_pred.row(i)[3];
float score = 0;
int cur_label = 0;
for (int label = 0; label < this->num_class; label++) {
float score_ = cls_pred.row(label)[i];
if (score_ > score) {
score = score_;
cur_label = label;
}
}
bbox.score = score;
bbox.label = cur_label;
result_list[cur_label].push_back(bbox);
}
}
void PicoDet::decode_infer(ncnn::Mat &cls_pred, ncnn::Mat &dis_pred, int stride,
float threshold,
std::vector<std::vector<BoxInfo>> &results) {
  int feature_h = ceil((float)this->in_h / stride);
  int feature_w = ceil((float)this->in_w / stride);
for (int idx = 0; idx < feature_h * feature_w; idx++) {
const float *scores = cls_pred.row(idx);
int row = idx / feature_w;
int col = idx % feature_w;
float score = 0;
int cur_label = 0;
for (int label = 0; label < this->num_class; label++) {
if (scores[label] > score) {
score = scores[label];
cur_label = label;
}
}
if (score > threshold) {
const float *bbox_pred = dis_pred.row(idx);
results[cur_label].push_back(
this->disPred2Bbox(bbox_pred, cur_label, score, col, row, stride));
}
}
}
BoxInfo PicoDet::disPred2Bbox(const float *&dfl_det, int label, float score,
int x, int y, int stride) {
float ct_x = (x + 0.5) * stride;
float ct_y = (y + 0.5) * stride;
std::vector<float> dis_pred;
dis_pred.resize(4);
for (int i = 0; i < 4; i++) {
float dis = 0;
float *dis_after_sm = new float[this->reg_max + 1];
activation_function_softmax(dfl_det + i * (this->reg_max + 1), dis_after_sm,
this->reg_max + 1);
for (int j = 0; j < this->reg_max + 1; j++) {
dis += j * dis_after_sm[j];
}
dis *= stride;
dis_pred[i] = dis;
delete[] dis_after_sm;
}
float xmin = (std::max)(ct_x - dis_pred[0], .0f);
float ymin = (std::max)(ct_y - dis_pred[1], .0f);
float xmax = (std::min)(ct_x + dis_pred[2], (float)this->in_w);
  float ymax = (std::min)(ct_y + dis_pred[3], (float)this->in_h);
return BoxInfo{xmin, ymin, xmax, ymax, score, label};
}
void PicoDet::nms(std::vector<BoxInfo> &input_boxes, float NMS_THRESH) {
std::sort(input_boxes.begin(), input_boxes.end(),
[](BoxInfo a, BoxInfo b) { return a.score > b.score; });
std::vector<float> vArea(input_boxes.size());
for (int i = 0; i < int(input_boxes.size()); ++i) {
vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1) *
(input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1);
}
for (int i = 0; i < int(input_boxes.size()); ++i) {
for (int j = i + 1; j < int(input_boxes.size());) {
float xx1 = (std::max)(input_boxes[i].x1, input_boxes[j].x1);
float yy1 = (std::max)(input_boxes[i].y1, input_boxes[j].y1);
float xx2 = (std::min)(input_boxes[i].x2, input_boxes[j].x2);
float yy2 = (std::min)(input_boxes[i].y2, input_boxes[j].y2);
float w = (std::max)(float(0), xx2 - xx1 + 1);
float h = (std::max)(float(0), yy2 - yy1 + 1);
float inter = w * h;
float ovr = inter / (vArea[i] + vArea[j] - inter);
if (ovr >= NMS_THRESH) {
input_boxes.erase(input_boxes.begin() + j);
vArea.erase(vArea.begin() + j);
} else {
j++;
}
}
}
}
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_ncnn
#ifndef PICODET_H
#define PICODET_H
#include <net.h>
#include <opencv2/core/core.hpp>
typedef struct NonPostProcessHeadInfo {
std::string cls_layer;
std::string dis_layer;
int stride;
} NonPostProcessHeadInfo;
typedef struct BoxInfo {
float x1;
float y1;
float x2;
float y2;
float score;
int label;
} BoxInfo;
class PicoDet {
public:
  PicoDet(const char *param, const char *bin, int input_width, int input_height,
bool useGPU, float score_threshold_, float nms_threshold_);
~PicoDet();
static PicoDet *detector;
ncnn::Net *Net;
static bool hasGPU;
int detect(cv::Mat image, std::vector<BoxInfo> &result_list,
bool has_postprocess);
private:
void preprocess(cv::Mat &image, ncnn::Mat &in);
void decode_infer(ncnn::Mat &cls_pred, ncnn::Mat &dis_pred, int stride,
float threshold,
std::vector<std::vector<BoxInfo>> &results);
BoxInfo disPred2Bbox(const float *&dfl_det, int label, float score, int x,
int y, int stride);
static void nms(std::vector<BoxInfo> &result, float nms_threshold);
void nms_boxes(ncnn::Mat &cls_pred, ncnn::Mat &dis_pred,
float score_threshold,
std::vector<std::vector<BoxInfo>> &result_list);
int image_w;
int image_h;
int in_w = 320;
int in_h = 320;
int num_class = 80;
int reg_max = 7;
float score_threshold;
float nms_threshold;
std::vector<float> bbox_output_data_;
std::vector<float> class_output_data_;
std::vector<std::string> nms_heads_info{"tmp_16", "concat_4.tmp_0"};
  // Used when the model was exported without the post-processing head
std::vector<NonPostProcessHeadInfo> non_postprocess_heads_info{
// cls_pred|dis_pred|stride
{"transpose_0.tmp_0", "transpose_1.tmp_0", 8},
{"transpose_2.tmp_0", "transpose_3.tmp_0", 16},
{"transpose_4.tmp_0", "transpose_5.tmp_0", 32},
{"transpose_6.tmp_0", "transpose_7.tmp_0", 64},
};
};
#endif
# PP-YOLOE ONNX-TensorRT Conversion Tutorial
This tutorial shows how to export a PP-YOLOE model to ONNX, customize the network to use the [EfficientNMS_TRT](https://github.com/NVIDIA/TensorRT/tree/main/plugin/efficientNMSPlugin) op,
and run it successfully on [TensorRT](https://github.com/NVIDIA/TensorRT). The example is for reference only.
## 1. 环境依赖
CUDA 10.2 + [cudnn 8.2.1](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html) + [TensorRT 8.2](https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-821/install-guide/index.htm)
```commandline
onnx
onnxruntime
paddle2onnx
```
## 2. Export the Paddle Model
```commandline
python tools/export_model.py -c configs/ppyoloe/ppyoloe_crn_l_300e_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/ppyoloe_crn_l_300e_coco.pdparams trt=True exclude_nms=True
```
## 3. Convert to ONNX + Insert EfficientNMS_TRT
```commandline
python deploy/third_engine/demo_onnx_trt/onnx_custom.py --onnx_file=output_inference/ppyoloe_crn_l_300e_coco/ppyoloe_crn_l_300e_coco.onnx --model_dir=output_inference/ppyoloe_crn_l_300e_coco/ --opset_version=11
```
## 4. TensorRT Engine
```commandline
trtexec --onnx=output_inference/ppyoloe_crn_l_300e_coco/ppyoloe_crn_l_300e_coco.onnx --saveEngine=ppyoloe_crn_l_300e_coco.engine
```
**Note**: If trtexec reports an error, try adding the `--tacticSources=-cublasLt,+cublas` flag, for example:
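```commandline
trtexec --onnx=output_inference/ppyoloe_crn_l_300e_coco/ppyoloe_crn_l_300e_coco.onnx --saveEngine=ppyoloe_crn_l_300e_coco.engine --tacticSources=-cublasLt,+cublas
```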
## 5. Run TensorRT Inference
```commandline
python deploy/third_engine/demo_onnx_trt/trt_infer.py --infer_cfg=output_inference/ppyoloe_crn_l_300e_coco/infer_cfg.yml --trt_engine=ppyoloe_crn_l_300e_coco.engine --image_file=demo/000000014439.jpg
```
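The engine built in step 4 exposes the four outputs added in step 3 (`num_dets`, `det_boxes`, `det_scores`, `det_classes`). As a minimal sketch of how to unpack them, assuming the host buffers have been copied back into a dict of NumPy arrays keyed by output name (the helper below is illustrative only, not part of trt_infer.py):
```python
def unpack_efficient_nms(outputs):
    """Illustrative helper: slice out the valid detections for image 0."""
    num = int(outputs["num_dets"][0][0])       # number of valid detections
    boxes = outputs["det_boxes"][0][:num]      # (num, 4) boxes in xyxy format
    scores = outputs["det_scores"][0][:num]    # (num,) confidence scores
    classes = outputs["det_classes"][0][:num]  # (num,) integer class ids
    return boxes, scores, classes
```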
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import onnx
import onnx_graphsurgeon
import numpy as np
from collections import OrderedDict
from paddle2onnx.command import program2onnx
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
'--onnx_file', required=True, type=str, help='onnx model path')
parser.add_argument(
'--model_dir',
type=str,
default=None,
help=("Directory include:'model.pdiparams', 'model.pdmodel', "
"'infer_cfg.yml', created by tools/export_model.py."))
parser.add_argument(
"--opset_version",
type=int,
default=11,
help="set onnx opset version to export")
parser.add_argument(
    '--topk_all', type=int, default=300, help='top-k objects kept per image')
parser.add_argument(
'--iou_thres', type=float, default=0.7, help='iou threshold for NMS')
parser.add_argument(
'--conf_thres', type=float, default=0.01, help='conf threshold for NMS')
def main(FLAGS):
assert os.path.exists(FLAGS.onnx_file)
onnx_model = onnx.load(FLAGS.onnx_file)
graph = onnx_graphsurgeon.import_onnx(onnx_model)
graph.toposort()
graph.fold_constants()
graph.cleanup()
num_anchors = graph.outputs[1].shape[2]
num_classes = graph.outputs[1].shape[1]
scores = onnx_graphsurgeon.Variable(
name='scores', shape=[-1, num_anchors, num_classes], dtype=np.float32)
graph.layer(
op='Transpose',
name='lastTranspose',
inputs=[graph.outputs[1]],
outputs=[scores],
attrs=OrderedDict(perm=[0, 2, 1]))
attrs = OrderedDict(
plugin_version="1",
background_class=-1,
max_output_boxes=FLAGS.topk_all,
score_threshold=FLAGS.conf_thres,
iou_threshold=FLAGS.iou_thres,
score_activation=False,
box_coding=0, )
outputs = [
onnx_graphsurgeon.Variable("num_dets", np.int32, [-1, 1]),
onnx_graphsurgeon.Variable("det_boxes", np.float32,
[-1, FLAGS.topk_all, 4]),
onnx_graphsurgeon.Variable("det_scores", np.float32,
[-1, FLAGS.topk_all]),
onnx_graphsurgeon.Variable("det_classes", np.int32,
[-1, FLAGS.topk_all])
]
graph.layer(
op='EfficientNMS_TRT',
name="batched_nms",
inputs=[graph.outputs[0], scores],
outputs=outputs,
attrs=attrs)
graph.outputs = outputs
graph.cleanup().toposort()
onnx.save(onnx_graphsurgeon.export_onnx(graph), FLAGS.onnx_file)
print(f"The modified onnx model is saved in {FLAGS.onnx_file}")
if __name__ == '__main__':
FLAGS = parser.parse_args()
if FLAGS.model_dir is not None:
assert os.path.exists(FLAGS.model_dir)
program2onnx(
model_dir=FLAGS.model_dir,
save_file=FLAGS.onnx_file,
model_filename="model.pdmodel",
params_filename="model.pdiparams",
opset_version=FLAGS.opset_version,
enable_onnx_checker=True)
main(FLAGS)
import numpy as np
import cv2
import copy
def decode_image(img_path):
with open(img_path, 'rb') as f:
im_read = f.read()
data = np.frombuffer(im_read, dtype='uint8')
im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode
im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
img_info = {
"im_shape": np.array(
im.shape[:2], dtype=np.float32),
"scale_factor": np.array(
[1., 1.], dtype=np.float32)
}
return im, img_info
class Resize(object):
"""resize image by target_size and max_size
Args:
target_size (int): the target size of image
keep_ratio (bool): whether keep_ratio or not, default true
interp (int): method of resize
"""
def __init__(self, target_size, keep_ratio=True, interp=cv2.INTER_LINEAR):
if isinstance(target_size, int):
target_size = [target_size, target_size]
self.target_size = target_size
self.keep_ratio = keep_ratio
self.interp = interp
def __call__(self, im, im_info):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
assert len(self.target_size) == 2
assert self.target_size[0] > 0 and self.target_size[1] > 0
im_channel = im.shape[2]
im_scale_y, im_scale_x = self.generate_scale(im)
im = cv2.resize(
im,
None,
None,
fx=im_scale_x,
fy=im_scale_y,
interpolation=self.interp)
im_info['im_shape'] = np.array(im.shape[:2]).astype('float32')
im_info['scale_factor'] = np.array(
[im_scale_y, im_scale_x]).astype('float32')
return im, im_info
def generate_scale(self, im):
"""
Args:
im (np.ndarray): image (np.ndarray)
Returns:
im_scale_x: the resize ratio of X
im_scale_y: the resize ratio of Y
"""
origin_shape = im.shape[:2]
im_c = im.shape[2]
if self.keep_ratio:
im_size_min = np.min(origin_shape)
im_size_max = np.max(origin_shape)
target_size_min = np.min(self.target_size)
target_size_max = np.max(self.target_size)
im_scale = float(target_size_min) / float(im_size_min)
if np.round(im_scale * im_size_max) > target_size_max:
im_scale = float(target_size_max) / float(im_size_max)
im_scale_x = im_scale
im_scale_y = im_scale
else:
resize_h, resize_w = self.target_size
im_scale_y = resize_h / float(origin_shape[0])
im_scale_x = resize_w / float(origin_shape[1])
return im_scale_y, im_scale_x
class NormalizeImage(object):
"""normalize image
Args:
mean (list): im - mean
std (list): im / std
is_scale (bool): whether need im / 255
norm_type (str): type in ['mean_std', 'none']
"""
def __init__(self, mean, std, is_scale=True, norm_type='mean_std'):
self.mean = mean
self.std = std
self.is_scale = is_scale
self.norm_type = norm_type
def __call__(self, im, im_info):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
im = im.astype(np.float32, copy=False)
if self.is_scale:
scale = 1.0 / 255.0
im *= scale
if self.norm_type == 'mean_std':
mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
std = np.array(self.std)[np.newaxis, np.newaxis, :]
im -= mean
im /= std
return im, im_info
class Permute(object):
"""permute image
Args:
to_bgr (bool): whether convert RGB to BGR
channel_first (bool): whether convert HWC to CHW
"""
def __init__(self, ):
super(Permute, self).__init__()
def __call__(self, im, im_info):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
im = im.transpose((2, 0, 1)).copy()
return im, im_info
class PadStride(object):
""" padding image for model with FPN, instead PadBatch(pad_to_stride) in original config
Args:
stride (bool): model with FPN need image shape % stride == 0
"""
def __init__(self, stride=0):
self.coarsest_stride = stride
def __call__(self, im, im_info):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
coarsest_stride = self.coarsest_stride
if coarsest_stride <= 0:
return im, im_info
im_c, im_h, im_w = im.shape
pad_h = int(np.ceil(float(im_h) / coarsest_stride) * coarsest_stride)
pad_w = int(np.ceil(float(im_w) / coarsest_stride) * coarsest_stride)
padding_im = np.zeros((im_c, pad_h, pad_w), dtype=np.float32)
padding_im[:, :im_h, :im_w] = im
return padding_im, im_info
class LetterBoxResize(object):
def __init__(self, target_size):
"""
Resize image to target size, convert normalized xywh to pixel xyxy
format ([x_center, y_center, width, height] -> [x0, y0, x1, y1]).
Args:
target_size (int|list): image target size.
"""
super(LetterBoxResize, self).__init__()
if isinstance(target_size, int):
target_size = [target_size, target_size]
self.target_size = target_size
def letterbox(self, img, height, width, color=(127.5, 127.5, 127.5)):
# letterbox: resize a rectangular image to a padded rectangular
shape = img.shape[:2] # [height, width]
ratio_h = float(height) / shape[0]
ratio_w = float(width) / shape[1]
ratio = min(ratio_h, ratio_w)
new_shape = (round(shape[1] * ratio),
round(shape[0] * ratio)) # [width, height]
padw = (width - new_shape[0]) / 2
padh = (height - new_shape[1]) / 2
top, bottom = round(padh - 0.1), round(padh + 0.1)
left, right = round(padw - 0.1), round(padw + 0.1)
img = cv2.resize(
img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border
img = cv2.copyMakeBorder(
img, top, bottom, left, right, cv2.BORDER_CONSTANT,
value=color) # padded rectangular
return img, ratio, padw, padh
def __call__(self, im, im_info):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
assert len(self.target_size) == 2
assert self.target_size[0] > 0 and self.target_size[1] > 0
height, width = self.target_size
h, w = im.shape[:2]
im, ratio, padw, padh = self.letterbox(im, height=height, width=width)
new_shape = [round(h * ratio), round(w * ratio)]
im_info['im_shape'] = np.array(new_shape, dtype=np.float32)
im_info['scale_factor'] = np.array([ratio, ratio], dtype=np.float32)
return im, im_info
class Pad(object):
def __init__(self, size, fill_value=[114.0, 114.0, 114.0]):
"""
Pad image to a specified size.
Args:
size (list[int]): image target size
fill_value (list[float]): rgb value of pad area, default (114.0, 114.0, 114.0)
"""
super(Pad, self).__init__()
if isinstance(size, int):
size = [size, size]
self.size = size
self.fill_value = fill_value
def __call__(self, im, im_info):
im_h, im_w = im.shape[:2]
h, w = self.size
if h == im_h and w == im_w:
im = im.astype(np.float32)
return im, im_info
canvas = np.ones((h, w, 3), dtype=np.float32)
canvas *= np.array(self.fill_value, dtype=np.float32)
canvas[0:im_h, 0:im_w, :] = im.astype(np.float32)
im = canvas
return im, im_info
def rotate_point(pt, angle_rad):
"""Rotate a point by an angle.
Args:
pt (list[float]): 2 dimensional point to be rotated
angle_rad (float): rotation angle by radian
Returns:
list[float]: Rotated point.
"""
assert len(pt) == 2
sn, cs = np.sin(angle_rad), np.cos(angle_rad)
new_x = pt[0] * cs - pt[1] * sn
new_y = pt[0] * sn + pt[1] * cs
rotated_pt = [new_x, new_y]
return rotated_pt
def _get_3rd_point(a, b):
"""To calculate the affine matrix, three pairs of points are required. This
function is used to get the 3rd point, given 2D points a & b.
The 3rd point is defined by rotating vector `a - b` by 90 degrees
anticlockwise, using b as the rotation center.
Args:
a (np.ndarray): point(x,y)
b (np.ndarray): point(x,y)
Returns:
np.ndarray: The 3rd point.
"""
assert len(a) == 2
assert len(b) == 2
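    # e.g. a = (1, 0), b = (0, 0): direction = (1, 0) and the third point is
    # b + (-0, 1) = (0, 1), i.e. `a - b` rotated 90 degrees anticlockwise about b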
direction = a - b
third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32)
return third_pt
def get_affine_transform(center,
input_size,
rot,
output_size,
shift=(0., 0.),
inv=False):
"""Get the affine transform matrix, given the center/scale/rot/output_size.
Args:
center (np.ndarray[2, ]): Center of the bounding box (x, y).
        input_size (np.ndarray[2, ] | float): Size of the source region as
            [width, height]; a scalar is broadcast to both dimensions.
rot (float): Rotation angle (degree).
output_size (np.ndarray[2, ]): Size of the destination heatmaps.
shift (0-100%): Shift translation ratio wrt the width/height.
Default (0., 0.).
inv (bool): Option to inverse the affine transform direction.
(inv=False: src->dst or inv=True: dst->src)
Returns:
np.ndarray: The transform matrix.
"""
assert len(center) == 2
assert len(output_size) == 2
assert len(shift) == 2
if not isinstance(input_size, (np.ndarray, list)):
input_size = np.array([input_size, input_size], dtype=np.float32)
scale_tmp = input_size
shift = np.array(shift)
src_w = scale_tmp[0]
dst_w = output_size[0]
dst_h = output_size[1]
rot_rad = np.pi * rot / 180
src_dir = rotate_point([0., src_w * -0.5], rot_rad)
dst_dir = np.array([0., dst_w * -0.5])
src = np.zeros((3, 2), dtype=np.float32)
src[0, :] = center + scale_tmp * shift
src[1, :] = center + src_dir + scale_tmp * shift
src[2, :] = _get_3rd_point(src[0, :], src[1, :])
dst = np.zeros((3, 2), dtype=np.float32)
dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :])
if inv:
trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
else:
trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))
return trans
class WarpAffine(object):
"""Warp affine the image
"""
def __init__(self,
keep_res=False,
pad=31,
input_h=512,
input_w=512,
scale=0.4,
shift=0.1):
self.keep_res = keep_res
self.pad = pad
self.input_h = input_h
self.input_w = input_w
self.scale = scale
self.shift = shift
def __call__(self, im, im_info):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
img = cv2.cvtColor(im, cv2.COLOR_RGB2BGR)
h, w = img.shape[:2]
if self.keep_res:
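            # (x | pad) + 1 rounds x up to a multiple of (pad + 1); with the
            # default pad = 31 this makes the input sizes multiples of 32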
input_h = (h | self.pad) + 1
input_w = (w | self.pad) + 1
s = np.array([input_w, input_h], dtype=np.float32)
c = np.array([w // 2, h // 2], dtype=np.float32)
else:
s = max(h, w) * 1.0
input_h, input_w = self.input_h, self.input_w
c = np.array([w / 2., h / 2.], dtype=np.float32)
trans_input = get_affine_transform(c, s, 0, [input_w, input_h])
img = cv2.resize(img, (w, h))
inp = cv2.warpAffine(
img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR)
return inp, im_info
# keypoint preprocess
def get_warp_matrix(theta, size_input, size_dst, size_target):
"""This code is based on
https://github.com/open-mmlab/mmpose/blob/master/mmpose/core/post_processing/post_transforms.py
Calculate the transformation matrix under the constraint of unbiased.
Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased
Data Processing for Human Pose Estimation (CVPR 2020).
Args:
theta (float): Rotation angle in degrees.
size_input (np.ndarray): Size of input image [w, h].
size_dst (np.ndarray): Size of output image [w, h].
size_target (np.ndarray): Size of ROI in input plane [w, h].
Returns:
matrix (np.ndarray): A matrix for transformation.
"""
theta = np.deg2rad(theta)
matrix = np.zeros((2, 3), dtype=np.float32)
scale_x = size_dst[0] / size_target[0]
scale_y = size_dst[1] / size_target[1]
matrix[0, 0] = np.cos(theta) * scale_x
matrix[0, 1] = -np.sin(theta) * scale_x
matrix[0, 2] = scale_x * (
-0.5 * size_input[0] * np.cos(theta) + 0.5 * size_input[1] *
np.sin(theta) + 0.5 * size_target[0])
matrix[1, 0] = np.sin(theta) * scale_y
matrix[1, 1] = np.cos(theta) * scale_y
matrix[1, 2] = scale_y * (
-0.5 * size_input[0] * np.sin(theta) - 0.5 * size_input[1] *
np.cos(theta) + 0.5 * size_target[1])
return matrix
class TopDownEvalAffine(object):
"""apply affine transform to image and coords
Args:
trainsize (list): [w, h], the standard size used to train
use_udp (bool): whether to use Unbiased Data Processing.
records(dict): the dict contained the image and coords
Returns:
records (dict): contain the image and coords after tranformed
"""
def __init__(self, trainsize, use_udp=False):
self.trainsize = trainsize
self.use_udp = use_udp
def __call__(self, image, im_info):
rot = 0
imshape = im_info['im_shape'][::-1]
center = im_info['center'] if 'center' in im_info else imshape / 2.
scale = im_info['scale'] if 'scale' in im_info else imshape
if self.use_udp:
trans = get_warp_matrix(
rot, center * 2.0,
[self.trainsize[0] - 1.0, self.trainsize[1] - 1.0], scale)
image = cv2.warpAffine(
image,
trans, (int(self.trainsize[0]), int(self.trainsize[1])),
flags=cv2.INTER_LINEAR)
else:
trans = get_affine_transform(center, scale, rot, self.trainsize)
image = cv2.warpAffine(
image,
trans, (int(self.trainsize[0]), int(self.trainsize[1])),
flags=cv2.INTER_LINEAR)
return image, im_info
class Compose:
def __init__(self, transforms):
self.transforms = []
for op_info in transforms:
new_op_info = op_info.copy()
op_type = new_op_info.pop('type')
self.transforms.append(eval(op_type)(**new_op_info))
def __call__(self, img_path):
img, im_info = decode_image(img_path)
for t in self.transforms:
img, im_info = t(img, im_info)
inputs = copy.deepcopy(im_info)
inputs['image'] = np.ascontiguousarray(img.astype('float32'))
return inputs
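# Example usage (hypothetical op parameters, mirroring the Preprocess section of
# an exported infer_cfg.yml):
#   ops = [{'type': 'Resize', 'target_size': [640, 640], 'keep_ratio': False},
#          {'type': 'NormalizeImage', 'mean': [0.485, 0.456, 0.406],
#           'std': [0.229, 0.224, 0.225], 'is_scale': True},
#          {'type': 'Permute'}]
#   inputs = Compose(ops)('demo.jpg')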
coco_clsid2catid = {
0: 1,
1: 2,
2: 3,
3: 4,
4: 5,
5: 6,
6: 7,
7: 8,
8: 9,
9: 10,
10: 11,
11: 13,
12: 14,
13: 15,
14: 16,
15: 17,
16: 18,
17: 19,
18: 20,
19: 21,
20: 22,
21: 23,
22: 24,
23: 25,
24: 27,
25: 28,
26: 31,
27: 32,
28: 33,
29: 34,
30: 35,
31: 36,
32: 37,
33: 38,
34: 39,
35: 40,
36: 41,
37: 42,
38: 43,
39: 44,
40: 46,
41: 47,
42: 48,
43: 49,
44: 50,
45: 51,
46: 52,
47: 53,
48: 54,
49: 55,
50: 56,
51: 57,
52: 58,
53: 59,
54: 60,
55: 61,
56: 62,
57: 63,
58: 64,
59: 65,
60: 67,
61: 70,
62: 72,
63: 73,
64: 74,
65: 75,
66: 76,
67: 77,
68: 78,
69: 79,
70: 80,
71: 81,
72: 82,
73: 84,
74: 85,
75: 86,
76: 87,
77: 88,
78: 89,
79: 90
}
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
from collections import OrderedDict
import os
import yaml
import json
import glob
import argparse
from preprocess import Compose
from preprocess import coco_clsid2catid
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--infer_cfg", type=str, help="infer_cfg.yml")
parser.add_argument(
"--trt_engine", required=True, type=str, help="trt engine path")
parser.add_argument("--image_dir", type=str)
parser.add_argument("--image_file", type=str)
parser.add_argument(
"--repeats",
type=int,
default=1,
help="Repeat the running test `repeats` times in benchmark")
parser.add_argument(
"--save_coco",
action='store_true',
default=False,
help="Whether to save coco results")
parser.add_argument(
"--coco_file", type=str, default="results.json", help="coco results path")
TRT_LOGGER = trt.Logger()
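# Register the built-in TensorRT plugins (including EfficientNMS_TRT) so that
# an engine containing the custom NMS op can be deserialized.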
trt.init_libnvinfer_plugins(TRT_LOGGER, namespace="")
# Global dictionary
SUPPORT_MODELS = {
'YOLO', 'PPYOLOE', 'YOLOX', 'YOLOv5', 'YOLOv6', 'YOLOv7', 'YOLOv8', 'RTMDet'
}
def get_test_images(infer_dir, infer_img):
"""
Get image path list in TEST mode
"""
assert infer_img is not None or infer_dir is not None, \
"--image_file or --image_dir should be set"
assert infer_img is None or os.path.isfile(infer_img), \
"{} is not a file".format(infer_img)
assert infer_dir is None or os.path.isdir(infer_dir), \
"{} is not a directory".format(infer_dir)
# infer_img has a higher priority
if infer_img and os.path.isfile(infer_img):
return [infer_img]
images = set()
infer_dir = os.path.abspath(infer_dir)
assert os.path.isdir(infer_dir), \
"infer_dir {} is not a directory".format(infer_dir)
exts = ['jpg', 'jpeg', 'png', 'bmp']
exts += [ext.upper() for ext in exts]
for ext in exts:
images.update(glob.glob('{}/*.{}'.format(infer_dir, ext)))
images = list(images)
assert len(images) > 0, "no image found in {}".format(infer_dir)
print("Found {} inference images in total.".format(len(images)))
return images
class PredictConfig(object):
"""set config of preprocess, postprocess and visualize
Args:
infer_config (str): path of infer_cfg.yml
"""
def __init__(self, infer_config):
# parsing Yaml config for Preprocess
with open(infer_config) as f:
yml_conf = yaml.safe_load(f)
self.check_model(yml_conf)
self.arch = yml_conf['arch']
self.preprocess_infos = yml_conf['Preprocess']
self.min_subgraph_size = yml_conf['min_subgraph_size']
self.label_list = yml_conf['label_list']
self.use_dynamic_shape = yml_conf['use_dynamic_shape']
self.draw_threshold = yml_conf.get("draw_threshold", 0.5)
self.mask = yml_conf.get("mask", False)
self.tracker = yml_conf.get("tracker", None)
self.nms = yml_conf.get("NMS", None)
self.fpn_stride = yml_conf.get("fpn_stride", None)
if self.arch == 'RCNN' and yml_conf.get('export_onnx', False):
print(
'The RCNN export model is used for ONNX and it only supports batch_size = 1'
)
self.print_config()
def check_model(self, yml_conf):
"""
Raises:
ValueError: loaded model not in supported model type
"""
for support_model in SUPPORT_MODELS:
if support_model in yml_conf['arch']:
return True
raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf[
'arch'], SUPPORT_MODELS))
def print_config(self):
print('----------- Model Configuration -----------')
print('%s: %s' % ('Model Arch', self.arch))
print('%s: ' % ('Transform Order'))
for op_info in self.preprocess_infos:
print('--%s: %s' % ('transform op', op_info['type']))
print('--------------------------------------------')
def load_trt_engine(engine_path):
assert os.path.exists(engine_path)
print("Reading engine from file {}".format(engine_path))
with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
return runtime.deserialize_cuda_engine(f.read())
def predict_image(infer_config, engine, img_list, save_coco=False, repeats=1):
# load preprocess transforms
transforms = Compose(infer_config.preprocess_infos)
stream = cuda.Stream()
coco_results = []
num_data = len(img_list)
avg_time = []
with engine.create_execution_context() as context:
# Allocate host and device buffers
bindings = create_trt_bindings(engine, context)
# warmup
run_trt_context(context, bindings, stream, repeats=10)
# predict image
for i, img_path in enumerate(img_list):
inputs = transforms(img_path)
inputs_name = [k for k, v in bindings.items() if v['is_input']]
inputs = {
k: inputs[k][None, ]
for k in inputs.keys() if k in inputs_name
}
# run infer
for k, v in inputs.items():
bindings[k]['cpu_data'][...] = v
output = run_trt_context(context, bindings, stream, repeats=repeats)
print(f"{i + 1}/{num_data} infer time: {output['infer_time']} ms.")
avg_time.append(output['infer_time'])
# get output
for k, v in output.items():
if k in bindings.keys():
output[k] = np.reshape(v, bindings[k]['shape'])
if save_coco:
coco_results.extend(
format_coco_results(os.path.split(img_path)[-1], output))
avg_time = np.mean(avg_time)
print(
f"Run on {num_data} data, repeats {repeats} times, avg time: {avg_time} ms."
)
if save_coco:
with open(FLAGS.coco_file, 'w') as f:
json.dump(coco_results, f)
print(f"save coco json to {FLAGS.coco_file}")
def create_trt_bindings(engine, context):
bindings = OrderedDict()
for name in engine:
binding_idx = engine.get_binding_index(name)
size = trt.volume(context.get_binding_shape(binding_idx))
dtype = trt.nptype(engine.get_binding_dtype(name))
shape = list(engine.get_binding_shape(binding_idx))
if shape[0] == -1:
shape[0] = 1
bindings[name] = {
"idx": binding_idx,
"size": size,
"dtype": dtype,
"shape": shape,
"cpu_data": None,
"cuda_ptr": None,
"is_input": True if engine.binding_is_input(name) else False
}
if engine.binding_is_input(name):
bindings[name]['cpu_data'] = np.random.randn(*shape).astype(
np.float32)
bindings[name]['cuda_ptr'] = cuda.mem_alloc(bindings[name][
'cpu_data'].nbytes)
else:
bindings[name]['cpu_data'] = cuda.pagelocked_empty(size, dtype)
bindings[name]['cuda_ptr'] = cuda.mem_alloc(bindings[name][
'cpu_data'].nbytes)
return bindings
def run_trt_context(context, bindings, stream, repeats=1):
# Transfer input data to the GPU.
for k, v in bindings.items():
if v['is_input']:
cuda.memcpy_htod_async(v['cuda_ptr'], v['cpu_data'], stream)
in_bindings = [int(v['cuda_ptr']) for k, v in bindings.items()]
output_data = {}
avg_time = []
for _ in range(repeats):
# Run inference
t1 = time.time()
context.execute_async_v2(
bindings=in_bindings, stream_handle=stream.handle)
# Transfer prediction output from the GPU.
for k, v in bindings.items():
if not v['is_input']:
cuda.memcpy_dtoh_async(v['cpu_data'], v['cuda_ptr'], stream)
output_data[k] = v['cpu_data']
# Synchronize the stream
stream.synchronize()
t2 = time.time()
avg_time.append(t2 - t1)
output_data['infer_time'] = np.mean(avg_time) * 1000
return output_data
def format_coco_results(file_name, result):
try:
image_id = int(os.path.splitext(file_name)[0])
    except ValueError:
image_id = file_name
num_dets = result['num_dets'].tolist()
det_classes = result['det_classes'].tolist()
det_scores = result['det_scores'].tolist()
det_boxes = result['det_boxes'].tolist()
per_result = [
{
'image_id': image_id,
'category_id': coco_clsid2catid[int(det_classes[0][idx])],
'file_name': file_name,
'bbox': [
det_boxes[0][idx][0], det_boxes[0][idx][1],
det_boxes[0][idx][2] - det_boxes[0][idx][0],
det_boxes[0][idx][3] - det_boxes[0][idx][1]
], # xyxy -> xywh
'score': det_scores[0][idx]
} for idx in range(num_dets[0][0])
]
return per_result
if __name__ == '__main__':
FLAGS = parser.parse_args()
# load image list
img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file)
# load trt engine
engine = load_trt_engine(FLAGS.trt_engine)
# load infer config
infer_config = PredictConfig(FLAGS.infer_cfg)
predict_image(infer_config, engine, img_list, FLAGS.save_coco,
FLAGS.repeats)
print('Done!')
# PicoDet ONNX Runtime Demo
This folder provides a demo for deploying PicoDet and running image inference with [ONNX Runtime](https://onnxruntime.ai/docs/).
## Install ONNX Runtime
This demo uses ONNX Runtime 1.10.0, which can be installed by running:
```shell
pip install onnxruntime
```
For detailed installation steps, see [Install ONNX Runtime](https://onnxruntime.ai/docs/install/).
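To verify the installed version (optional):
```shell
python -c "import onnxruntime; print(onnxruntime.__version__)"
```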
## Run Inference on Images
- Prepare the test model: following the "export and convert model" steps in [PicoDet](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/picodet), export the model with post-processing included (`-o export.benchmark=False`) and generate the simplified ONNX model to be tested (the models can also be downloaded directly from the links below). Then create an ```onnx_file``` folder in this directory and put the exported ONNX model in it.
- Prepare the test images: put the images to be tested in the ```./imgs``` folder; this demo already provides two test images.
- Run directly in this directory:
```shell
python infer_demo.py --modelpath ./onnx_file/picodet_s_320_lcnet_postprocessed.onnx
```
This runs detection on every image in the ```./imgs``` folder and saves the results to the ```./results``` folder.
- Results:
<div align="center">
<img src="../../../docs/images/bus.jpg" height="300px" ><img src="../../../docs/images/dog.jpg" height="300px" >
</div>
## Model Download
| Model | Input size | ONNX (w/ post-processing) |
| :-------- | :--------: | :---------------------: |
| PicoDet-XS | 320*320 | [model](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_xs_320_lcnet_postprocessed.onnx) |
| PicoDet-XS | 416*416 | [model](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_xs_416_lcnet_postprocessed.onnx) |
| PicoDet-S | 320*320 | [model](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_s_320_lcnet_postprocessed.onnx) |
| PicoDet-S | 416*416 | [model](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_s_416_lcnet_postprocessed.onnx) |
| PicoDet-M | 320*320 | [model](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_m_320_lcnet_postprocessed.onnx) |
| PicoDet-M | 416*416 | [model](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_m_416_lcnet_postprocessed.onnx) |
| PicoDet-L | 320*320 | [model](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_l_320_lcnet_postprocessed.onnx) |
| PicoDet-L | 416*416 | [model](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_l_416_lcnet_postprocessed.onnx) |
| PicoDet-L | 640*640 | [model](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_l_640_lcnet_postprocessed.onnx) |
person
bicycle
car
motorbike
aeroplane
bus
train
truck
boat
traffic light
fire hydrant
stop sign
parking meter
bench
bird
cat
dog
horse
sheep
cow
elephant
bear
zebra
giraffe
backpack
umbrella
handbag
tie
suitcase
frisbee
skis
snowboard
sports ball
kite
baseball bat
baseball glove
skateboard
surfboard
tennis racket
bottle
wine glass
cup
fork
knife
spoon
bowl
banana
apple
sandwich
orange
broccoli
carrot
hot dog
pizza
donut
cake
chair
sofa
pottedplant
bed
diningtable
toilet
tvmonitor
laptop
mouse
remote
keyboard
cell phone
microwave
oven
toaster
sink
refrigerator
book
clock
vase
scissors
teddy bear
hair drier
toothbrush
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import cv2
import numpy as np
import argparse
import onnxruntime as ort
from pathlib import Path
from tqdm import tqdm
class PicoDet():
def __init__(self,
model_pb_path,
label_path,
prob_threshold=0.4,
iou_threshold=0.3):
self.classes = list(
map(lambda x: x.strip(), open(label_path, 'r').readlines()))
self.num_classes = len(self.classes)
self.prob_threshold = prob_threshold
self.iou_threshold = iou_threshold
self.mean = np.array(
[103.53, 116.28, 123.675], dtype=np.float32).reshape(1, 1, 3)
self.std = np.array(
[57.375, 57.12, 58.395], dtype=np.float32).reshape(1, 1, 3)
so = ort.SessionOptions()
so.log_severity_level = 3
self.net = ort.InferenceSession(model_pb_path, so)
inputs_name = [a.name for a in self.net.get_inputs()]
inputs_shape = {
k: v.shape
for k, v in zip(inputs_name, self.net.get_inputs())
}
self.input_shape = inputs_shape['image'][2:]
def _normalize(self, img):
img = img.astype(np.float32)
img = (img / 255.0 - self.mean / 255.0) / (self.std / 255.0)
return img
def resize_image(self, srcimg, keep_ratio=False):
top, left, newh, neww = 0, 0, self.input_shape[0], self.input_shape[1]
origin_shape = srcimg.shape[:2]
im_scale_y = newh / float(origin_shape[0])
im_scale_x = neww / float(origin_shape[1])
img_shape = np.array([
[float(self.input_shape[0]), float(self.input_shape[1])]
]).astype('float32')
scale_factor = np.array([[im_scale_y, im_scale_x]]).astype('float32')
if keep_ratio and srcimg.shape[0] != srcimg.shape[1]:
hw_scale = srcimg.shape[0] / srcimg.shape[1]
if hw_scale > 1:
newh, neww = self.input_shape[0], int(self.input_shape[1] /
hw_scale)
img = cv2.resize(
srcimg, (neww, newh), interpolation=cv2.INTER_AREA)
left = int((self.input_shape[1] - neww) * 0.5)
img = cv2.copyMakeBorder(
img,
0,
0,
left,
self.input_shape[1] - neww - left,
cv2.BORDER_CONSTANT,
value=0) # add border
else:
newh, neww = int(self.input_shape[0] *
hw_scale), self.input_shape[1]
img = cv2.resize(
srcimg, (neww, newh), interpolation=cv2.INTER_AREA)
top = int((self.input_shape[0] - newh) * 0.5)
img = cv2.copyMakeBorder(
img,
top,
self.input_shape[0] - newh - top,
0,
0,
cv2.BORDER_CONSTANT,
value=0)
else:
img = cv2.resize(
srcimg, self.input_shape, interpolation=cv2.INTER_LINEAR)
return img, img_shape, scale_factor
def get_color_map_list(self, num_classes):
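        # Build one RGB color per class by spreading the bits of the class id
        # across the color channels (the standard PASCAL VOC palette scheme).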
color_map = num_classes * [0, 0, 0]
for i in range(0, num_classes):
j = 0
lab = i
while lab:
color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j))
color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j))
color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j))
j += 1
lab >>= 3
color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)]
return color_map
def detect(self, srcimg):
img, im_shape, scale_factor = self.resize_image(srcimg)
img = self._normalize(img)
blob = np.expand_dims(np.transpose(img, (2, 0, 1)), axis=0)
inputs_dict = {
'im_shape': im_shape,
'image': blob,
'scale_factor': scale_factor
}
inputs_name = [a.name for a in self.net.get_inputs()]
net_inputs = {k: inputs_dict[k] for k in inputs_name}
outs = self.net.run(None, net_inputs)
outs = np.array(outs[0])
expect_boxes = (outs[:, 1] > 0.5) & (outs[:, 0] > -1)
np_boxes = outs[expect_boxes, :]
color_list = self.get_color_map_list(self.num_classes)
clsid2color = {}
for i in range(np_boxes.shape[0]):
classid, conf = int(np_boxes[i, 0]), np_boxes[i, 1]
xmin, ymin, xmax, ymax = int(np_boxes[i, 2]), int(np_boxes[
i, 3]), int(np_boxes[i, 4]), int(np_boxes[i, 5])
if classid not in clsid2color:
clsid2color[classid] = color_list[classid]
color = tuple(clsid2color[classid])
cv2.rectangle(
srcimg, (xmin, ymin), (xmax, ymax), color, thickness=2)
print(self.classes[classid] + ': ' + str(round(conf, 3)))
cv2.putText(
srcimg,
self.classes[classid] + ':' + str(round(conf, 3)), (xmin,
ymin - 10),
cv2.FONT_HERSHEY_SIMPLEX,
0.8, (0, 255, 0),
thickness=2)
return srcimg
def detect_folder(self, img_fold, result_path):
img_fold = Path(img_fold)
result_path = Path(result_path)
result_path.mkdir(parents=True, exist_ok=True)
img_name_list = filter(
lambda x: str(x).endswith(".png") or str(x).endswith(".jpg"),
img_fold.iterdir(), )
img_name_list = list(img_name_list)
print(f"find {len(img_name_list)} images")
for img_path in tqdm(img_name_list):
img = cv2.imread(str(img_path), 1)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            srcimg = self.detect(img)
save_path = str(result_path / img_path.name.replace(".png", ".jpg"))
cv2.imwrite(save_path, srcimg)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--modelpath',
type=str,
default='onnx_file/picodet_s_320_lcnet_postprocessed.onnx',
help="onnx filepath")
parser.add_argument(
'--classfile',
type=str,
default='coco_label.txt',
help="classname filepath")
parser.add_argument(
'--confThreshold', default=0.5, type=float, help='class confidence')
parser.add_argument(
'--nmsThreshold', default=0.6, type=float, help='nms iou thresh')
parser.add_argument(
"--img_fold", dest="img_fold", type=str, default="./imgs")
parser.add_argument(
"--result_fold", dest="result_fold", type=str, default="results")
args = parser.parse_args()
net = PicoDet(
args.modelpath,
args.classfile,
prob_threshold=args.confThreshold,
iou_threshold=args.nmsThreshold)
net.detect_folder(args.img_fold, args.result_fold)
print(
f'infer results in ./deploy/third_engine/demo_onnxruntime/{args.result_fold}'
)