Commit fccfdfa5 authored by dlyrm

update code

parent dcc7bf4f
cmake_minimum_required(VERSION 3.4.1)
set(CMAKE_CXX_STANDARD 14)
project(picodet_demo)
find_package(OpenCV REQUIRED)
find_package(InferenceEngine REQUIRED)
find_package(ngraph REQUIRED)
include_directories(
${OpenCV_INCLUDE_DIRS}
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_BINARY_DIR}
)
add_executable(picodet_demo main.cpp picodet_openvino.cpp)
target_link_libraries(
picodet_demo
${InferenceEngine_LIBRARIES}
${NGRAPH_LIBRARIES}
${OpenCV_LIBS}
)
# PicoDet OpenVINO Demo
This folder provides PicoDet inference code using
[Intel's OpenVINO Toolkit](https://software.intel.com/content/www/us/en/develop/tools/openvino-toolkit.html). Most of the implementation in this folder is the same as in *demo_ncnn*.
**Recommended:** install from the offline `.tar.gz` package instead of the GitHub method: [link](https://registrationcenter-download.intel.com/akdlm/irc_nas/18096/l_openvino_toolkit_p_2021.4.689.tgz).
## Install OpenVINO Toolkit
Go to the [OpenVINO HomePage](https://software.intel.com/content/www/us/en/develop/tools/openvino-toolkit.html), download a suitable version, and install it.
Follow the official Get Started guides: https://docs.openvinotoolkit.org/latest/get_started_guides.html
## Set the Environment Variables
### Windows:
Run this command in cmd (every time before using OpenVINO):
```cmd
<INSTALL_DIR>\openvino_2021\bin\setupvars.bat
```
Or set the system environment variables once and for all:
Name |Value
:--------------------:|:--------:
INTEL_OPENVINO_DIR | <INSTALL_DIR>\openvino_2021
INTEL_CVSDK_DIR | %INTEL_OPENVINO_DIR%
InferenceEngine_DIR | %INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\share
HDDL_INSTALL_DIR | %INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\external\hddl
ngraph_DIR | %INTEL_OPENVINO_DIR%\deployment_tools\ngraph\cmake
And add this to `Path`:
```
%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\bin\intel64\Debug;%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\bin\intel64\Release;%HDDL_INSTALL_DIR%\bin;%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\external\tbb\bin;%INTEL_OPENVINO_DIR%\deployment_tools\ngraph\lib
```
### Linux
Run this command in the shell (every time before using OpenVINO):
```shell
source /opt/intel/openvino_2021/bin/setupvars.sh
```
Or edit `~/.bashrc`:
```shell
vi ~/.bashrc
```
Add this line to the end of the file:
```shell
source /opt/intel/openvino_2021/bin/setupvars.sh
```
## Convert model
Convert to OpenVINO
``` shell
cd <INSTALL_DIR>/openvino_2021/deployment_tools/model_optimizer
```
Install the requirements for the conversion tool:
```shell
cd ./install_prerequisites
sudo ./install_prerequisites_onnx.sh
```
Then convert the model. Note: `mean_values` and `scale_values` must match the preprocessing settings in the YAML config file used for training.
```shell
python3 mo_onnx.py --input_model <ONNX_MODEL> --mean_values [103.53,116.28,123.675] --scale_values [57.375,57.12,58.395]
```
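For reference, here is a minimal sketch of where those numbers come from, assuming the common ImageNet normalization used by PicoDet configs (RGB mean `[0.485, 0.456, 0.406]`, std `[0.229, 0.224, 0.225]`, pixels scaled to `[0, 1]`): the Model Optimizer values are simply those statistics multiplied by 255 and listed in BGR channel order.
```python
# Sketch: derive --mean_values/--scale_values from assumed YAML NormalizeImage
# settings (RGB order, is_scale: true). Verify against your own config.
rgb_mean = [0.485, 0.456, 0.406]
rgb_std = [0.229, 0.224, 0.225]
# mo consumes 0-255 BGR pixels: scale by 255 and reverse the channel order.
mean_values = [round(m * 255, 3) for m in reversed(rgb_mean)]
scale_values = [round(s * 255, 3) for s in reversed(rgb_std)]
print(mean_values)   # [103.53, 116.28, 123.675]
print(scale_values)  # [57.375, 57.12, 58.395]
```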
## Build
### Windows
```cmd
<OPENVINO_INSTALL_DIR>\openvino_2021\bin\setupvars.bat
mkdir build
cd build
cmake ..
msbuild picodet_demo.vcxproj /p:configuration=release /p:platform=x64
```
### Linux
```shell
source /opt/intel/openvino_2021/bin/setupvars.sh
mkdir build
cd build
cmake ..
make
```
## Run demo
Download the PicoDet OpenVINO model: [PicoDet OpenVINO model download link](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_m_416_openvino.zip).
Move the PicoDet OpenVINO model files to the demo's `weight` folder.
### Edit file
```
step1:
main.cpp
#define image_size 416
...
auto detector = PicoDet("../weight/picodet_m_416.xml");
...
step2:
picodet_openvino.h
#define image_size 416
```
### Webcam
```shell
picodet_demo 0 0
```
### Inference images
```shell
picodet_demo 1 IMAGE_FOLDER/*.jpg
```
### Inference video
```shell
picodet_demo 2 VIDEO_PATH
```
### Benchmark
```shell
picodet_demo 3 0
```
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// reference from https://github.com/RangiLyu/nanodet
#include "picodet_openvino.h"
#include <iostream>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#define image_size 416
struct object_rect {
int x;
int y;
int width;
int height;
};
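// resize_uniform: letterbox resize. Scale the source to fit dst_size while
// keeping its aspect ratio, center it on a black canvas, and report the
// effective (non-padded) region in effect_area so boxes can be mapped back.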
int resize_uniform(cv::Mat &src, cv::Mat &dst, cv::Size dst_size,
object_rect &effect_area) {
int w = src.cols;
int h = src.rows;
int dst_w = dst_size.width;
int dst_h = dst_size.height;
dst = cv::Mat(cv::Size(dst_w, dst_h), CV_8UC3, cv::Scalar(0));
float ratio_src = w * 1.0 / h;
float ratio_dst = dst_w * 1.0 / dst_h;
int tmp_w = 0;
int tmp_h = 0;
if (ratio_src > ratio_dst) {
tmp_w = dst_w;
tmp_h = floor((dst_w * 1.0 / w) * h);
} else if (ratio_src < ratio_dst) {
tmp_h = dst_h;
tmp_w = floor((dst_h * 1.0 / h) * w);
} else {
cv::resize(src, dst, dst_size);
effect_area.x = 0;
effect_area.y = 0;
effect_area.width = dst_w;
effect_area.height = dst_h;
return 0;
}
cv::Mat tmp;
cv::resize(src, tmp, cv::Size(tmp_w, tmp_h));
if (tmp_w != dst_w) {
int index_w = floor((dst_w - tmp_w) / 2.0);
for (int i = 0; i < dst_h; i++) {
memcpy(dst.data + i * dst_w * 3 + index_w * 3, tmp.data + i * tmp_w * 3,
tmp_w * 3);
}
effect_area.x = index_w;
effect_area.y = 0;
effect_area.width = tmp_w;
effect_area.height = tmp_h;
} else if (tmp_h != dst_h) {
int index_h = floor((dst_h - tmp_h) / 2.0);
memcpy(dst.data + index_h * dst_w * 3, tmp.data, tmp_w * tmp_h * 3);
effect_area.x = 0;
effect_area.y = index_h;
effect_area.width = tmp_w;
effect_area.height = tmp_h;
} else {
printf("resize_uniform: unexpected size mismatch\n");
}
return 0;
}
const int color_list[80][3] = {
{216, 82, 24}, {236, 176, 31}, {125, 46, 141}, {118, 171, 47},
{76, 189, 237}, {238, 19, 46}, {76, 76, 76}, {153, 153, 153},
{255, 0, 0}, {255, 127, 0}, {190, 190, 0}, {0, 255, 0},
{0, 0, 255}, {170, 0, 255}, {84, 84, 0}, {84, 170, 0},
{84, 255, 0}, {170, 84, 0}, {170, 170, 0}, {170, 255, 0},
{255, 84, 0}, {255, 170, 0}, {255, 255, 0}, {0, 84, 127},
{0, 170, 127}, {0, 255, 127}, {84, 0, 127}, {84, 84, 127},
{84, 170, 127}, {84, 255, 127}, {170, 0, 127}, {170, 84, 127},
{170, 170, 127}, {170, 255, 127}, {255, 0, 127}, {255, 84, 127},
{255, 170, 127}, {255, 255, 127}, {0, 84, 255}, {0, 170, 255},
{0, 255, 255}, {84, 0, 255}, {84, 84, 255}, {84, 170, 255},
{84, 255, 255}, {170, 0, 255}, {170, 84, 255}, {170, 170, 255},
{170, 255, 255}, {255, 0, 255}, {255, 84, 255}, {255, 170, 255},
{42, 0, 0}, {84, 0, 0}, {127, 0, 0}, {170, 0, 0},
{212, 0, 0}, {255, 0, 0}, {0, 42, 0}, {0, 84, 0},
{0, 127, 0}, {0, 170, 0}, {0, 212, 0}, {0, 255, 0},
{0, 0, 42}, {0, 0, 84}, {0, 0, 127}, {0, 0, 170},
{0, 0, 212}, {0, 0, 255}, {0, 0, 0}, {36, 36, 36},
{72, 72, 72}, {109, 109, 109}, {145, 145, 145}, {182, 182, 182},
{218, 218, 218}, {0, 113, 188}, {80, 182, 188}, {127, 127, 0},
};
void draw_bboxes(const cv::Mat &bgr, const std::vector<BoxInfo> &bboxes,
object_rect effect_roi) {
static const char *class_names[] = {
"person", "bicycle", "car",
"motorcycle", "airplane", "bus",
"train", "truck", "boat",
"traffic light", "fire hydrant", "stop sign",
"parking meter", "bench", "bird",
"cat", "dog", "horse",
"sheep", "cow", "elephant",
"bear", "zebra", "giraffe",
"backpack", "umbrella", "handbag",
"tie", "suitcase", "frisbee",
"skis", "snowboard", "sports ball",
"kite", "baseball bat", "baseball glove",
"skateboard", "surfboard", "tennis racket",
"bottle", "wine glass", "cup",
"fork", "knife", "spoon",
"bowl", "banana", "apple",
"sandwich", "orange", "broccoli",
"carrot", "hot dog", "pizza",
"donut", "cake", "chair",
"couch", "potted plant", "bed",
"dining table", "toilet", "tv",
"laptop", "mouse", "remote",
"keyboard", "cell phone", "microwave",
"oven", "toaster", "sink",
"refrigerator", "book", "clock",
"vase", "scissors", "teddy bear",
"hair drier", "toothbrush"};
cv::Mat image = bgr.clone();
int src_w = image.cols;
int src_h = image.rows;
int dst_w = effect_roi.width;
int dst_h = effect_roi.height;
float width_ratio = (float)src_w / (float)dst_w;
float height_ratio = (float)src_h / (float)dst_h;
for (size_t i = 0; i < bboxes.size(); i++) {
const BoxInfo &bbox = bboxes[i];
cv::Scalar color =
cv::Scalar(color_list[bbox.label][0], color_list[bbox.label][1],
color_list[bbox.label][2]);
cv::rectangle(image,
cv::Rect(cv::Point((bbox.x1 - effect_roi.x) * width_ratio,
(bbox.y1 - effect_roi.y) * height_ratio),
cv::Point((bbox.x2 - effect_roi.x) * width_ratio,
(bbox.y2 - effect_roi.y) * height_ratio)),
color);
char text[256];
snprintf(text, sizeof(text), "%s %.1f%%", class_names[bbox.label],
bbox.score * 100);
int baseLine = 0;
cv::Size label_size =
cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine);
int x = (bbox.x1 - effect_roi.x) * width_ratio;
int y =
(bbox.y1 - effect_roi.y) * height_ratio - label_size.height - baseLine;
if (y < 0)
y = 0;
if (x + label_size.width > image.cols)
x = image.cols - label_size.width;
cv::rectangle(image, cv::Rect(cv::Point(x, y),
cv::Size(label_size.width,
label_size.height + baseLine)),
color, -1);
cv::putText(image, text, cv::Point(x, y + label_size.height),
cv::FONT_HERSHEY_SIMPLEX, 0.4, cv::Scalar(255, 255, 255));
}
cv::imwrite("../predict.jpg", image);
}
int image_demo(PicoDet &detector, const char *imagepath) {
std::vector<std::string> filenames;
cv::glob(imagepath, filenames, false);
for (auto img_name : filenames) {
cv::Mat image = cv::imread(img_name);
if (image.empty()) {
return -1;
}
object_rect effect_roi;
cv::Mat resized_img;
resize_uniform(image, resized_img, cv::Size(image_size, image_size),
effect_roi);
auto results = detector.detect(resized_img, 0.4, 0.5);
draw_bboxes(image, results, effect_roi);
}
return 0;
}
int webcam_demo(PicoDet &detector, int cam_id) {
cv::Mat image;
cv::VideoCapture cap(cam_id);
while (true) {
cap >> image;
if (image.empty()) break;
object_rect effect_roi;
cv::Mat resized_img;
resize_uniform(image, resized_img, cv::Size(image_size, image_size),
effect_roi);
auto results = detector.detect(resized_img, 0.4, 0.5);
draw_bboxes(image, results, effect_roi);
cv::waitKey(1);
}
return 0;
}
int video_demo(PicoDet &detector, const char *path) {
cv::Mat image;
cv::VideoCapture cap(path);
while (true) {
cap >> image;
if (image.empty()) break;
object_rect effect_roi;
cv::Mat resized_img;
resize_uniform(image, resized_img, cv::Size(image_size, image_size),
effect_roi);
auto results = detector.detect(resized_img, 0.4, 0.5);
draw_bboxes(image, results, effect_roi);
cv::waitKey(1);
}
return 0;
}
int benchmark(PicoDet &detector) {
int loop_num = 100;
int warm_up = 8;
double time_min = DBL_MAX;
double time_max = -DBL_MAX;
double time_avg = 0;
cv::Mat image(image_size, image_size, CV_8UC3, cv::Scalar(1, 1, 1));
for (int i = 0; i < warm_up + loop_num; i++) {
auto start = std::chrono::steady_clock::now();
std::vector<BoxInfo> results;
results = detector.detect(image, 0.4, 0.5);
auto end = std::chrono::steady_clock::now();
double time =
std::chrono::duration<double, std::milli>(end - start).count();
if (i >= warm_up) {
time_min = (std::min)(time_min, time);
time_max = (std::max)(time_max, time);
time_avg += time;
}
}
time_avg /= loop_num;
fprintf(stderr, "%20s min = %7.2f max = %7.2f avg = %7.2f\n", "picodet",
time_min, time_max, time_avg);
return 0;
}
int main(int argc, char **argv) {
if (argc != 3) {
fprintf(stderr, "usage: %s [mode] [path]. \n For webcam mode=0, path is "
"cam id; \n For image demo, mode=1, path=xxx/xxx/*.jpg; \n "
"For video, mode=2; \n For benchmark, mode=3 path=0.\n",
argv[0]);
return -1;
}
std::cout << "start init model" << std::endl;
auto detector = PicoDet("../weight/picodet_m_416.xml");
std::cout << "success" << std::endl;
int mode = atoi(argv[1]);
switch (mode) {
case 0: {
int cam_id = atoi(argv[2]);
webcam_demo(detector, cam_id);
break;
}
case 1: {
const char *images = argv[2];
image_demo(detector, images);
break;
}
case 2: {
const char *path = argv[2];
video_demo(detector, path);
break;
}
case 3: {
benchmark(detector);
break;
}
default: {
fprintf(stderr, "usage: %s [mode] [path]. \n For webcam mode=0, path is "
"cam id; \n For image demo, mode=1, path=xxx/xxx/*.jpg; \n "
"For video, mode=2; \n For benchmark, mode=3 path=0.\n",
argv[0]);
break;
}
}
}
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_openvino
#include "picodet_openvino.h"
inline float fast_exp(float x) {
union {
uint32_t i;
float f;
} v{};
v.i = (1 << 23) * (1.4426950409 * x + 126.93490512f);
return v.f;
}
inline float sigmoid(float x) { return 1.0f / (1.0f + fast_exp(-x)); }
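// Numerically stable softmax: shift by the max before exponentiating.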
template <typename _Tp>
int activation_function_softmax(const _Tp *src, _Tp *dst, int length) {
const _Tp alpha = *std::max_element(src, src + length);
_Tp denominator{0};
for (int i = 0; i < length; ++i) {
dst[i] = fast_exp(src[i] - alpha);
denominator += dst[i];
}
for (int i = 0; i < length; ++i) {
dst[i] /= denominator;
}
return 0;
}
PicoDet::PicoDet(const char *model_path) {
InferenceEngine::Core ie;
InferenceEngine::CNNNetwork model = ie.ReadNetwork(model_path);
// prepare input settings
InferenceEngine::InputsDataMap inputs_map(model.getInputsInfo());
input_name_ = inputs_map.begin()->first;
InferenceEngine::InputInfo::Ptr input_info = inputs_map.begin()->second;
// prepare output settings
InferenceEngine::OutputsDataMap outputs_map(model.getOutputsInfo());
for (auto &output_info : outputs_map) {
output_info.second->setPrecision(InferenceEngine::Precision::FP32);
}
// get network
network_ = ie.LoadNetwork(model, "CPU");
infer_request_ = network_.CreateInferRequest();
}
PicoDet::~PicoDet() {}
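// Copy HWC uint8 pixels into the CHW float input blob. No mean/std
// normalization happens here: it is folded into the model by the Model
// Optimizer (--mean_values/--scale_values, see the README).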
void PicoDet::preprocess(cv::Mat &image, InferenceEngine::Blob::Ptr &blob) {
int img_w = image.cols;
int img_h = image.rows;
int channels = 3;
InferenceEngine::MemoryBlob::Ptr mblob =
InferenceEngine::as<InferenceEngine::MemoryBlob>(blob);
if (!mblob) {
THROW_IE_EXCEPTION
<< "We expect the input blob to be inherited from MemoryBlob in "
<< "PicoDet::preprocess, but we were not able to cast it to MemoryBlob";
}
auto mblobHolder = mblob->wmap();
float *blob_data = mblobHolder.as<float *>();
for (size_t c = 0; c < channels; c++) {
for (size_t h = 0; h < img_h; h++) {
for (size_t w = 0; w < img_w; w++) {
blob_data[c * img_w * img_h + h * img_w + w] =
(float)image.at<cv::Vec3b>(h, w)[c];
}
}
}
}
std::vector<BoxInfo> PicoDet::detect(cv::Mat image, float score_threshold,
float nms_threshold) {
InferenceEngine::Blob::Ptr input_blob = infer_request_.GetBlob(input_name_);
preprocess(image, input_blob);
// do inference
infer_request_.Infer();
// get output
std::vector<std::vector<BoxInfo>> results;
results.resize(this->num_class_);
for (const auto &head_info : this->heads_info_) {
const InferenceEngine::Blob::Ptr dis_pred_blob =
infer_request_.GetBlob(head_info.dis_layer);
const InferenceEngine::Blob::Ptr cls_pred_blob =
infer_request_.GetBlob(head_info.cls_layer);
auto mdis_pred =
InferenceEngine::as<InferenceEngine::MemoryBlob>(dis_pred_blob);
auto mdis_pred_holder = mdis_pred->rmap();
const float *dis_pred = mdis_pred_holder.as<const float *>();
auto mcls_pred =
InferenceEngine::as<InferenceEngine::MemoryBlob>(cls_pred_blob);
auto mcls_pred_holder = mcls_pred->rmap();
const float *cls_pred = mcls_pred_holder.as<const float *>();
this->decode_infer(cls_pred, dis_pred, head_info.stride, score_threshold,
results);
}
std::vector<BoxInfo> dets;
for (int i = 0; i < (int)results.size(); i++) {
this->nms(results[i], nms_threshold);
for (auto &box : results[i]) {
dets.push_back(box);
}
}
return dets;
}
void PicoDet::decode_infer(const float *&cls_pred, const float *&dis_pred,
int stride, float threshold,
std::vector<std::vector<BoxInfo>> &results) {
int feature_h = ceil((float)input_size_ / stride);
int feature_w = ceil((float)input_size_ / stride);
for (int idx = 0; idx < feature_h * feature_w; idx++) {
int row = idx / feature_w;
int col = idx % feature_w;
float score = 0;
int cur_label = 0;
for (int label = 0; label < num_class_; label++) {
if (cls_pred[idx * num_class_ + label] > score) {
score = cls_pred[idx * num_class_ + label];
cur_label = label;
}
}
if (score > threshold) {
const float *bbox_pred = dis_pred + idx * (reg_max_ + 1) * 4;
results[cur_label].push_back(
this->disPred2Bbox(bbox_pred, cur_label, score, col, row, stride));
}
}
}
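// disPred2Bbox: decode one GFL/DFL box prediction. Softmax over the
// reg_max_+1 bins per side gives a discrete distribution over distances;
// its expectation, scaled by the stride, is the predicted distance from
// the anchor-point center to each box side.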
BoxInfo PicoDet::disPred2Bbox(const float *&dfl_det, int label, float score,
int x, int y, int stride) {
float ct_x = (x + 0.5) * stride;
float ct_y = (y + 0.5) * stride;
std::vector<float> dis_pred;
dis_pred.resize(4);
for (int i = 0; i < 4; i++) {
float dis = 0;
float *dis_after_sm = new float[reg_max_ + 1];
activation_function_softmax(dfl_det + i * (reg_max_ + 1), dis_after_sm,
reg_max_ + 1);
for (int j = 0; j < reg_max_ + 1; j++) {
dis += j * dis_after_sm[j];
}
dis *= stride;
dis_pred[i] = dis;
delete[] dis_after_sm;
}
float xmin = (std::max)(ct_x - dis_pred[0], .0f);
float ymin = (std::max)(ct_y - dis_pred[1], .0f);
float xmax = (std::min)(ct_x + dis_pred[2], (float)this->input_size_);
float ymax = (std::min)(ct_y + dis_pred[3], (float)this->input_size_);
return BoxInfo{xmin, ymin, xmax, ymax, score, label};
}
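// Greedy NMS: sort by score, then drop any box whose IoU with a kept box
// reaches NMS_THRESH (areas use the +1 pixel convention).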
void PicoDet::nms(std::vector<BoxInfo> &input_boxes, float NMS_THRESH) {
std::sort(input_boxes.begin(), input_boxes.end(),
[](BoxInfo a, BoxInfo b) { return a.score > b.score; });
std::vector<float> vArea(input_boxes.size());
for (int i = 0; i < int(input_boxes.size()); ++i) {
vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1) *
(input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1);
}
for (int i = 0; i < int(input_boxes.size()); ++i) {
for (int j = i + 1; j < int(input_boxes.size());) {
float xx1 = (std::max)(input_boxes[i].x1, input_boxes[j].x1);
float yy1 = (std::max)(input_boxes[i].y1, input_boxes[j].y1);
float xx2 = (std::min)(input_boxes[i].x2, input_boxes[j].x2);
float yy2 = (std::min)(input_boxes[i].y2, input_boxes[j].y2);
float w = (std::max)(float(0), xx2 - xx1 + 1);
float h = (std::max)(float(0), yy2 - yy1 + 1);
float inter = w * h;
float ovr = inter / (vArea[i] + vArea[j] - inter);
if (ovr >= NMS_THRESH) {
input_boxes.erase(input_boxes.begin() + j);
vArea.erase(vArea.begin() + j);
} else {
j++;
}
}
}
}
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_openvino
#ifndef _PICODET_OPENVINO_H_
#define _PICODET_OPENVINO_H_
#include <inference_engine.hpp>
#include <opencv2/core.hpp>
#include <string>
#define image_size 416
typedef struct HeadInfo {
std::string cls_layer;
std::string dis_layer;
int stride;
} HeadInfo;
typedef struct BoxInfo {
float x1;
float y1;
float x2;
float y2;
float score;
int label;
} BoxInfo;
class PicoDet {
public:
PicoDet(const char *param);
~PicoDet();
InferenceEngine::ExecutableNetwork network_;
InferenceEngine::InferRequest infer_request_;
// static bool hasGPU;
std::vector<HeadInfo> heads_info_{
// cls_pred|dis_pred|stride
{"transpose_0.tmp_0", "transpose_1.tmp_0", 8},
{"transpose_2.tmp_0", "transpose_3.tmp_0", 16},
{"transpose_4.tmp_0", "transpose_5.tmp_0", 32},
{"transpose_6.tmp_0", "transpose_7.tmp_0", 64},
};
std::vector<BoxInfo> detect(cv::Mat image, float score_threshold,
float nms_threshold);
private:
void preprocess(cv::Mat &image, InferenceEngine::Blob::Ptr &blob);
void decode_infer(const float *&cls_pred, const float *&dis_pred, int stride,
float threshold,
std::vector<std::vector<BoxInfo>> &results);
BoxInfo disPred2Bbox(const float *&dfl_det, int label, float score, int x,
int y, int stride);
static void nms(std::vector<BoxInfo> &result, float nms_threshold);
std::string input_name_;
int input_size_ = image_size;
int num_class_ = 80;
int reg_max_ = 7;
};
#endif
# PicoDet OpenVINO Benchmark Demo
This folder provides a benchmark demo for measuring PicoDet speed with [Intel's OpenVINO Toolkit](https://software.intel.com/content/www/us/en/develop/tools/openvino-toolkit.html), plus an inference demo for models exported with post-processing.
## Install OpenVINO Toolkit
Go to the [OpenVINO HomePage](https://software.intel.com/content/www/us/en/develop/tools/openvino-toolkit.html), download the matching version, and install it.
This demo uses OpenVINO 2022.1.0, which can be installed directly with:
```shell
pip install openvino==2022.1.0
```
For detailed installation steps, see the [OpenVINO documentation](https://docs.openvinotoolkit.org/latest/get_started_guides.html).
## Benchmark
- Prepare the test model: following the "export and convert model" steps of [PicoDet](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/picodet), export the model without post-processing (`-o export.benchmark=True`) and generate the simplified ONNX model to be tested (it can also be downloaded directly from the links below). Then create an `out_onnxsim` folder in this directory and place the exported ONNX model in it.
- Prepare the test image: by default this demo uses PaddleDetection/demo/[000000014439.jpg](https://github.com/PaddlePaddle/PaddleDetection/blob/develop/demo/000000014439.jpg)
- Run directly in this directory:
```shell
# Linux
python openvino_benchmark.py --img_path ../../../../demo/000000014439.jpg --onnx_path out_onnxsim/picodet_s_320_coco_lcnet.onnx --in_shape 320
# Windows
python openvino_benchmark.py --img_path ..\..\..\..\demo\000000014439.jpg --onnx_path out_onnxsim\picodet_s_320_coco_lcnet.onnx --in_shape 320
```
- Note: `--in_shape` is the input size of the corresponding model; the default is 320.
## Real-image test (network with post-processing, but without NMS)
- Prepare the test model: following the "export and convert model" steps of [PicoDet](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/picodet), export the model **with post-processing** but **without NMS** (`-o export.benchmark=False export.nms=False`) and generate the simplified ONNX model to be tested (it can also be downloaded directly from the links below). Then create an `out_onnxsim_infer` folder in this directory and place the exported ONNX model in it.
- Prepare the test image: by default uses ../../demo_onnxruntime/imgs/[bus.jpg](https://github.com/PaddlePaddle/PaddleDetection/blob/develop/deploy/third_engine/demo_onnxruntime/imgs/bus.jpg)
```shell
# Linux
python openvino_infer.py --img_path ../../demo_onnxruntime/imgs/bus.jpg --onnx_path out_onnxsim_infer/picodet_s_320_postproccesed_woNMS.onnx --in_shape 320
# Windows
python openvino_infer.py --img_path ..\..\demo_onnxruntime\imgs\bus.jpg --onnx_path out_onnxsim_infer\picodet_s_320_postproccesed_woNMS.onnx --in_shape 320
```
### Real-image test (network without post-processing)
```shell
# Linux
python openvino_benchmark.py --benchmark 0 --img_path ../../../../demo/000000014439.jpg --onnx_path out_onnxsim/picodet_s_320_coco_lcnet.onnx --in_shape 320
# Windows
python openvino_benchmark.py --benchmark 0 --img_path ..\..\..\..\demo\000000014439.jpg --onnx_path out_onnxsim\picodet_s_320_coco_lcnet.onnx --in_shape 320
```
- Result:
<div align="center">
<img src="../../../../docs/images/res.jpg" height="500px" >
</div>
## Benchmark results
- Measured latency:
| Model | Input size | ONNX | Latency<sup><small>[CPU](#latency)</small></sup> |
| :-------- | :--------: | :---------------------: | :----------------: |
| PicoDet-XS | 320*320 | [(w/ post-processing; w/o NMS)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_xs_320_lcnet_postproccesed_woNMS.onnx) &#124; [(w/o post-processing)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_xs_320_coco_lcnet.onnx) | 3.9ms |
| PicoDet-XS | 416*416 | [(w/ post-processing; w/o NMS)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_xs_416_lcnet_postproccesed_woNMS.onnx) &#124; [(w/o post-processing)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_xs_416_coco_lcnet.onnx) | 6.1ms |
| PicoDet-S | 320*320 | [(w/ post-processing; w/o NMS)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_s_320_lcnet_postproccesed_woNMS.onnx) &#124; [(w/o post-processing)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_s_320_coco_lcnet.onnx) | 4.8ms |
| PicoDet-S | 416*416 | [(w/ post-processing; w/o NMS)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_s_416_lcnet_postproccesed_woNMS.onnx) &#124; [(w/o post-processing)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_s_416_coco_lcnet.onnx) | 6.6ms |
| PicoDet-M | 320*320 | [(w/ post-processing; w/o NMS)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_m_320_lcnet_postproccesed_woNMS.onnx) &#124; [(w/o post-processing)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_m_320_coco_lcnet.onnx) | 8.2ms |
| PicoDet-M | 416*416 | [(w/ post-processing; w/o NMS)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_m_416_lcnet_postproccesed_woNMS.onnx) &#124; [(w/o post-processing)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_m_416_coco_lcnet.onnx) | 12.7ms |
| PicoDet-L | 320*320 | [(w/ post-processing; w/o NMS)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_l_320_lcnet_postproccesed_woNMS.onnx) &#124; [(w/o post-processing)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_l_320_coco_lcnet.onnx) | 11.5ms |
| PicoDet-L | 416*416 | [(w/ post-processing; w/o NMS)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_l_416_lcnet_postproccesed_woNMS.onnx) &#124; [(w/o post-processing)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_l_416_coco_lcnet.onnx) | 20.7ms |
| PicoDet-L | 640*640 | [(w/ post-processing; w/o NMS)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_l_640_lcnet_postproccesed_woNMS.onnx) &#124; [(w/o post-processing)](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_l_640_coco_lcnet.onnx) | 62.5ms |
- <a name="latency">Test environment:</a> Intel Core i7-10750H CPU.
person
bicycle
car
motorbike
aeroplane
bus
train
truck
boat
traffic light
fire hydrant
stop sign
parking meter
bench
bird
cat
dog
horse
sheep
cow
elephant
bear
zebra
giraffe
backpack
umbrella
handbag
tie
suitcase
frisbee
skis
snowboard
sports ball
kite
baseball bat
baseball glove
skateboard
surfboard
tennis racket
bottle
wine glass
cup
fork
knife
spoon
bowl
banana
apple
sandwich
orange
broccoli
carrot
hot dog
pizza
donut
cake
chair
sofa
pottedplant
bed
diningtable
toilet
tvmonitor
laptop
mouse
remote
keyboard
cell phone
microwave
oven
toaster
sink
refrigerator
book
clock
vase
scissors
teddy bear
hair drier
toothbrush
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import cv2
import numpy as np
import time
import argparse
from scipy.special import softmax
from openvino.runtime import Core
def image_preprocess(img_path, re_shape):
img = cv2.imread(img_path)
img = cv2.resize(
img, (re_shape, re_shape), interpolation=cv2.INTER_LANCZOS4)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = np.transpose(img, [2, 0, 1]) / 255
img = np.expand_dims(img, 0)
img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
img -= img_mean
img /= img_std
return img.astype(np.float32)
def draw_box(img, results, class_label, scale_x, scale_y):
label_list = list(
map(lambda x: x.strip(), open(class_label, 'r').readlines()))
for i in range(len(results)):
print(label_list[int(results[i][0])], ':', results[i][1])
bbox = results[i, 2:]
label_id = int(results[i, 0])
score = results[i, 1]
if (score > 0.20):
xmin, ymin, xmax, ymax = [
int(bbox[0] * scale_x), int(bbox[1] * scale_y),
int(bbox[2] * scale_x), int(bbox[3] * scale_y)
]
cv2.rectangle(img, (xmin, ymin), (xmax, ymax), (0, 255, 0), 3)
font = cv2.FONT_HERSHEY_SIMPLEX
label_text = label_list[label_id]
cv2.rectangle(img, (xmin, ymin), (xmax, ymin - 60), (0, 255, 0), -1)
cv2.putText(img, "#" + label_text, (xmin, ymin - 10), font, 1,
(255, 255, 255), 2, cv2.LINE_AA)
cv2.putText(img,
str(round(score, 3)), (xmin, ymin - 40), font, 0.8,
(255, 255, 255), 2, cv2.LINE_AA)
return img
def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200):
"""
Args:
box_scores (N, 5): boxes in corner-form and probabilities.
iou_threshold: intersection over union threshold.
top_k: keep top_k results. If k <= 0, keep all the results.
candidate_size: only consider the candidates with the highest scores.
Returns:
picked: a list of indexes of the kept boxes
"""
scores = box_scores[:, -1]
boxes = box_scores[:, :-1]
picked = []
indexes = np.argsort(scores)
indexes = indexes[-candidate_size:]
while len(indexes) > 0:
current = indexes[-1]
picked.append(current)
if 0 < top_k == len(picked) or len(indexes) == 1:
break
current_box = boxes[current, :]
indexes = indexes[:-1]
rest_boxes = boxes[indexes, :]
iou = iou_of(
rest_boxes,
np.expand_dims(
current_box, axis=0), )
indexes = indexes[iou <= iou_threshold]
return box_scores[picked, :]
def iou_of(boxes0, boxes1, eps=1e-5):
"""Return intersection-over-union (Jaccard index) of boxes.
Args:
boxes0 (N, 4): ground truth boxes.
boxes1 (N or 1, 4): predicted boxes.
eps: a small number to avoid 0 as denominator.
Returns:
iou (N): IoU values.
"""
overlap_left_top = np.maximum(boxes0[..., :2], boxes1[..., :2])
overlap_right_bottom = np.minimum(boxes0[..., 2:], boxes1[..., 2:])
overlap_area = area_of(overlap_left_top, overlap_right_bottom)
area0 = area_of(boxes0[..., :2], boxes0[..., 2:])
area1 = area_of(boxes1[..., :2], boxes1[..., 2:])
return overlap_area / (area0 + area1 - overlap_area + eps)
def area_of(left_top, right_bottom):
"""Compute the areas of rectangles given two corners.
Args:
left_top (N, 2): left top corner.
right_bottom (N, 2): right bottom corner.
Returns:
area (N): return the area.
"""
hw = np.clip(right_bottom - left_top, 0.0, None)
return hw[..., 0] * hw[..., 1]
class PicoDetPostProcess(object):
"""
Args:
input_shape (int): network input image size
ori_shape (int): ori image shape of before padding
scale_factor (float): scale factor of ori image
enable_mkldnn (bool): whether to open MKLDNN
"""
def __init__(self,
input_shape,
ori_shape,
scale_factor,
strides=[8, 16, 32, 64],
score_threshold=0.4,
nms_threshold=0.5,
nms_top_k=1000,
keep_top_k=100):
self.ori_shape = ori_shape
self.input_shape = input_shape
self.scale_factor = scale_factor
self.strides = strides
self.score_threshold = score_threshold
self.nms_threshold = nms_threshold
self.nms_top_k = nms_top_k
self.keep_top_k = keep_top_k
def warp_boxes(self, boxes, ori_shape):
"""Apply transform to boxes
"""
width, height = ori_shape[1], ori_shape[0]
n = len(boxes)
if n:
# warp points
xy = np.ones((n * 4, 3))
xy[:, :2] = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(
n * 4, 2) # x1y1, x2y2, x1y2, x2y1
# xy = xy @ M.T # transform
xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8) # rescale
# create new boxes
x = xy[:, [0, 2, 4, 6]]
y = xy[:, [1, 3, 5, 7]]
xy = np.concatenate(
(x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T
# clip boxes
xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width)
xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height)
return xy.astype(np.float32)
else:
return boxes
def __call__(self, scores, raw_boxes):
batch_size = raw_boxes[0].shape[0]
reg_max = int(raw_boxes[0].shape[-1] / 4 - 1)
out_boxes_num = []
out_boxes_list = []
for batch_id in range(batch_size):
# generate centers
decode_boxes = []
select_scores = []
for stride, box_distribute, score in zip(self.strides, raw_boxes,
scores):
box_distribute = box_distribute[batch_id]
score = score[batch_id]
# centers
fm_h = self.input_shape[0] / stride
fm_w = self.input_shape[1] / stride
h_range = np.arange(fm_h)
w_range = np.arange(fm_w)
ww, hh = np.meshgrid(w_range, h_range)
ct_row = (hh.flatten() + 0.5) * stride
ct_col = (ww.flatten() + 0.5) * stride
center = np.stack((ct_col, ct_row, ct_col, ct_row), axis=1)
# box distribution to distance
reg_range = np.arange(reg_max + 1)
box_distance = box_distribute.reshape((-1, reg_max + 1))
box_distance = softmax(box_distance, axis=1)
box_distance = box_distance * np.expand_dims(reg_range, axis=0)
box_distance = np.sum(box_distance, axis=1).reshape((-1, 4))
box_distance = box_distance * stride
# top K candidate
topk_idx = np.argsort(score.max(axis=1))[::-1]
topk_idx = topk_idx[:self.nms_top_k]
center = center[topk_idx]
score = score[topk_idx]
box_distance = box_distance[topk_idx]
# decode box
decode_box = center + [-1, -1, 1, 1] * box_distance
select_scores.append(score)
decode_boxes.append(decode_box)
# nms
bboxes = np.concatenate(decode_boxes, axis=0)
confidences = np.concatenate(select_scores, axis=0)
picked_box_probs = []
picked_labels = []
for class_index in range(0, confidences.shape[1]):
probs = confidences[:, class_index]
mask = probs > self.score_threshold
probs = probs[mask]
if probs.shape[0] == 0:
continue
subset_boxes = bboxes[mask, :]
box_probs = np.concatenate(
[subset_boxes, probs.reshape(-1, 1)], axis=1)
box_probs = hard_nms(
box_probs,
iou_threshold=self.nms_threshold,
top_k=self.keep_top_k, )
picked_box_probs.append(box_probs)
picked_labels.extend([class_index] * box_probs.shape[0])
if len(picked_box_probs) == 0:
out_boxes_list.append(np.empty((0, 6)))
out_boxes_num.append(0)
else:
picked_box_probs = np.concatenate(picked_box_probs)
# resize output boxes
picked_box_probs[:, :4] = self.warp_boxes(
picked_box_probs[:, :4], self.ori_shape[batch_id])
im_scale = np.concatenate([
self.scale_factor[batch_id][::-1],
self.scale_factor[batch_id][::-1]
])
picked_box_probs[:, :4] /= im_scale
# class, score, box
out_boxes_list.append(
np.concatenate(
[
np.expand_dims(
np.array(picked_labels),
axis=-1), np.expand_dims(
picked_box_probs[:, 4], axis=-1),
picked_box_probs[:, :4]
],
axis=1))
out_boxes_num.append(len(picked_labels))
out_boxes_list = np.concatenate(out_boxes_list, axis=0)
out_boxes_num = np.asarray(out_boxes_num).astype(np.int32)
return out_boxes_list, out_boxes_num
def detect(img_file, compiled_model, re_shape, class_label):
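# NOTE: inference below uses the module-level `test_image` prepared in
# __main__; `img_file` is re-read only for visualization.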
output = compiled_model.infer_new_request({0: test_image})
result_ie = list(output.values())
test_im_shape = np.array([[re_shape, re_shape]]).astype('float32')
test_scale_factor = np.array([[1, 1]]).astype('float32')
np_score_list = []
np_boxes_list = []
num_outs = int(len(result_ie) / 2)
for out_idx in range(num_outs):
np_score_list.append(result_ie[out_idx])
np_boxes_list.append(result_ie[out_idx + num_outs])
postprocess = PicoDetPostProcess(test_image.shape[2:], test_im_shape,
test_scale_factor)
np_boxes, np_boxes_num = postprocess(np_score_list, np_boxes_list)
image = cv2.imread(img_file, 1)
scale_x = image.shape[1] / test_image.shape[3]
scale_y = image.shape[0] / test_image.shape[2]
res_image = draw_box(image, np_boxes, class_label, scale_x, scale_y)
cv2.imwrite('res.jpg', res_image)
cv2.imshow("res", res_image)
cv2.waitKey()
def benchmark(test_image, compiled_model):
# benchmark
loop_num = 100
warm_up = 8
timeall = 0
time_min = float("inf")
time_max = float('-inf')
for i in range(loop_num + warm_up):
time0 = time.time()
#perform the inference step
output = compiled_model.infer_new_request({0: test_image})
time1 = time.time()
timed = time1 - time0
if i >= warm_up:
timeall = timeall + timed
time_min = min(time_min, timed)
time_max = max(time_max, timed)
time_avg = timeall / loop_num
print('inference_time(ms): min={}, max={}, avg={}'.format(
round(time_min * 1000, 2),
round(time_max * 1000, 2), round(time_avg * 1000, 2)))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--benchmark', type=int, default=1, help="0:detect; 1:benchmark")
parser.add_argument(
'--img_path',
type=str,
default='../../../../demo/000000014439.jpg',
help="image path")
parser.add_argument(
'--onnx_path',
type=str,
default='out_onnxsim/picodet_s_320_processed.onnx',
help="onnx filepath")
parser.add_argument('--in_shape', type=int, default=320, help="input_size")
parser.add_argument(
'--class_label',
type=str,
default='coco_label.txt',
help="class label file")
args = parser.parse_args()
ie = Core()
net = ie.read_model(args.onnx_path)
test_image = image_preprocess(args.img_path, args.in_shape)
compiled_model = ie.compile_model(net, 'CPU')
if args.benchmark == 0:
detect(args.img_path, compiled_model, args.in_shape, args.class_label)
if args.benchmark == 1:
benchmark(test_image, compiled_model)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import cv2
import numpy as np
import argparse
from scipy.special import softmax
from openvino.runtime import Core
def image_preprocess(img_path, re_shape):
img = cv2.imread(img_path)
img = cv2.resize(
img, (re_shape, re_shape), interpolation=cv2.INTER_LANCZOS4)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = np.transpose(img, [2, 0, 1]) / 255
img = np.expand_dims(img, 0)
img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
img -= img_mean
img /= img_std
return img.astype(np.float32)
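# Generate a PASCAL VOC-style color map: the bits of each class id are
# spread across the R/G/B channels to give each class a distinct color.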
def get_color_map_list(num_classes):
color_map = num_classes * [0, 0, 0]
for i in range(0, num_classes):
j = 0
lab = i
while lab:
color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j))
color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j))
color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j))
j += 1
lab >>= 3
color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)]
return color_map
def draw_box(srcimg, results, class_label):
label_list = list(
map(lambda x: x.strip(), open(class_label, 'r').readlines()))
color_list = get_color_map_list(len(label_list))
clsid2color = {}
for i in range(len(results)):
classid, conf = int(results[i, 0]), results[i, 1]
xmin, ymin, xmax, ymax = int(results[i, 2]), int(results[i, 3]), int(
results[i, 4]), int(results[i, 5])
if classid not in clsid2color:
clsid2color[classid] = color_list[classid]
color = tuple(clsid2color[classid])
cv2.rectangle(srcimg, (xmin, ymin), (xmax, ymax), color, thickness=2)
print(label_list[classid] + ': ' + str(round(conf, 3)))
cv2.putText(
srcimg,
label_list[classid] + ':' + str(round(conf, 3)), (xmin, ymin - 10),
cv2.FONT_HERSHEY_SIMPLEX,
0.8, (0, 255, 0),
thickness=2)
return srcimg
def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200):
"""
Args:
box_scores (N, 5): boxes in corner-form and probabilities.
iou_threshold: intersection over union threshold.
top_k: keep top_k results. If k <= 0, keep all the results.
candidate_size: only consider the candidates with the highest scores.
Returns:
picked: a list of indexes of the kept boxes
"""
scores = box_scores[:, -1]
boxes = box_scores[:, :-1]
picked = []
indexes = np.argsort(scores)
indexes = indexes[-candidate_size:]
while len(indexes) > 0:
current = indexes[-1]
picked.append(current)
if 0 < top_k == len(picked) or len(indexes) == 1:
break
current_box = boxes[current, :]
indexes = indexes[:-1]
rest_boxes = boxes[indexes, :]
iou = iou_of(
rest_boxes,
np.expand_dims(
current_box, axis=0), )
indexes = indexes[iou <= iou_threshold]
return box_scores[picked, :]
def iou_of(boxes0, boxes1, eps=1e-5):
"""Return intersection-over-union (Jaccard index) of boxes.
Args:
boxes0 (N, 4): ground truth boxes.
boxes1 (N or 1, 4): predicted boxes.
eps: a small number to avoid 0 as denominator.
Returns:
iou (N): IoU values.
"""
overlap_left_top = np.maximum(boxes0[..., :2], boxes1[..., :2])
overlap_right_bottom = np.minimum(boxes0[..., 2:], boxes1[..., 2:])
overlap_area = area_of(overlap_left_top, overlap_right_bottom)
area0 = area_of(boxes0[..., :2], boxes0[..., 2:])
area1 = area_of(boxes1[..., :2], boxes1[..., 2:])
return overlap_area / (area0 + area1 - overlap_area + eps)
def area_of(left_top, right_bottom):
"""Compute the areas of rectangles given two corners.
Args:
left_top (N, 2): left top corner.
right_bottom (N, 2): right bottom corner.
Returns:
area (N): return the area.
"""
hw = np.clip(right_bottom - left_top, 0.0, None)
return hw[..., 0] * hw[..., 1]
class PicoDetNMS(object):
"""
Args:
input_shape (int): network input image size
scale_factor (float): scale factor of ori image
"""
def __init__(self,
input_shape,
scale_x,
scale_y,
strides=[8, 16, 32, 64],
score_threshold=0.4,
nms_threshold=0.5,
nms_top_k=1000,
keep_top_k=100):
self.input_shape = input_shape
self.scale_x = scale_x
self.scale_y = scale_y
self.strides = strides
self.score_threshold = score_threshold
self.nms_threshold = nms_threshold
self.nms_top_k = nms_top_k
self.keep_top_k = keep_top_k
def __call__(self, decode_boxes, select_scores):
batch_size = 1
out_boxes_list = []
for batch_id in range(batch_size):
# nms
bboxes = np.concatenate(decode_boxes, axis=0)
confidences = np.concatenate(select_scores, axis=0)
picked_box_probs = []
picked_labels = []
for class_index in range(0, confidences.shape[1]):
probs = confidences[:, class_index]
mask = probs > self.score_threshold
probs = probs[mask]
if probs.shape[0] == 0:
continue
subset_boxes = bboxes[mask, :]
box_probs = np.concatenate(
[subset_boxes, probs.reshape(-1, 1)], axis=1)
box_probs = hard_nms(
box_probs,
iou_threshold=self.nms_threshold,
top_k=self.keep_top_k, )
picked_box_probs.append(box_probs)
picked_labels.extend([class_index] * box_probs.shape[0])
if len(picked_box_probs) == 0:
out_boxes_list.append(np.empty((0, 6)))
else:
picked_box_probs = np.concatenate(picked_box_probs)
# resize output boxes
picked_box_probs[:, 0] *= self.scale_x
picked_box_probs[:, 2] *= self.scale_x
picked_box_probs[:, 1] *= self.scale_y
picked_box_probs[:, 3] *= self.scale_y
# class, score, box
out_boxes_list.append(
np.concatenate(
[
np.expand_dims(
np.array(picked_labels),
axis=-1), np.expand_dims(
picked_box_probs[:, 4], axis=-1),
picked_box_probs[:, :4]
],
axis=1))
out_boxes_list = np.concatenate(out_boxes_list, axis=0)
return out_boxes_list
def detect(img_file, compiled_model, class_label):
output = compiled_model.infer_new_request({0: test_image})
result_ie = list(output.values())
decode_boxes = []
select_scores = []
num_outs = int(len(result_ie) / 2)
for out_idx in range(num_outs):
decode_boxes.append(result_ie[out_idx])
select_scores.append(result_ie[out_idx + num_outs])
image = cv2.imread(img_file, 1)
scale_x = image.shape[1] / test_image.shape[3]
scale_y = image.shape[0] / test_image.shape[2]
nms = PicoDetNMS(test_image.shape[2:], scale_x, scale_y)
np_boxes = nms(decode_boxes, select_scores)
res_image = draw_box(image, np_boxes, class_label)
cv2.imwrite('res.jpg', res_image)
cv2.imshow("res", res_image)
cv2.waitKey()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--img_path',
type=str,
default='../../demo_onnxruntime/imgs/bus.jpg',
help="image path")
parser.add_argument(
'--onnx_path',
type=str,
default='out_onnxsim_infer/picodet_s_320_postproccesed_woNMS.onnx',
help="onnx filepath")
parser.add_argument('--in_shape', type=int, default=320, help="input_size")
parser.add_argument(
'--class_label',
type=str,
default='coco_label.txt',
help="class label file")
args = parser.parse_args()
ie = Core()
net = ie.read_model(args.onnx_path)
test_image = image_preprocess(args.img_path, args.in_shape)
compiled_model = ie.compile_model(net, 'CPU')
detect(args.img_path, compiled_model, args.class_label)
cmake_minimum_required(VERSION 3.4.1)
set(CMAKE_CXX_STANDARD 14)
project(tinypose_demo)
find_package(OpenCV REQUIRED)
find_package(InferenceEngine REQUIRED)
find_package(ngraph REQUIRED)
include_directories(
${OpenCV_INCLUDE_DIRS}
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_BINARY_DIR}
)
add_executable(tinypose_demo main.cpp picodet_openvino.cpp keypoint_detector.cpp keypoint_postprocess.cpp)
target_link_libraries(
tinypose_demo
${InferenceEngine_LIBRARIES}
${NGRAPH_LIBRARIES}
${OpenCV_LIBS}
)
# TinyPose OpenVINO Demo
This folder provides TinyPose inference code using
[Intel's OpenVINO Toolkit](https://software.intel.com/content/www/us/en/develop/tools/openvino-toolkit.html). Most of the implementation in this folder is the same as in *demo_ncnn*.
**Recommended**
1. Install from the offline `.tar.gz` package instead of the GitHub method: [link](https://registrationcenter-download.intel.com/akdlm/irc_nas/18096/l_openvino_toolkit_p_2021.4.689.tgz).
2. You can also deploy OpenVINO with Docker:
```
docker pull openvino/ubuntu18_dev:2021.4.1
```
## Install OpenVINO Toolkit
Go to the [OpenVINO HomePage](https://software.intel.com/content/www/us/en/develop/tools/openvino-toolkit.html), download a suitable version, and install it.
Follow the official Get Started guides: https://docs.openvinotoolkit.org/latest/get_started_guides.html
## Set the Environment Variables
### Windows:
Run this command in cmd (every time before using OpenVINO):
```cmd
<INSTALL_DIR>\openvino_2021\bin\setupvars.bat
```
Or set the system environment variables once and for all:
Name |Value
:--------------------:|:--------:
INTEL_OPENVINO_DIR | <INSTALL_DIR>\openvino_2021
INTEL_CVSDK_DIR | %INTEL_OPENVINO_DIR%
InferenceEngine_DIR | %INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\share
HDDL_INSTALL_DIR | %INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\external\hddl
ngraph_DIR | %INTEL_OPENVINO_DIR%\deployment_tools\ngraph\cmake
And add this to `Path`:
```
%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\bin\intel64\Debug;%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\bin\intel64\Release;%HDDL_INSTALL_DIR%\bin;%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\external\tbb\bin;%INTEL_OPENVINO_DIR%\deployment_tools\ngraph\lib
```
### Linux
Run this command in the shell (every time before using OpenVINO):
```shell
source /opt/intel/openvino_2021/bin/setupvars.sh
```
Or edit `~/.bashrc`:
```shell
vi ~/.bashrc
```
Add this line to the end of the file:
```shell
source /opt/intel/openvino_2021/bin/setupvars.sh
```
## Convert model
**1. Convert to ONNX**
Create `picodet_m_416_coco.onnx` and `tinypose256.onnx`.
Example:
```shell
modelName=picodet_m_416_coco
# export model
python tools/export_model.py \
-c configs/picodet/${modelName}.yml \
-o weights=${modelName}.pdparams \
--output_dir=inference_model
# convert to onnx
paddle2onnx --model_dir inference_model/${modelName} \
--model_filename model.pdmodel \
--params_filename model.pdiparams \
--opset_version 11 \
--save_file ${modelName}.onnx
# onnxsim
python -m onnxsim ${modelName}.onnx ${modelName}_sim.onnx
```
**2. Convert to OpenVINO**
``` shell
cd <INSTALL_DIR>/openvino_2021/deployment_tools/model_optimizer
```
Install the requirements for the conversion tool:
```shell
cd ./install_prerequisites
sudo ./install_prerequisites_onnx.sh
```
Then convert the model. Note: `mean_values` and `scale_values` must match the preprocessing settings in the YAML config file used for training.
```shell
mo_onnx.py --input_model <ONNX_MODEL> --mean_values [103.53,116.28,123.675] --scale_values [57.375,57.12,58.395] --input_shape [1,3,256,192]
```
**Note: newer versions of the OpenVINO conversion tools may produce an error in the Resize op. If you run into this problem, try version openvino_2021.4.689.**
## Build
### Windows
```cmd
<OPENVINO_INSTALL_DIR>\openvino_2021\bin\setupvars.bat
mkdir build
cd build
cmake ..
msbuild tinypose_demo.vcxproj /p:configuration=release /p:platform=x64
```
### Linux
```shell
source /opt/intel/openvino_2021/bin/setupvars.sh
mkdir build
cd build
cmake ..
make
```
## Run demo
Download the PicoDet OpenVINO model: [PicoDet OpenVINO model download link](https://paddledet.bj.bcebos.com/deploy/third_engine/picodet_m_416_openvino.zip).
Download the TinyPose OpenVINO model: [TinyPose OpenVINO model download link](https://bj.bcebos.com/v1/paddledet/deploy/third_engine/demo_openvino_kpts.tar.gz); the original PaddlePaddle model is [Tinypose256](https://bj.bcebos.com/v1/paddledet/models/keypoint/tinypose_enhance/tinypose_256x192.pdparams).
Move the PicoDet and TinyPose OpenVINO model files to the demo's `weight` folder.
Note:
1. The model output node names may change with new versions of paddle/paddle2onnx/onnxsim/openvino; check your own model's output nodes if the code cannot find "conv2d_441.tmp_1"/"argmax_0.tmp_0".
2. If you hit the error "Cannot find blob with name: transpose_1.tmp_0", your PicoDet model is an old version. You can modify the code below to fix it.
```
#picodet_openvino.h line 50-54
std::vector<HeadInfo> heads_info_{
// cls_pred|dis_pred|stride
{"transpose_0.tmp_0", "transpose_1.tmp_0", 8},
{"transpose_2.tmp_0", "transpose_3.tmp_0", 16},
{"transpose_4.tmp_0", "transpose_5.tmp_0", 32},
{"transpose_6.tmp_0", "transpose_7.tmp_0", 64},
};
modify to:
std::vector<HeadInfo> heads_info_{
// cls_pred|dis_pred|stride
{"save_infer_model/scale_0.tmp_1", "save_infer_model/scale_4.tmp_1", 8},
{"save_infer_model/scale_1.tmp_1", "save_infer_model/scale_5.tmp_1", 16},
{"save_infer_model/scale_2.tmp_1", "save_infer_model/scale_6.tmp_1", 32},
{"save_infer_model/scale_3.tmp_1", "save_infer_model/scale_7.tmp_1", 64},
};
```
3. You can view your ONNX model with [Netron](https://netron.app/), or list its output node names with the script sketched below.
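A minimal sketch for listing a model's output node names (this assumes the `onnx` Python package is installed; the filename is a placeholder):
```python
# List the output node names of an ONNX model (filename is a placeholder).
import onnx

model = onnx.load("picodet_m_416_coco_sim.onnx")
print([output.name for output in model.graph.output])
```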
### Edit file
```
step1:
main.cpp
#define image_size 416
...
cv::Mat image(256, 192, CV_8UC3, cv::Scalar(1, 1, 1));
std::vector<float> center = {128, 96};
std::vector<float> scale = {256, 192};
...
auto detector = PicoDet("../weight/picodet_m_416.xml");
auto kpts_detector = new KeyPointDetector("../weight/tinypose256.xml", -1, 256, 192);
...
step2:
picodet_openvino.h
#define image_size 416
```
### Run
Run command:
``` shell
./tinypose_demo [mode] [image_file]
```
| param | detail |
| ---- | ---- |
| --mode | input mode: 0 = camera; 1 = image; 2 = video; 3 = benchmark |
| --image_file | input image path |
#### Webcam
```shell
tinypose_demo 0 0
```
#### Inference images
```shell
tinypose_demo 1 IMAGE_FOLDER/*.jpg
```
#### Inference video
```shell
tinypose_demo 2 VIDEO_PATH
```
#### Benchmark
```shell
tinypose_demo 3 0
```
Platform: Intel(R) Xeon(R) CPU E5-2650 v4 @ 2.20GHz × 24 cores
Model: [Tinypose256_Openvino](https://paddledet.bj.bcebos.com/deploy/third_engine/tinypose_256_openvino.zip)
| param | Min | Max | Avg |
| ------------- | ----- | ----- | ----- |
| infer time(s) | 0.018 | 0.062 | 0.028 |
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <sstream>
// for setprecision
#include <chrono>
#include <iomanip>
#include "keypoint_detector.h"
namespace PaddleDetection {
// Visualization of keypoint detection results
cv::Mat VisualizeKptsResult(const cv::Mat& img,
const std::vector<KeyPointResult>& results,
const std::vector<int>& colormap,
float threshold) {
const int edge[][2] = {{0, 1},
{0, 2},
{1, 3},
{2, 4},
{3, 5},
{4, 6},
{5, 7},
{6, 8},
{7, 9},
{8, 10},
{5, 11},
{6, 12},
{11, 13},
{12, 14},
{13, 15},
{14, 16},
{11, 12}};
cv::Mat vis_img = img.clone();
for (int batchid = 0; batchid < results.size(); batchid++) {
for (int i = 0; i < results[batchid].num_joints; i++) {
if (results[batchid].keypoints[i * 3] > threshold) {
int x_coord = int(results[batchid].keypoints[i * 3 + 1]);
int y_coord = int(results[batchid].keypoints[i * 3 + 2]);
cv::circle(vis_img,
cv::Point2d(x_coord, y_coord),
1,
cv::Scalar(0, 0, 255),
2);
}
}
for (int i = 0; i < int(sizeof(edge) / sizeof(edge[0])); i++) {
if (results[batchid].keypoints[edge[i][0] * 3] > threshold &&
results[batchid].keypoints[edge[i][1] * 3] > threshold) {
int x_start = int(results[batchid].keypoints[edge[i][0] * 3 + 1]);
int y_start = int(results[batchid].keypoints[edge[i][0] * 3 + 2]);
int x_end = int(results[batchid].keypoints[edge[i][1] * 3 + 1]);
int y_end = int(results[batchid].keypoints[edge[i][1] * 3 + 2]);
cv::line(vis_img,
cv::Point2d(x_start, y_start),
cv::Point2d(x_end, y_end),
colormap[i],
1);
}
}
}
return vis_img;
}
void KeyPointDetector::Postprocess(std::vector<float>& output,
std::vector<uint64_t>& output_shape,
std::vector<float>& idxout,
std::vector<uint64_t>& idx_shape,
std::vector<KeyPointResult>* result,
std::vector<std::vector<float>>& center_bs,
std::vector<std::vector<float>>& scale_bs) {
std::vector<float> preds(output_shape[1] * 3, 0);
for (int batchid = 0; batchid < output_shape[0]; batchid++) {
get_final_preds(output,
output_shape,
idxout,
idx_shape,
center_bs[batchid],
scale_bs[batchid],
preds,
batchid,
this->use_dark());
KeyPointResult result_item;
result_item.num_joints = output_shape[1];
result_item.keypoints.clear();
for (int i = 0; i < output_shape[1]; i++) {
result_item.keypoints.emplace_back(preds[i * 3]);
result_item.keypoints.emplace_back(preds[i * 3 + 1]);
result_item.keypoints.emplace_back(preds[i * 3 + 2]);
}
result->push_back(result_item);
}
}
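// Predict: resize each cropped image to (in_w, in_h), copy the batch into
// the input blob, run inference, then read the heatmap ("conv2d_441.tmp_1")
// and argmax index ("argmax_0.tmp_0") outputs for post-processing.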
void KeyPointDetector::Predict(const std::vector<cv::Mat> imgs,
std::vector<std::vector<float>>& center_bs,
std::vector<std::vector<float>>& scale_bs,
std::vector<KeyPointResult>* result) {
int batch_size = imgs.size();
auto insize = 3 * in_h * in_w;
InferenceEngine::Blob::Ptr input_blob = infer_request_.GetBlob(input_name_);
// Preprocess image
InferenceEngine::MemoryBlob::Ptr mblob =
InferenceEngine::as<InferenceEngine::MemoryBlob>(input_blob);
if (!mblob) {
THROW_IE_EXCEPTION
<< "We expect the input blob to be inherited from MemoryBlob in "
<< "KeyPointDetector::Predict, but we were not able to cast it to MemoryBlob";
}
auto mblobHolder = mblob->wmap();
float* blob_data = mblobHolder.as<float*>();
cv::Mat resized_im;
for (int bs_idx = 0; bs_idx < batch_size; bs_idx++) {
cv::Mat im = imgs.at(bs_idx);
cv::resize(im, resized_im, cv::Size(in_w, in_h));
for (size_t c = 0; c < 3; c++) {
for (size_t h = 0; h < in_h; h++) {
for (size_t w = 0; w < in_w; w++) {
blob_data[bs_idx * insize + c * in_w * in_h + h * in_w + w] =
(float)resized_im.at<cv::Vec3b>(h, w)[c];
}
}
}
}
// Run predictor
auto inference_start = std::chrono::steady_clock::now();
// do inference
infer_request_.Infer();
InferenceEngine::Blob::Ptr output_blob =
infer_request_.GetBlob("conv2d_441.tmp_1");
auto output_shape = output_blob->getTensorDesc().getDims();
InferenceEngine::MemoryBlob::Ptr moutput =
InferenceEngine::as<InferenceEngine::MemoryBlob>(output_blob);
if (moutput) {
// locked memory holder should be alive all time while access to its
// buffer happens
auto minputHolder = moutput->rmap();
auto data = minputHolder.as<const InferenceEngine::PrecisionTrait<
InferenceEngine::Precision::FP32>::value_type*>();
// Calculate output length
int output_size = 1;
for (int j = 0; j < output_shape.size(); ++j) {
output_size *= output_shape[j];
}
output_data_.resize(output_size);
std::copy_n(data, output_size, output_data_.data());
}
InferenceEngine::Blob::Ptr output_blob2 =
infer_request_.GetBlob("argmax_0.tmp_0");
auto idx_shape = output_blob2->getTensorDesc().getDims();
InferenceEngine::MemoryBlob::Ptr moutput2 =
InferenceEngine::as<InferenceEngine::MemoryBlob>(output_blob2);
if (moutput2) {
// locked memory holder should be alive all time while access to its
// buffer happens
auto minputHolder = moutput2->rmap();
// Original I64 precision was converted to I32
auto data = minputHolder.as<const InferenceEngine::PrecisionTrait<
InferenceEngine::Precision::FP32>::value_type*>();
// Calculate output length
int output_size = 1;
for (int j = 0; j < idx_shape.size(); ++j) {
output_size *= idx_shape[j];
}
idx_data_.resize(output_size);
std::copy_n(data, output_size, idx_data_.data());
}
auto inference_end = std::chrono::steady_clock::now();
std::chrono::duration<double> elapsed = inference_end - inference_start;
printf("keypoint inference time: %f s\n", elapsed.count());
// Postprocessing result
Postprocess(output_data_,
output_shape,
idx_data_,
idx_shape,
result,
center_bs,
scale_bs);
}
} // namespace PaddleDetection
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <ctime>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <inference_engine.hpp>
#include "keypoint_postprocess.h"
namespace PaddleDetection {
// Object KeyPoint Result
struct KeyPointResult {
// Keypoints: shape(N x 3); N: number of Joints; 3: x,y,conf
std::vector<float> keypoints;
int num_joints = -1;
};
// Visualize KeyPoint Results
cv::Mat VisualizeKptsResult(const cv::Mat& img,
const std::vector<KeyPointResult>& results,
const std::vector<int>& colormap,
float threshold = 0.2);
class KeyPointDetector {
public:
explicit KeyPointDetector(const std::string& model_path,
int input_height = 256,
int input_width = 192,
float score_threshold = 0.3,
const int batch_size = 1,
bool use_dark = true) {
use_dark_ = use_dark;
in_w = input_width;
in_h = input_height;
threshold_ = score_threshold;
InferenceEngine::Core ie;
auto model = ie.ReadNetwork(model_path);
// prepare input settings
InferenceEngine::InputsDataMap inputs_map(model.getInputsInfo());
input_name_ = inputs_map.begin()->first;
InferenceEngine::InputInfo::Ptr input_info = inputs_map.begin()->second;
// prepare output settings
    InferenceEngine::OutputsDataMap outputs_map(model.getOutputsInfo());
    // Both outputs (heatmap and argmax indices) are read back as FP32.
    for (auto& output_info : outputs_map) {
      output_info.second->setPrecision(InferenceEngine::Precision::FP32);
    }
// get network
network_ = ie.LoadNetwork(model, "CPU");
infer_request_ = network_.CreateInferRequest();
}
// Load Paddle inference model
  void LoadModel(std::string model_file, int num_threads);
// Run predictor
  void Predict(const std::vector<cv::Mat>& imgs,
std::vector<std::vector<float>>& center,
std::vector<std::vector<float>>& scale,
std::vector<KeyPointResult>* result = nullptr);
bool use_dark() { return this->use_dark_; }
  inline float get_threshold() { return threshold_; }
  // Defaults match the constructor's input_width/input_height (192x256).
  int in_w = 192;
  int in_h = 256;
private:
// Postprocess result
void Postprocess(std::vector<float>& output,
std::vector<uint64_t>& output_shape,
std::vector<float>& idxout,
std::vector<uint64_t>& idx_shape,
std::vector<KeyPointResult>* result,
std::vector<std::vector<float>>& center,
std::vector<std::vector<float>>& scale);
std::vector<float> output_data_;
std::vector<float> idx_data_;
float threshold_;
bool use_dark_;
InferenceEngine::ExecutableNetwork network_;
InferenceEngine::InferRequest infer_request_;
std::string input_name_;
};
} // namespace PaddleDetection
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "keypoint_postprocess.h"
#define PI 3.1415926535
#define HALF_CIRCLE_DEGREE 180
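// The third point for cv::getAffineTransform: a plus the vector (a - b)
// rotated by 90 degrees, so the three source/destination points are never
// collinear.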
cv::Point2f get_3rd_point(cv::Point2f& a, cv::Point2f& b) {
cv::Point2f direct{a.x - b.x, a.y - b.y};
return cv::Point2f(a.x - direct.y, a.y + direct.x);
}
std::vector<float> get_dir(float src_point_x,
float src_point_y,
float rot_rad) {
float sn = sin(rot_rad);
float cs = cos(rot_rad);
std::vector<float> src_result{0.0, 0.0};
src_result[0] = src_point_x * cs - src_point_y * sn;
src_result[1] = src_point_x * sn + src_point_y * cs;
return src_result;
}
void affine_tranform(
float pt_x, float pt_y, cv::Mat& trans, std::vector<float>& preds, int p) {
double new1[3] = {pt_x, pt_y, 1.0};
cv::Mat new_pt(3, 1, trans.type(), new1);
cv::Mat w = trans * new_pt;
preds[p * 3 + 1] = static_cast<float>(w.at<double>(0, 0));
preds[p * 3 + 2] = static_cast<float>(w.at<double>(1, 0));
}
void get_affine_transform(std::vector<float>& center,
std::vector<float>& scale,
float rot,
std::vector<int>& output_size,
cv::Mat& trans,
int inv) {
float src_w = scale[0];
float dst_w = static_cast<float>(output_size[0]);
float dst_h = static_cast<float>(output_size[1]);
float rot_rad = rot * PI / HALF_CIRCLE_DEGREE;
std::vector<float> src_dir = get_dir(-0.5 * src_w, 0, rot_rad);
std::vector<float> dst_dir{static_cast<float>(-0.5) * dst_w, 0.0};
cv::Point2f srcPoint2f[3], dstPoint2f[3];
srcPoint2f[0] = cv::Point2f(center[0], center[1]);
srcPoint2f[1] = cv::Point2f(center[0] + src_dir[0], center[1] + src_dir[1]);
srcPoint2f[2] = get_3rd_point(srcPoint2f[0], srcPoint2f[1]);
dstPoint2f[0] = cv::Point2f(dst_w * 0.5, dst_h * 0.5);
dstPoint2f[1] =
cv::Point2f(dst_w * 0.5 + dst_dir[0], dst_h * 0.5 + dst_dir[1]);
dstPoint2f[2] = get_3rd_point(dstPoint2f[0], dstPoint2f[1]);
if (inv == 0) {
trans = cv::getAffineTransform(srcPoint2f, dstPoint2f);
} else {
trans = cv::getAffineTransform(dstPoint2f, srcPoint2f);
}
}
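// transform_preds maps heatmap-space coordinates back to the original image.
// In the default (non-affine) branch each joint is rescaled linearly:
//   x_img = scale_w / heatmap_w * x_heat + (center_x - scale_w / 2)
//   y_img = scale_h / heatmap_h * y_heat + (center_y - scale_h / 2)
// i.e. the crop described by (center, scale) is stretched over the heatmap.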
void transform_preds(std::vector<float>& coords,
                     std::vector<float>& center,
                     std::vector<float>& scale,
                     std::vector<int>& output_size,
                     std::vector<uint64_t>& dim,
                     std::vector<float>& target_coords,
                     bool affine) {
if (affine) {
cv::Mat trans(2, 3, CV_64FC1);
get_affine_transform(center, scale, 0, output_size, trans, 1);
for (int p = 0; p < dim[1]; ++p) {
affine_tranform(
coords[p * 2], coords[p * 2 + 1], trans, target_coords, p);
}
} else {
float heat_w = static_cast<float>(output_size[0]);
float heat_h = static_cast<float>(output_size[1]);
float x_scale = scale[0] / heat_w;
float y_scale = scale[1] / heat_h;
float offset_x = center[0] - scale[0] / 2.;
float offset_y = center[1] - scale[1] / 2.;
for (int i = 0; i < dim[1]; i++) {
target_coords[i * 3 + 1] = x_scale * coords[i * 2] + offset_x;
target_coords[i * 3 + 2] = y_scale * coords[i * 2 + 1] + offset_y;
}
}
}
// only for batchsize == 1
void get_max_preds(std::vector<float>& heatmap,
std::vector<int>& dim,
std::vector<float>& preds,
std::vector<float>& maxvals,
int batchid,
int joint_idx) {
int num_joints = dim[1];
int width = dim[3];
std::vector<int> idx;
idx.resize(num_joints * 2);
for (int j = 0; j < dim[1]; j++) {
float* index = &(
heatmap[batchid * num_joints * dim[2] * dim[3] + j * dim[2] * dim[3]]);
float* end = index + dim[2] * dim[3];
float* max_dis = std::max_element(index, end);
auto max_id = std::distance(index, max_dis);
maxvals[j] = *max_dis;
if (*max_dis > 0) {
preds[j * 2] = static_cast<float>(max_id % width);
preds[j * 2 + 1] = static_cast<float>(max_id / width);
}
}
}
void dark_parse(std::vector<float>& heatmap,
std::vector<uint64_t>& dim,
std::vector<float>& coords,
int px,
int py,
int index,
int ch) {
  /* DARK postprocessing, Zhang et al., "Distribution-Aware Coordinate
  Representation for Human Pose Estimation" (CVPR 2020).
  1) offset = -hessian.inv() * derivative
  2) dx = (heatmap[x+1] - heatmap[x-1]) / 2
  3) dxx = (dx[x+1] - dx[x-1]) / 2
  4) derivative = Mat([dx, dy])
  5) hessian = Mat([[dxx, dxy], [dxy, dyy]])
  */
std::vector<float>::const_iterator first1 = heatmap.begin() + index;
std::vector<float>::const_iterator last1 =
heatmap.begin() + index + dim[2] * dim[3];
std::vector<float> heatmap_ch(first1, last1);
cv::Mat heatmap_mat = cv::Mat(heatmap_ch).reshape(0, dim[2]);
heatmap_mat.convertTo(heatmap_mat, CV_32FC1);
cv::GaussianBlur(heatmap_mat, heatmap_mat, cv::Size(3, 3), 0, 0);
heatmap_mat = heatmap_mat.reshape(1, 1);
heatmap_ch = std::vector<float>(heatmap_mat.reshape(1, 1));
float epsilon = 1e-10;
  // sample the heatmap to get values around the target location
float xy = log(fmax(heatmap_ch[py * dim[3] + px], epsilon));
float xr = log(fmax(heatmap_ch[py * dim[3] + px + 1], epsilon));
float xl = log(fmax(heatmap_ch[py * dim[3] + px - 1], epsilon));
float xr2 = log(fmax(heatmap_ch[py * dim[3] + px + 2], epsilon));
float xl2 = log(fmax(heatmap_ch[py * dim[3] + px - 2], epsilon));
float yu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px], epsilon));
float yd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px], epsilon));
float yu2 = log(fmax(heatmap_ch[(py + 2) * dim[3] + px], epsilon));
float yd2 = log(fmax(heatmap_ch[(py - 2) * dim[3] + px], epsilon));
float xryu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px + 1], epsilon));
float xryd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px + 1], epsilon));
float xlyu = log(fmax(heatmap_ch[(py + 1) * dim[3] + px - 1], epsilon));
float xlyd = log(fmax(heatmap_ch[(py - 1) * dim[3] + px - 1], epsilon));
// compute dx/dy and dxx/dyy with sampled values
float dx = 0.5 * (xr - xl);
float dy = 0.5 * (yu - yd);
float dxx = 0.25 * (xr2 - 2 * xy + xl2);
float dxy = 0.25 * (xryu - xryd - xlyu + xlyd);
float dyy = 0.25 * (yu2 - 2 * xy + yd2);
  // Finally get the offset from the derivative and the Hessian, assembled
  // from dx/dy and dxx/dyy.
  if (dxx * dyy - dxy * dxy != 0) {
    float M[2][2] = {{dxx, dxy}, {dxy, dyy}};
    float D[2] = {dx, dy};
    cv::Mat hessian(2, 2, CV_32F, M);
    cv::Mat derivative(2, 1, CV_32F, D);
    cv::Mat offset = -hessian.inv() * derivative;
    coords[ch * 2] += offset.at<float>(0, 0);
    coords[ch * 2 + 1] += offset.at<float>(1, 0);
  }
}
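// get_final_preds decodes one batch item: for each joint the argmax index
// from the model becomes integer heatmap coordinates (x = idx % W,
// y = idx / W), refined either by the DARK Taylor expansion (dark_parse) or
// by a quarter-pixel shift towards the higher neighbour, then mapped back to
// image space with transform_preds. preds holds (conf, x, y) per joint.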
void get_final_preds(std::vector<float>& heatmap,
std::vector<uint64_t>& dim,
std::vector<float>& idxout,
std::vector<uint64_t>& idxdim,
std::vector<float>& center,
std::vector<float> scale,
std::vector<float>& preds,
int batchid,
bool DARK) {
std::vector<float> coords;
coords.resize(dim[1] * 2);
int heatmap_height = dim[2];
int heatmap_width = dim[3];
for (int j = 0; j < dim[1]; ++j) {
int index = (batchid * dim[1] + j) * dim[2] * dim[3];
int idx = int(idxout[batchid * dim[1] + j]);
preds[j * 3] = heatmap[index + idx];
coords[j * 2] = idx % heatmap_width;
coords[j * 2 + 1] = idx / heatmap_width;
int px = int(coords[j * 2] + 0.5);
int py = int(coords[j * 2 + 1] + 0.5);
if (DARK && px > 1 && px < heatmap_width - 2 && py > 1 &&
py < heatmap_height - 2) {
dark_parse(heatmap, dim, coords, px, py, index, j);
} else {
      if (px > 0 && px < heatmap_width - 1) {
        float diff_x = heatmap[index + py * dim[3] + px + 1] -
                       heatmap[index + py * dim[3] + px - 1];
        // Quarter-pixel shift towards the higher neighbour. The parentheses
        // matter: without them "?:" yields +1 or -0.25 instead of +/-0.25.
        coords[j * 2] += (diff_x > 0 ? 1 : -1) * 0.25;
      }
      if (py > 0 && py < heatmap_height - 1) {
        float diff_y = heatmap[index + (py + 1) * dim[3] + px] -
                       heatmap[index + (py - 1) * dim[3] + px];
        coords[j * 2 + 1] += (diff_y > 0 ? 1 : -1) * 0.25;
      }
}
}
std::vector<int> img_size{heatmap_width, heatmap_height};
transform_preds(coords, center, scale, img_size, dim, preds);
}
void CropImg(cv::Mat& img,
cv::Mat& crop_img,
std::vector<int>& area,
std::vector<float>& center,
std::vector<float>& scale,
float expandratio) {
int crop_x1 = std::max(0, area[0]);
int crop_y1 = std::max(0, area[1]);
int crop_x2 = std::min(img.cols - 1, area[2]);
int crop_y2 = std::min(img.rows - 1, area[3]);
int center_x = (crop_x1 + crop_x2) / 2.;
int center_y = (crop_y1 + crop_y2) / 2.;
int half_h = (crop_y2 - crop_y1) / 2.;
int half_w = (crop_x2 - crop_x1) / 2.;
if (half_h * 3 > half_w * 4) {
half_w = static_cast<int>(half_h * 0.75);
} else {
half_h = static_cast<int>(half_w * 4 / 3);
}
crop_x1 =
std::max(0, center_x - static_cast<int>(half_w * (1 + expandratio)));
crop_y1 =
std::max(0, center_y - static_cast<int>(half_h * (1 + expandratio)));
crop_x2 = std::min(img.cols - 1,
static_cast<int>(center_x + half_w * (1 + expandratio)));
crop_y2 = std::min(img.rows - 1,
static_cast<int>(center_y + half_h * (1 + expandratio)));
crop_img =
img(cv::Range(crop_y1, crop_y2 + 1), cv::Range(crop_x1, crop_x2 + 1));
center.clear();
center.emplace_back((crop_x1 + crop_x2) / 2);
center.emplace_back((crop_y1 + crop_y2) / 2);
scale.clear();
scale.emplace_back((crop_x2 - crop_x1));
scale.emplace_back((crop_y2 - crop_y1));
}
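// CropImg expands the detected person box to a fixed 3:4 (w:h) aspect ratio
// plus an expandratio margin on each side, matching the 192x256 input of the
// keypoint model, and returns the (center, scale) needed to map keypoints
// back to the full image.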
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <vector>
cv::Point2f get_3rd_point(cv::Point2f& a, cv::Point2f& b);
std::vector<float> get_dir(float src_point_x, float src_point_y, float rot_rad);
void affine_tranform(float pt_x,
                     float pt_y,
                     cv::Mat& trans,
                     std::vector<float>& preds,
                     int p);
void get_affine_transform(std::vector<float>& center,
                          std::vector<float>& scale,
                          float rot,
                          std::vector<int>& output_size,
                          cv::Mat& trans,
                          int inv);
void transform_preds(std::vector<float>& coords,
                     std::vector<float>& center,
                     std::vector<float>& scale,
                     std::vector<int>& output_size,
                     std::vector<uint64_t>& dim,
                     std::vector<float>& target_coords,
                     bool affine = false);
void box_to_center_scale(std::vector<int>& box,
int width,
int height,
std::vector<float>& center,
std::vector<float>& scale);
void get_max_preds(std::vector<float>& heatmap,
std::vector<int>& dim,
std::vector<float>& preds,
std::vector<float>& maxvals,
int batchid,
int joint_idx);
void get_final_preds(std::vector<float>& heatmap,
std::vector<uint64_t>& dim,
std::vector<float>& idxout,
std::vector<uint64_t>& idxdim,
std::vector<float>& center,
std::vector<float> scale,
std::vector<float>& preds,
int batchid,
bool DARK = true);
void CropImg(cv::Mat& img,
cv::Mat& crop_img,
std::vector<int>& area,
std::vector<float>& center,
std::vector<float>& scale,
float expandratio = 0.25);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// reference from https://github.com/RangiLyu/nanodet
#include <cfloat>   // DBL_MAX in benchmark()
#include <chrono>   // timing in benchmark()
#include <cstdio>
#include <cstring>  // memcpy in resize_uniform()
#include <iostream>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#define image_size 416
#include "keypoint_detector.h"
#include "picodet_openvino.h"
using namespace PaddleDetection;
struct object_rect {
int x;
int y;
int width;
int height;
};
int resize_uniform(cv::Mat& src,
cv::Mat& dst,
cv::Size dst_size,
object_rect& effect_area) {
int w = src.cols;
int h = src.rows;
int dst_w = dst_size.width;
int dst_h = dst_size.height;
dst = cv::Mat(cv::Size(dst_w, dst_h), CV_8UC3, cv::Scalar(0));
float ratio_src = w * 1.0 / h;
float ratio_dst = dst_w * 1.0 / dst_h;
int tmp_w = 0;
int tmp_h = 0;
if (ratio_src > ratio_dst) {
tmp_w = dst_w;
tmp_h = floor((dst_w * 1.0 / w) * h);
} else if (ratio_src < ratio_dst) {
tmp_h = dst_h;
tmp_w = floor((dst_h * 1.0 / h) * w);
} else {
cv::resize(src, dst, dst_size);
effect_area.x = 0;
effect_area.y = 0;
effect_area.width = dst_w;
effect_area.height = dst_h;
return 0;
}
cv::Mat tmp;
cv::resize(src, tmp, cv::Size(tmp_w, tmp_h));
if (tmp_w != dst_w) {
int index_w = floor((dst_w - tmp_w) / 2.0);
for (int i = 0; i < dst_h; i++) {
memcpy(dst.data + i * dst_w * 3 + index_w * 3,
tmp.data + i * tmp_w * 3,
tmp_w * 3);
}
effect_area.x = index_w;
effect_area.y = 0;
effect_area.width = tmp_w;
effect_area.height = tmp_h;
} else if (tmp_h != dst_h) {
int index_h = floor((dst_h - tmp_h) / 2.0);
memcpy(dst.data + index_h * dst_w * 3, tmp.data, tmp_w * tmp_h * 3);
effect_area.x = 0;
effect_area.y = index_h;
effect_area.width = tmp_w;
effect_area.height = tmp_h;
} else {
printf("error\n");
}
return 0;
}
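// resize_uniform letterboxes src into a dst_size canvas: the image is scaled
// with its aspect ratio preserved and centered, and effect_area records the
// region actually covered by image pixels so that detections can be mapped
// back to the original frame (see coordsback and draw_bboxes).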
const int color_list[80][3] = {
{216, 82, 24}, {236, 176, 31}, {125, 46, 141}, {118, 171, 47},
{76, 189, 237}, {238, 19, 46}, {76, 76, 76}, {153, 153, 153},
{255, 0, 0}, {255, 127, 0}, {190, 190, 0}, {0, 255, 0},
{0, 0, 255}, {170, 0, 255}, {84, 84, 0}, {84, 170, 0},
{84, 255, 0}, {170, 84, 0}, {170, 170, 0}, {170, 255, 0},
{255, 84, 0}, {255, 170, 0}, {255, 255, 0}, {0, 84, 127},
{0, 170, 127}, {0, 255, 127}, {84, 0, 127}, {84, 84, 127},
{84, 170, 127}, {84, 255, 127}, {170, 0, 127}, {170, 84, 127},
{170, 170, 127}, {170, 255, 127}, {255, 0, 127}, {255, 84, 127},
{255, 170, 127}, {255, 255, 127}, {0, 84, 255}, {0, 170, 255},
{0, 255, 255}, {84, 0, 255}, {84, 84, 255}, {84, 170, 255},
{84, 255, 255}, {170, 0, 255}, {170, 84, 255}, {170, 170, 255},
{170, 255, 255}, {255, 0, 255}, {255, 84, 255}, {255, 170, 255},
{42, 0, 0}, {84, 0, 0}, {127, 0, 0}, {170, 0, 0},
{212, 0, 0}, {255, 0, 0}, {0, 42, 0}, {0, 84, 0},
{0, 127, 0}, {0, 170, 0}, {0, 212, 0}, {0, 255, 0},
{0, 0, 42}, {0, 0, 84}, {0, 0, 127}, {0, 0, 170},
{0, 0, 212}, {0, 0, 255}, {0, 0, 0}, {36, 36, 36},
{72, 72, 72}, {109, 109, 109}, {145, 145, 145}, {182, 182, 182},
{218, 218, 218}, {0, 113, 188}, {80, 182, 188}, {127, 127, 0},
};
void draw_bboxes(const cv::Mat& bgr,
const std::vector<BoxInfo>& bboxes,
object_rect effect_roi) {
static const char* class_names[] = {
"person", "bicycle", "car",
"motorcycle", "airplane", "bus",
"train", "truck", "boat",
"traffic light", "fire hydrant", "stop sign",
"parking meter", "bench", "bird",
"cat", "dog", "horse",
"sheep", "cow", "elephant",
"bear", "zebra", "giraffe",
"backpack", "umbrella", "handbag",
"tie", "suitcase", "frisbee",
"skis", "snowboard", "sports ball",
"kite", "baseball bat", "baseball glove",
"skateboard", "surfboard", "tennis racket",
"bottle", "wine glass", "cup",
"fork", "knife", "spoon",
"bowl", "banana", "apple",
"sandwich", "orange", "broccoli",
"carrot", "hot dog", "pizza",
"donut", "cake", "chair",
"couch", "potted plant", "bed",
"dining table", "toilet", "tv",
"laptop", "mouse", "remote",
"keyboard", "cell phone", "microwave",
"oven", "toaster", "sink",
"refrigerator", "book", "clock",
"vase", "scissors", "teddy bear",
"hair drier", "toothbrush"};
cv::Mat image = bgr.clone();
int src_w = image.cols;
int src_h = image.rows;
int dst_w = effect_roi.width;
int dst_h = effect_roi.height;
float width_ratio = (float)src_w / (float)dst_w;
float height_ratio = (float)src_h / (float)dst_h;
for (size_t i = 0; i < bboxes.size(); i++) {
const BoxInfo& bbox = bboxes[i];
cv::Scalar color = cv::Scalar(color_list[bbox.label][0],
color_list[bbox.label][1],
color_list[bbox.label][2]);
cv::rectangle(image,
cv::Rect(cv::Point((bbox.x1 - effect_roi.x) * width_ratio,
(bbox.y1 - effect_roi.y) * height_ratio),
cv::Point((bbox.x2 - effect_roi.x) * width_ratio,
(bbox.y2 - effect_roi.y) * height_ratio)),
color);
    char text[256];
    snprintf(text, sizeof(text), "%s %.1f%%",
             class_names[bbox.label], bbox.score * 100);
int baseLine = 0;
cv::Size label_size =
cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine);
int x = (bbox.x1 - effect_roi.x) * width_ratio;
int y =
(bbox.y1 - effect_roi.y) * height_ratio - label_size.height - baseLine;
if (y < 0) y = 0;
if (x + label_size.width > image.cols) x = image.cols - label_size.width;
cv::rectangle(
image,
cv::Rect(cv::Point(x, y),
cv::Size(label_size.width, label_size.height + baseLine)),
color,
-1);
cv::putText(image,
text,
cv::Point(x, y + label_size.height),
cv::FONT_HERSHEY_SIMPLEX,
0.4,
cv::Scalar(255, 255, 255));
}
cv::imwrite("../predict.jpg", image);
}
std::vector<BoxInfo> coordsback(const cv::Mat image,
const object_rect effect_roi,
const std::vector<BoxInfo>& bboxes) {
int src_w = image.cols;
int src_h = image.rows;
int dst_w = effect_roi.width;
int dst_h = effect_roi.height;
float width_ratio = (float)src_w / (float)dst_w;
float height_ratio = (float)src_h / (float)dst_h;
std::vector<BoxInfo> bboxes_oimg;
for (int i = 0; i < bboxes.size(); i++) {
auto bbox = bboxes[i];
bbox.x1 = (bbox.x1 - effect_roi.x) * width_ratio;
bbox.y1 = (bbox.y1 - effect_roi.y) * height_ratio;
bbox.x2 = (bbox.x2 - effect_roi.x) * width_ratio;
bbox.y2 = (bbox.y2 - effect_roi.y) * height_ratio;
bboxes_oimg.emplace_back(bbox);
}
return bboxes_oimg;
}
void image_infer_kpts(KeyPointDetector* kpts_detector,
cv::Mat image,
const object_rect effect_roi,
const std::vector<BoxInfo>& results,
std::string img_name = "kpts_vis",
bool save_img = true) {
std::vector<cv::Mat> cropimgs;
std::vector<std::vector<float>> center_bs;
std::vector<std::vector<float>> scale_bs;
std::vector<KeyPointResult> kpts_results;
auto results_oimg = coordsback(image, effect_roi, results);
for (int i = 0; i < results_oimg.size(); i++) {
auto rect = results_oimg[i];
if (rect.label == 0) {
cv::Mat cropimg;
std::vector<float> center, scale;
std::vector<int> area = {static_cast<int>(rect.x1),
static_cast<int>(rect.y1),
static_cast<int>(rect.x2),
static_cast<int>(rect.y2)};
CropImg(image, cropimg, area, center, scale);
cropimgs.emplace_back(cropimg);
center_bs.emplace_back(center);
scale_bs.emplace_back(scale);
}
if (cropimgs.size() == 1 ||
(cropimgs.size() > 0 && i == results_oimg.size() - 1)) {
kpts_detector->Predict(cropimgs, center_bs, scale_bs, &kpts_results);
cropimgs.clear();
center_bs.clear();
scale_bs.clear();
}
}
std::vector<int> compression_params;
compression_params.push_back(cv::IMWRITE_JPEG_QUALITY);
compression_params.push_back(95);
std::string kpts_savepath =
"keypoint_" + img_name.substr(img_name.find_last_of('/') + 1);
cv::Mat kpts_vis_img =
VisualizeKptsResult(image, kpts_results, {0, 255, 0}, 0.1);
if (save_img) {
cv::imwrite(kpts_savepath, kpts_vis_img, compression_params);
printf("Visualized output saved as %s\n", kpts_savepath.c_str());
} else {
cv::imshow("image", kpts_vis_img);
}
}
int image_demo(PicoDet& detector,
KeyPointDetector* kpts_detector,
const char* imagepath) {
std::vector<std::string> filenames;
cv::glob(imagepath, filenames, false);
for (auto img_name : filenames) {
cv::Mat image = cv::imread(img_name);
    if (image.empty()) {
      fprintf(stderr, "cv::imread %s failed\n", img_name.c_str());
      return -1;
    }
object_rect effect_roi;
cv::Mat resized_img;
resize_uniform(
image, resized_img, cv::Size(image_size, image_size), effect_roi);
auto results = detector.detect(resized_img, 0.4, 0.5);
if (kpts_detector) {
image_infer_kpts(kpts_detector, image, effect_roi, results, img_name);
}
}
return 0;
}
int webcam_demo(PicoDet& detector,
KeyPointDetector* kpts_detector,
int cam_id) {
cv::Mat image;
cv::VideoCapture cap(cam_id);
while (true) {
    cap >> image;
    // Stop when the camera fails to deliver a frame.
    if (image.empty()) {
      break;
    }
    object_rect effect_roi;
cv::Mat resized_img;
resize_uniform(
image, resized_img, cv::Size(image_size, image_size), effect_roi);
auto results = detector.detect(resized_img, 0.4, 0.5);
if (kpts_detector) {
image_infer_kpts(kpts_detector, image, effect_roi, results, "", false);
}
}
return 0;
}
int video_demo(PicoDet& detector,
KeyPointDetector* kpts_detector,
const char* path) {
cv::Mat image;
cv::VideoCapture cap(path);
while (true) {
    cap >> image;
    // Stop at the end of the video stream.
    if (image.empty()) {
      break;
    }
    object_rect effect_roi;
cv::Mat resized_img;
resize_uniform(
image, resized_img, cv::Size(image_size, image_size), effect_roi);
auto results = detector.detect(resized_img, 0.4, 0.5);
if (kpts_detector) {
image_infer_kpts(kpts_detector, image, effect_roi, results, "", false);
}
}
return 0;
}
int benchmark(KeyPointDetector* kpts_detector) {
int loop_num = 100;
int warm_up = 8;
double time_min = DBL_MAX;
double time_max = -DBL_MAX;
double time_avg = 0;
cv::Mat image(256, 192, CV_8UC3, cv::Scalar(1, 1, 1));
std::vector<float> center = {128, 96};
std::vector<float> scale = {256, 192};
std::vector<cv::Mat> cropimgs = {image};
std::vector<std::vector<float>> center_bs = {center};
std::vector<std::vector<float>> scale_bs = {scale};
std::vector<KeyPointResult> kpts_results;
for (int i = 0; i < warm_up + loop_num; i++) {
auto start = std::chrono::steady_clock::now();
kpts_detector->Predict(cropimgs, center_bs, scale_bs, &kpts_results);
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> elapsed = end - start;
double time = elapsed.count();
if (i >= warm_up) {
time_min = (std::min)(time_min, time);
time_max = (std::max)(time_max, time);
time_avg += time;
}
}
time_avg /= loop_num;
fprintf(stderr,
"%20s min = %7.4f max = %7.4f avg = %7.4f\n",
"tinypose",
time_min,
time_max,
time_avg);
return 0;
}
int main(int argc, char** argv) {
if (argc != 3) {
    fprintf(stderr,
            "usage: %s [mode] [path]. \n For webcam, mode=0, path is cam id; \n "
            "For image demo, mode=1, path=xxx/xxx/*.jpg; \n For video, mode=2, "
            "path is the video file; \n For benchmark, mode=3, path=0.\n",
            argv[0]);
return -1;
}
std::cout << "start init model" << std::endl;
auto detector = PicoDet("./weight/picodet_m_416.xml");
auto kpts_detector =
new KeyPointDetector("./weight/tinypose256_git2-sim.xml", 256, 192);
std::cout << "success" << std::endl;
int mode = atoi(argv[1]);
switch (mode) {
case 0: {
int cam_id = atoi(argv[2]);
webcam_demo(detector, kpts_detector, cam_id);
break;
}
case 1: {
const char* images = argv[2];
image_demo(detector, kpts_detector, images);
break;
}
case 2: {
const char* path = argv[2];
video_demo(detector, kpts_detector, path);
break;
}
case 3: {
benchmark(kpts_detector);
break;
}
default: {
      fprintf(stderr,
              "usage: %s [mode] [path]. \n For webcam, mode=0, path is cam id; "
              "\n For image demo, mode=1, path=xxx/xxx/*.jpg; \n For video, "
              "mode=2, path is the video file; \n For benchmark, mode=3, path=0.\n",
              argv[0]);
break;
}
}
delete kpts_detector;
kpts_detector = nullptr;
}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_openvino
#include "picodet_openvino.h"
inline float fast_exp(float x) {
union {
uint32_t i;
float f;
} v{};
v.i = (1 << 23) * (1.4426950409 * x + 126.93490512f);
return v.f;
}
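// fast_exp is the Schraudolph approximation: exp(x) = 2^(x / ln 2) is built
// directly as a float bit pattern, with 1 / ln 2 = 1.4426950409 scaling the
// exponent field and 126.93490512 acting as a bias tuned to reduce the
// approximation error. It trades a little accuracy for speed in the sigmoid
// and softmax below.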
inline float sigmoid(float x) { return 1.0f / (1.0f + fast_exp(-x)); }
template <typename _Tp>
int activation_function_softmax(const _Tp* src, _Tp* dst, int length) {
const _Tp alpha = *std::max_element(src, src + length);
_Tp denominator{0};
for (int i = 0; i < length; ++i) {
dst[i] = fast_exp(src[i] - alpha);
denominator += dst[i];
}
for (int i = 0; i < length; ++i) {
dst[i] /= denominator;
}
return 0;
}
PicoDet::PicoDet(const char* model_path) {
InferenceEngine::Core ie;
InferenceEngine::CNNNetwork model = ie.ReadNetwork(model_path);
// prepare input settings
InferenceEngine::InputsDataMap inputs_map(model.getInputsInfo());
input_name_ = inputs_map.begin()->first;
InferenceEngine::InputInfo::Ptr input_info = inputs_map.begin()->second;
// prepare output settings
InferenceEngine::OutputsDataMap outputs_map(model.getOutputsInfo());
for (auto& output_info : outputs_map) {
output_info.second->setPrecision(InferenceEngine::Precision::FP32);
}
// get network
network_ = ie.LoadNetwork(model, "CPU");
infer_request_ = network_.CreateInferRequest();
}
PicoDet::~PicoDet() {}
void PicoDet::preprocess(cv::Mat& image, InferenceEngine::Blob::Ptr& blob) {
int img_w = image.cols;
int img_h = image.rows;
int channels = 3;
InferenceEngine::MemoryBlob::Ptr mblob =
InferenceEngine::as<InferenceEngine::MemoryBlob>(blob);
if (!mblob) {
    THROW_IE_EXCEPTION
        << "Expected the input blob to inherit from MemoryBlob, "
        << "but the cast to MemoryBlob failed";
}
auto mblobHolder = mblob->wmap();
float* blob_data = mblobHolder.as<float*>();
  for (int c = 0; c < channels; c++) {
    for (int h = 0; h < img_h; h++) {
      for (int w = 0; w < img_w; w++) {
blob_data[c * img_w * img_h + h * img_w + w] =
(float)image.at<cv::Vec3b>(h, w)[c];
}
}
}
}
std::vector<BoxInfo> PicoDet::detect(cv::Mat image,
float score_threshold,
float nms_threshold) {
InferenceEngine::Blob::Ptr input_blob = infer_request_.GetBlob(input_name_);
preprocess(image, input_blob);
// do inference
infer_request_.Infer();
// get output
std::vector<std::vector<BoxInfo>> results;
results.resize(this->num_class_);
for (const auto& head_info : this->heads_info_) {
const InferenceEngine::Blob::Ptr dis_pred_blob =
infer_request_.GetBlob(head_info.dis_layer);
const InferenceEngine::Blob::Ptr cls_pred_blob =
infer_request_.GetBlob(head_info.cls_layer);
auto mdis_pred =
InferenceEngine::as<InferenceEngine::MemoryBlob>(dis_pred_blob);
auto mdis_pred_holder = mdis_pred->rmap();
const float* dis_pred = mdis_pred_holder.as<const float*>();
auto mcls_pred =
InferenceEngine::as<InferenceEngine::MemoryBlob>(cls_pred_blob);
auto mcls_pred_holder = mcls_pred->rmap();
const float* cls_pred = mcls_pred_holder.as<const float*>();
this->decode_infer(
cls_pred, dis_pred, head_info.stride, score_threshold, results);
}
std::vector<BoxInfo> dets;
for (int i = 0; i < (int)results.size(); i++) {
this->nms(results[i], nms_threshold);
for (auto& box : results[i]) {
dets.push_back(box);
}
}
return dets;
}
void PicoDet::decode_infer(const float*& cls_pred,
const float*& dis_pred,
int stride,
float threshold,
std::vector<std::vector<BoxInfo>>& results) {
int feature_h = input_size_ / stride;
int feature_w = input_size_ / stride;
for (int idx = 0; idx < feature_h * feature_w; idx++) {
int row = idx / feature_w;
int col = idx % feature_w;
float score = 0;
int cur_label = 0;
for (int label = 0; label < num_class_; label++) {
if (cls_pred[idx * num_class_ + label] > score) {
score = cls_pred[idx * num_class_ + label];
cur_label = label;
}
}
if (score > threshold) {
const float* bbox_pred = dis_pred + idx * (reg_max_ + 1) * 4;
results[cur_label].push_back(
this->disPred2Bbox(bbox_pred, cur_label, score, col, row, stride));
}
}
}
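// disPred2Bbox converts GFL-style distribution outputs into a box: for each
// of the four sides, the reg_max_ + 1 logits are softmaxed and the
// expectation sum(j * p_j) gives the distance from the center point
// (ct_x, ct_y) in units of the stride.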
BoxInfo PicoDet::disPred2Bbox(
const float*& dfl_det, int label, float score, int x, int y, int stride) {
float ct_x = (x + 0.5) * stride;
float ct_y = (y + 0.5) * stride;
std::vector<float> dis_pred;
dis_pred.resize(4);
  for (int i = 0; i < 4; i++) {
    float dis = 0;
    // Expectation over the distribution bins for this side.
    std::vector<float> dis_after_sm(reg_max_ + 1);
    activation_function_softmax(
        dfl_det + i * (reg_max_ + 1), dis_after_sm.data(), reg_max_ + 1);
    for (int j = 0; j < reg_max_ + 1; j++) {
      dis += j * dis_after_sm[j];
    }
    dis *= stride;
    dis_pred[i] = dis;
  }
float xmin = (std::max)(ct_x - dis_pred[0], .0f);
float ymin = (std::max)(ct_y - dis_pred[1], .0f);
float xmax = (std::min)(ct_x + dis_pred[2], (float)this->input_size_);
float ymax = (std::min)(ct_y + dis_pred[3], (float)this->input_size_);
return BoxInfo{xmin, ymin, xmax, ymax, score, label};
}
void PicoDet::nms(std::vector<BoxInfo>& input_boxes, float NMS_THRESH) {
std::sort(input_boxes.begin(), input_boxes.end(), [](BoxInfo a, BoxInfo b) {
return a.score > b.score;
});
std::vector<float> vArea(input_boxes.size());
for (int i = 0; i < int(input_boxes.size()); ++i) {
vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1) *
(input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1);
}
for (int i = 0; i < int(input_boxes.size()); ++i) {
for (int j = i + 1; j < int(input_boxes.size());) {
float xx1 = (std::max)(input_boxes[i].x1, input_boxes[j].x1);
float yy1 = (std::max)(input_boxes[i].y1, input_boxes[j].y1);
float xx2 = (std::min)(input_boxes[i].x2, input_boxes[j].x2);
float yy2 = (std::min)(input_boxes[i].y2, input_boxes[j].y2);
float w = (std::max)(float(0), xx2 - xx1 + 1);
float h = (std::max)(float(0), yy2 - yy1 + 1);
float inter = w * h;
float ovr = inter / (vArea[i] + vArea[j] - inter);
if (ovr >= NMS_THRESH) {
input_boxes.erase(input_boxes.begin() + j);
vArea.erase(vArea.begin() + j);
} else {
j++;
}
}
}
}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// reference from https://github.com/RangiLyu/nanodet/tree/main/demo_openvino
#ifndef _PICODET_OPENVINO_H_
#define _PICODET_OPENVINO_H_
#include <inference_engine.hpp>
#include <opencv2/core.hpp>
#include <string>
#define image_size 416
typedef struct HeadInfo {
std::string cls_layer;
std::string dis_layer;
int stride;
} HeadInfo;
typedef struct BoxInfo {
float x1;
float y1;
float x2;
float y2;
float score;
int label;
} BoxInfo;
class PicoDet {
public:
PicoDet(const char *param);
~PicoDet();
InferenceEngine::ExecutableNetwork network_;
InferenceEngine::InferRequest infer_request_;
std::vector<HeadInfo> heads_info_{
// cls_pred|dis_pred|stride
{"transpose_0.tmp_0", "transpose_1.tmp_0", 8},
{"transpose_2.tmp_0", "transpose_3.tmp_0", 16},
{"transpose_4.tmp_0", "transpose_5.tmp_0", 32},
{"transpose_6.tmp_0", "transpose_7.tmp_0", 64},
};
std::vector<BoxInfo> detect(cv::Mat image, float score_threshold,
float nms_threshold);
private:
void preprocess(cv::Mat &image, InferenceEngine::Blob::Ptr &blob);
void decode_infer(const float *&cls_pred, const float *&dis_pred, int stride,
float threshold,
std::vector<std::vector<BoxInfo>> &results);
BoxInfo disPred2Bbox(const float *&dfl_det, int label, float score, int x,
int y, int stride);
static void nms(std::vector<BoxInfo> &result, float nms_threshold);
std::string input_name_;
int input_size_ = image_size;
int num_class_ = 80;
int reg_max_ = 7;
};
#endif
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import yaml
import argparse
import numpy as np
import glob
from onnxruntime import InferenceSession
from preprocess import Compose
# Global set of supported model architectures
SUPPORT_MODELS = {
'YOLO', 'PPYOLOE', 'YOLOX', 'YOLOv5', 'YOLOv6', 'YOLOv7', 'YOLOv8', 'RTMDet'
}
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--infer_cfg", type=str, help="infer_cfg.yml")
parser.add_argument(
'--onnx_file', type=str, default="model.onnx", help="onnx model file path")
parser.add_argument("--image_dir", type=str)
parser.add_argument("--image_file", type=str)
def get_test_images(infer_dir, infer_img):
"""
Get image path list in TEST mode
"""
assert infer_img is not None or infer_dir is not None, \
"--image_file or --image_dir should be set"
assert infer_img is None or os.path.isfile(infer_img), \
"{} is not a file".format(infer_img)
assert infer_dir is None or os.path.isdir(infer_dir), \
"{} is not a directory".format(infer_dir)
# infer_img has a higher priority
if infer_img and os.path.isfile(infer_img):
return [infer_img]
images = set()
infer_dir = os.path.abspath(infer_dir)
assert os.path.isdir(infer_dir), \
"infer_dir {} is not a directory".format(infer_dir)
exts = ['jpg', 'jpeg', 'png', 'bmp']
exts += [ext.upper() for ext in exts]
for ext in exts:
images.update(glob.glob('{}/*.{}'.format(infer_dir, ext)))
images = list(images)
assert len(images) > 0, "no image found in {}".format(infer_dir)
print("Found {} inference images in total.".format(len(images)))
return images
class PredictConfig(object):
"""set config of preprocess, postprocess and visualize
Args:
infer_config (str): path of infer_cfg.yml
"""
def __init__(self, infer_config):
# parsing Yaml config for Preprocess
with open(infer_config) as f:
yml_conf = yaml.safe_load(f)
self.check_model(yml_conf)
self.arch = yml_conf['arch']
self.preprocess_infos = yml_conf['Preprocess']
self.min_subgraph_size = yml_conf['min_subgraph_size']
self.label_list = yml_conf['label_list']
self.use_dynamic_shape = yml_conf['use_dynamic_shape']
self.draw_threshold = yml_conf.get("draw_threshold", 0.5)
self.mask = yml_conf.get("mask", False)
self.tracker = yml_conf.get("tracker", None)
self.nms = yml_conf.get("NMS", None)
self.fpn_stride = yml_conf.get("fpn_stride", None)
if self.arch == 'RCNN' and yml_conf.get('export_onnx', False):
print(
'The RCNN export model is used for ONNX and it only supports batch_size = 1'
)
self.print_config()
def check_model(self, yml_conf):
"""
Raises:
ValueError: loaded model not in supported model type
"""
for support_model in SUPPORT_MODELS:
if support_model in yml_conf['arch']:
return True
raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf[
'arch'], SUPPORT_MODELS))
def print_config(self):
print('----------- Model Configuration -----------')
print('%s: %s' % ('Model Arch', self.arch))
print('%s: ' % ('Transform Order'))
for op_info in self.preprocess_infos:
print('--%s: %s' % ('transform op', op_info['type']))
print('--------------------------------------------')
def predict_image(infer_config, predictor, img_list):
# load preprocess transforms
transforms = Compose(infer_config.preprocess_infos)
# predict image
for img_path in img_list:
inputs = transforms(img_path)
inputs_name = [var.name for var in predictor.get_inputs()]
inputs = {k: inputs[k][None, ] for k in inputs_name}
outputs = predictor.run(output_names=None, input_feed=inputs)
print("ONNXRuntime predict: ")
if infer_config.arch in ["HRNet"]:
print(np.array(outputs[0]))
else:
bboxes = np.array(outputs[0])
for bbox in bboxes:
if bbox[0] > -1 and bbox[1] > infer_config.draw_threshold:
print(f"{int(bbox[0])} {bbox[1]} "
f"{bbox[2]} {bbox[3]} {bbox[4]} {bbox[5]}")
if __name__ == '__main__':
FLAGS = parser.parse_args()
# load image list
img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file)
# load predictor
predictor = InferenceSession(FLAGS.onnx_file)
# load infer config
infer_config = PredictConfig(FLAGS.infer_cfg)
predict_image(infer_config, predictor, img_list)
import numpy as np
import cv2
import copy
def decode_image(img_path):
with open(img_path, 'rb') as f:
im_read = f.read()
data = np.frombuffer(im_read, dtype='uint8')
im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode
im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
img_info = {
"im_shape": np.array(
im.shape[:2], dtype=np.float32),
"scale_factor": np.array(
[1., 1.], dtype=np.float32)
}
return im, img_info
class Resize(object):
"""resize image by target_size and max_size
Args:
target_size (int): the target size of image
keep_ratio (bool): whether keep_ratio or not, default true
interp (int): method of resize
"""
def __init__(self, target_size, keep_ratio=True, interp=cv2.INTER_LINEAR):
if isinstance(target_size, int):
target_size = [target_size, target_size]
self.target_size = target_size
self.keep_ratio = keep_ratio
self.interp = interp
def __call__(self, im, im_info):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
assert len(self.target_size) == 2
assert self.target_size[0] > 0 and self.target_size[1] > 0
im_channel = im.shape[2]
im_scale_y, im_scale_x = self.generate_scale(im)
im = cv2.resize(
im,
None,
None,
fx=im_scale_x,
fy=im_scale_y,
interpolation=self.interp)
im_info['im_shape'] = np.array(im.shape[:2]).astype('float32')
im_info['scale_factor'] = np.array(
[im_scale_y, im_scale_x]).astype('float32')
return im, im_info
def generate_scale(self, im):
"""
Args:
im (np.ndarray): image (np.ndarray)
Returns:
im_scale_x: the resize ratio of X
im_scale_y: the resize ratio of Y
"""
origin_shape = im.shape[:2]
im_c = im.shape[2]
if self.keep_ratio:
im_size_min = np.min(origin_shape)
im_size_max = np.max(origin_shape)
target_size_min = np.min(self.target_size)
target_size_max = np.max(self.target_size)
im_scale = float(target_size_min) / float(im_size_min)
if np.round(im_scale * im_size_max) > target_size_max:
im_scale = float(target_size_max) / float(im_size_max)
im_scale_x = im_scale
im_scale_y = im_scale
else:
resize_h, resize_w = self.target_size
im_scale_y = resize_h / float(origin_shape[0])
im_scale_x = resize_w / float(origin_shape[1])
return im_scale_y, im_scale_x
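    # Worked example (keep_ratio=True, target_size=[640, 640], image 480x640):
    # im_scale = 640 / 480 = 1.333, but round(1.333 * 640) = 853 > 640, so the
    # scale is clamped to 640 / 640 = 1.0 and the long side never exceeds the
    # target.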
class NormalizeImage(object):
"""normalize image
Args:
mean (list): im - mean
std (list): im / std
is_scale (bool): whether need im / 255
norm_type (str): type in ['mean_std', 'none']
"""
def __init__(self, mean, std, is_scale=True, norm_type='mean_std'):
self.mean = mean
self.std = std
self.is_scale = is_scale
self.norm_type = norm_type
def __call__(self, im, im_info):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
im = im.astype(np.float32, copy=False)
if self.is_scale:
scale = 1.0 / 255.0
im *= scale
if self.norm_type == 'mean_std':
mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
std = np.array(self.std)[np.newaxis, np.newaxis, :]
im -= mean
im /= std
return im, im_info
class Permute(object):
"""permute image
Args:
to_bgr (bool): whether convert RGB to BGR
channel_first (bool): whether convert HWC to CHW
"""
def __init__(self, ):
super(Permute, self).__init__()
def __call__(self, im, im_info):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
im = im.transpose((2, 0, 1)).copy()
return im, im_info
class PadStride(object):
""" padding image for model with FPN, instead PadBatch(pad_to_stride) in original config
Args:
stride (bool): model with FPN need image shape % stride == 0
"""
def __init__(self, stride=0):
self.coarsest_stride = stride
def __call__(self, im, im_info):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
coarsest_stride = self.coarsest_stride
if coarsest_stride <= 0:
return im, im_info
im_c, im_h, im_w = im.shape
pad_h = int(np.ceil(float(im_h) / coarsest_stride) * coarsest_stride)
pad_w = int(np.ceil(float(im_w) / coarsest_stride) * coarsest_stride)
padding_im = np.zeros((im_c, pad_h, pad_w), dtype=np.float32)
padding_im[:, :im_h, :im_w] = im
return padding_im, im_info
class LetterBoxResize(object):
def __init__(self, target_size):
"""
Resize image to target size, convert normalized xywh to pixel xyxy
format ([x_center, y_center, width, height] -> [x0, y0, x1, y1]).
Args:
target_size (int|list): image target size.
"""
super(LetterBoxResize, self).__init__()
if isinstance(target_size, int):
target_size = [target_size, target_size]
self.target_size = target_size
def letterbox(self, img, height, width, color=(127.5, 127.5, 127.5)):
# letterbox: resize a rectangular image to a padded rectangular
shape = img.shape[:2] # [height, width]
ratio_h = float(height) / shape[0]
ratio_w = float(width) / shape[1]
ratio = min(ratio_h, ratio_w)
new_shape = (round(shape[1] * ratio),
round(shape[0] * ratio)) # [width, height]
padw = (width - new_shape[0]) / 2
padh = (height - new_shape[1]) / 2
top, bottom = round(padh - 0.1), round(padh + 0.1)
left, right = round(padw - 0.1), round(padw + 0.1)
img = cv2.resize(
img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border
img = cv2.copyMakeBorder(
img, top, bottom, left, right, cv2.BORDER_CONSTANT,
value=color) # padded rectangular
return img, ratio, padw, padh
def __call__(self, im, im_info):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
assert len(self.target_size) == 2
assert self.target_size[0] > 0 and self.target_size[1] > 0
height, width = self.target_size
h, w = im.shape[:2]
im, ratio, padw, padh = self.letterbox(im, height=height, width=width)
new_shape = [round(h * ratio), round(w * ratio)]
im_info['im_shape'] = np.array(new_shape, dtype=np.float32)
im_info['scale_factor'] = np.array([ratio, ratio], dtype=np.float32)
return im, im_info
class Pad(object):
def __init__(self, size, fill_value=[114.0, 114.0, 114.0]):
"""
Pad image to a specified size.
Args:
size (list[int]): image target size
fill_value (list[float]): rgb value of pad area, default (114.0, 114.0, 114.0)
"""
super(Pad, self).__init__()
if isinstance(size, int):
size = [size, size]
self.size = size
self.fill_value = fill_value
def __call__(self, im, im_info):
im_h, im_w = im.shape[:2]
h, w = self.size
if h == im_h and w == im_w:
im = im.astype(np.float32)
return im, im_info
canvas = np.ones((h, w, 3), dtype=np.float32)
canvas *= np.array(self.fill_value, dtype=np.float32)
canvas[0:im_h, 0:im_w, :] = im.astype(np.float32)
im = canvas
return im, im_info
def rotate_point(pt, angle_rad):
"""Rotate a point by an angle.
Args:
pt (list[float]): 2 dimensional point to be rotated
angle_rad (float): rotation angle by radian
Returns:
list[float]: Rotated point.
"""
assert len(pt) == 2
sn, cs = np.sin(angle_rad), np.cos(angle_rad)
new_x = pt[0] * cs - pt[1] * sn
new_y = pt[0] * sn + pt[1] * cs
rotated_pt = [new_x, new_y]
return rotated_pt
def _get_3rd_point(a, b):
"""To calculate the affine matrix, three pairs of points are required. This
function is used to get the 3rd point, given 2D points a & b.
The 3rd point is defined by rotating vector `a - b` by 90 degrees
anticlockwise, using b as the rotation center.
Args:
a (np.ndarray): point(x,y)
b (np.ndarray): point(x,y)
Returns:
np.ndarray: The 3rd point.
"""
assert len(a) == 2
assert len(b) == 2
direction = a - b
third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32)
return third_pt
def get_affine_transform(center,
input_size,
rot,
output_size,
shift=(0., 0.),
inv=False):
"""Get the affine transform matrix, given the center/scale/rot/output_size.
Args:
center (np.ndarray[2, ]): Center of the bounding box (x, y).
        input_size (np.ndarray[2, ] | int): Scale of the bounding box
            wrt [width, height].
rot (float): Rotation angle (degree).
output_size (np.ndarray[2, ]): Size of the destination heatmaps.
shift (0-100%): Shift translation ratio wrt the width/height.
Default (0., 0.).
inv (bool): Option to inverse the affine transform direction.
(inv=False: src->dst or inv=True: dst->src)
Returns:
np.ndarray: The transform matrix.
"""
assert len(center) == 2
assert len(output_size) == 2
assert len(shift) == 2
if not isinstance(input_size, (np.ndarray, list)):
input_size = np.array([input_size, input_size], dtype=np.float32)
scale_tmp = input_size
shift = np.array(shift)
src_w = scale_tmp[0]
dst_w = output_size[0]
dst_h = output_size[1]
rot_rad = np.pi * rot / 180
src_dir = rotate_point([0., src_w * -0.5], rot_rad)
dst_dir = np.array([0., dst_w * -0.5])
src = np.zeros((3, 2), dtype=np.float32)
src[0, :] = center + scale_tmp * shift
src[1, :] = center + src_dir + scale_tmp * shift
src[2, :] = _get_3rd_point(src[0, :], src[1, :])
dst = np.zeros((3, 2), dtype=np.float32)
dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :])
if inv:
trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
else:
trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))
return trans
class WarpAffine(object):
"""Warp affine the image
"""
def __init__(self,
keep_res=False,
pad=31,
input_h=512,
input_w=512,
scale=0.4,
shift=0.1):
self.keep_res = keep_res
self.pad = pad
self.input_h = input_h
self.input_w = input_w
self.scale = scale
self.shift = shift
def __call__(self, im, im_info):
"""
Args:
im (np.ndarray): image (np.ndarray)
im_info (dict): info of image
Returns:
im (np.ndarray): processed image (np.ndarray)
im_info (dict): info of processed image
"""
img = cv2.cvtColor(im, cv2.COLOR_RGB2BGR)
h, w = img.shape[:2]
if self.keep_res:
input_h = (h | self.pad) + 1
input_w = (w | self.pad) + 1
s = np.array([input_w, input_h], dtype=np.float32)
c = np.array([w // 2, h // 2], dtype=np.float32)
else:
s = max(h, w) * 1.0
input_h, input_w = self.input_h, self.input_w
c = np.array([w / 2., h / 2.], dtype=np.float32)
trans_input = get_affine_transform(c, s, 0, [input_w, input_h])
img = cv2.resize(img, (w, h))
inp = cv2.warpAffine(
img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR)
return inp, im_info
# keypoint preprocess
def get_warp_matrix(theta, size_input, size_dst, size_target):
"""This code is based on
https://github.com/open-mmlab/mmpose/blob/master/mmpose/core/post_processing/post_transforms.py
Calculate the transformation matrix under the constraint of unbiased.
Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased
Data Processing for Human Pose Estimation (CVPR 2020).
Args:
theta (float): Rotation angle in degrees.
size_input (np.ndarray): Size of input image [w, h].
size_dst (np.ndarray): Size of output image [w, h].
size_target (np.ndarray): Size of ROI in input plane [w, h].
Returns:
matrix (np.ndarray): A matrix for transformation.
"""
theta = np.deg2rad(theta)
matrix = np.zeros((2, 3), dtype=np.float32)
scale_x = size_dst[0] / size_target[0]
scale_y = size_dst[1] / size_target[1]
matrix[0, 0] = np.cos(theta) * scale_x
matrix[0, 1] = -np.sin(theta) * scale_x
matrix[0, 2] = scale_x * (
-0.5 * size_input[0] * np.cos(theta) + 0.5 * size_input[1] *
np.sin(theta) + 0.5 * size_target[0])
matrix[1, 0] = np.sin(theta) * scale_y
matrix[1, 1] = np.cos(theta) * scale_y
matrix[1, 2] = scale_y * (
-0.5 * size_input[0] * np.sin(theta) - 0.5 * size_input[1] *
np.cos(theta) + 0.5 * size_target[1])
return matrix
class TopDownEvalAffine(object):
"""apply affine transform to image and coords
Args:
trainsize (list): [w, h], the standard size used to train
use_udp (bool): whether to use Unbiased Data Processing.
records(dict): the dict contained the image and coords
Returns:
records (dict): contain the image and coords after tranformed
"""
def __init__(self, trainsize, use_udp=False):
self.trainsize = trainsize
self.use_udp = use_udp
def __call__(self, image, im_info):
rot = 0
imshape = im_info['im_shape'][::-1]
center = im_info['center'] if 'center' in im_info else imshape / 2.
scale = im_info['scale'] if 'scale' in im_info else imshape
if self.use_udp:
trans = get_warp_matrix(
rot, center * 2.0,
[self.trainsize[0] - 1.0, self.trainsize[1] - 1.0], scale)
image = cv2.warpAffine(
image,
trans, (int(self.trainsize[0]), int(self.trainsize[1])),
flags=cv2.INTER_LINEAR)
else:
trans = get_affine_transform(center, scale, rot, self.trainsize)
image = cv2.warpAffine(
image,
trans, (int(self.trainsize[0]), int(self.trainsize[1])),
flags=cv2.INTER_LINEAR)
return image, im_info
class Compose:
def __init__(self, transforms):
self.transforms = []
for op_info in transforms:
new_op_info = op_info.copy()
op_type = new_op_info.pop('type')
self.transforms.append(eval(op_type)(**new_op_info))
def __call__(self, img_path):
img, im_info = decode_image(img_path)
for t in self.transforms:
img, im_info = t(img, im_info)
inputs = copy.deepcopy(im_info)
inputs['image'] = img
return inputs
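# Usage sketch (file names hypothetical), mirroring predict_image above:
#   transforms = Compose(PredictConfig('infer_cfg.yml').preprocess_infos)
#   inputs = transforms('demo.jpg')
#   # inputs is a dict with 'image' (CHW float32), 'im_shape', 'scale_factor'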