"vscode:/vscode.git/clone" did not exist on "09183550350b793c3d9569f3aaeeb595e111f63f"
Commit fccfdfa5 authored by dlyrm's avatar dlyrm
Browse files

update code

parent dcc7bf4f
Pipeline #681 canceled with stages
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include <thread>
#include <vector>
#include "include/preprocess_op.h"
namespace PaddleDetection {
// Record the raw image geometry before any other preprocess op runs:
// im_shape_/in_net_shape_ = {rows, cols}, scale_factor_ = identity.
void InitInfo::Run(cv::Mat* im, ImageBlob* data) {
  const float rows = static_cast<float>(im->rows);
  const float cols = static_cast<float>(im->cols);
  data->im_shape_ = {rows, cols};
  data->scale_factor_ = {1., 1.};
  data->in_net_shape_ = {rows, cols};
}
// Normalize the image in place: optionally rescale pixel values by 1/255,
// then, when norm_type_ == "mean_std", apply the per-channel
// (x - mean_) / scale_ transform.
void NormalizeImage::Run(cv::Mat* im, ImageBlob* data) {
  double e = 1.0;
  if (is_scale_) {
    e /= 255.0;
  }
  (*im).convertTo(*im, CV_32FC3, e);
  if (norm_type_ == "mean_std") {
    // Iterate with row pointers instead of per-pixel cv::Mat::at<>() calls:
    // identical arithmetic and ordering, without the per-access offset
    // computation of at<>() in the inner loop.
    for (int h = 0; h < im->rows; h++) {
      cv::Vec3f* row = im->ptr<cv::Vec3f>(h);
      for (int w = 0; w < im->cols; w++) {
        row[w][0] = (row[w][0] - mean_[0]) / scale_[0];
        row[w][1] = (row[w][1] - mean_[1]) / scale_[1];
        row[w][2] = (row[w][2] - mean_[2]) / scale_[2];
      }
    }
  }
}
// Repack the HWC float image into CHW layout inside data->im_data_.
void Permute::Run(cv::Mat* im, ImageBlob* data) {
  (*im).convertTo(*im, CV_32FC3);
  const int height = im->rows;
  const int width = im->cols;
  const int channels = im->channels();
  const int plane = height * width;
  (data->im_data_).resize(channels * plane);
  float* dst = (data->im_data_).data();
  for (int c = 0; c < channels; ++c) {
    // extractChannel writes channel c directly into the c-th plane.
    cv::Mat plane_view(height, width, CV_32FC1, dst + c * plane);
    cv::extractChannel(*im, plane_view, c);
  }
}
// Resize the image by the scales computed in GenerateScale and record the
// resulting shapes. Note scale_factor_ stores {h_scale, w_scale}.
void Resize::Run(cv::Mat* im, ImageBlob* data) {
  const auto scale = GenerateScale(*im);
  cv::resize(*im, *im, cv::Size(), scale.first, scale.second, interp_);
  const float rows = static_cast<float>(im->rows);
  const float cols = static_cast<float>(im->cols);
  data->in_net_shape_ = {rows, cols};
  data->im_shape_ = {rows, cols};
  data->scale_factor_ = {scale.second, scale.first};
}
// Compute {x_scale, y_scale} for cv::resize. With keep_ratio_, a single
// uniform scale is chosen so that the short side reaches target_size_min
// without letting the long side exceed target_size_max; otherwise each axis
// is scaled independently to target_size_ ({h, w}).
std::pair<float, float> Resize::GenerateScale(const cv::Mat& im) {
  const int origin_w = im.cols;
  const int origin_h = im.rows;
  if (!keep_ratio_) {
    const float x_scale =
        static_cast<float>(target_size_[1]) / static_cast<float>(origin_w);
    const float y_scale =
        static_cast<float>(target_size_[0]) / static_cast<float>(origin_h);
    return {x_scale, y_scale};
  }
  const int im_size_max = std::max(origin_w, origin_h);
  const int im_size_min = std::min(origin_w, origin_h);
  const int target_size_max =
      *std::max_element(target_size_.begin(), target_size_.end());
  const int target_size_min =
      *std::min_element(target_size_.begin(), target_size_.end());
  const float scale_min =
      static_cast<float>(target_size_min) / static_cast<float>(im_size_min);
  const float scale_max =
      static_cast<float>(target_size_max) / static_cast<float>(im_size_max);
  const float scale_ratio = std::min(scale_min, scale_max);
  return {scale_ratio, scale_ratio};
}
// Letterbox preprocessing: resize keeping aspect ratio, then pad all sides
// with gray (127.5) to reach target_size_ ({h, w}). im_shape_ holds the
// pre-padding size, in_net_shape_ the final padded size, scale_factor_ the
// uniform resize scale.
void LetterBoxResize::Run(cv::Mat* im, ImageBlob* data) {
  float resize_scale = GenerateScale(*im);
  int new_shape_w = std::round(im->cols * resize_scale);
  int new_shape_h = std::round(im->rows * resize_scale);
  data->im_shape_ = {static_cast<float>(new_shape_h),
                     static_cast<float>(new_shape_w)};
  // Split the total padding between the two sides. The -0.1/+0.1 rounding
  // trick guarantees top + bottom == target_h - new_shape_h (and likewise
  // for the width) even when the difference is odd.
  float padw = (target_size_[1] - new_shape_w) / 2.;
  float padh = (target_size_[0] - new_shape_h) / 2.;
  int top = std::round(padh - 0.1);
  int bottom = std::round(padh + 0.1);
  int left = std::round(padw - 0.1);
  int right = std::round(padw + 0.1);
  cv::resize(
      *im, *im, cv::Size(new_shape_w, new_shape_h), 0, 0, cv::INTER_AREA);
  cv::copyMakeBorder(*im,
                     *im,
                     top,
                     bottom,
                     left,
                     right,
                     cv::BORDER_CONSTANT,
                     cv::Scalar(127.5));
  // in_net_shape_ reflects the final (padded) image. A redundant pre-padding
  // assignment that was immediately overwritten here has been removed.
  data->in_net_shape_ = {
      static_cast<float>(im->rows), static_cast<float>(im->cols),
  };
  data->scale_factor_ = {
      resize_scale, resize_scale,
  };
}
// Uniform scale that fits the image inside target_size_ ({h, w}) while
// preserving aspect ratio: the smaller of the per-axis ratios.
float LetterBoxResize::GenerateScale(const cv::Mat& im) {
  const float ratio_h =
      static_cast<float>(target_size_[0]) / static_cast<float>(im.rows);
  const float ratio_w =
      static_cast<float>(target_size_[1]) / static_cast<float>(im.cols);
  return std::min(ratio_h, ratio_w);
}
// Zero-pad the bottom/right of the image so both dimensions become multiples
// of stride_ (no-op copy when stride_ <= 0).
void PadStride::Run(cv::Mat* im, ImageBlob* data) {
  if (stride_ <= 0) {
    data->in_net_im_ = im->clone();
    return;
  }
  const int rows = im->rows;
  const int cols = im->cols;
  // Round each dimension up to the next multiple of stride_.
  const int padded_h = ((rows + stride_ - 1) / stride_) * stride_;
  const int padded_w = ((cols + stride_ - 1) / stride_) * stride_;
  cv::copyMakeBorder(*im,
                     *im,
                     0,
                     padded_h - rows,
                     0,
                     padded_w - cols,
                     cv::BORDER_CONSTANT,
                     cv::Scalar(0));
  data->in_net_im_ = im->clone();
  data->in_net_shape_ = {
      static_cast<float>(im->rows), static_cast<float>(im->cols),
  };
}
// Plain resize to the keypoint-network train size; trainsize_ is {w, h}.
void TopDownEvalAffine::Run(cv::Mat* im, ImageBlob* data) {
  const cv::Size dst_size(trainsize_[0], trainsize_[1]);
  cv::resize(*im, *im, dst_size, 0, 0, interp_);
  // todo: Simd::ResizeBilinear();
  data->in_net_shape_ = {static_cast<float>(trainsize_[1]),
                         static_cast<float>(trainsize_[0])};
}
// Build the 2x3 affine transform mapping an input-space square (centered at
// `center`, side derived from input_size.x) onto the output image of size
// output_size. Three correspondence points are used: the center, a point
// straight "up" from it, and a third obtained by rotating the center-to-up
// vector 90 degrees — enough to fix translation, rotation and scale.
void GetAffineTrans(const cv::Point2f center,
const cv::Point2f input_size,
const cv::Point2f output_size,
cv::Mat* trans) {
cv::Point2f srcTri[3];
cv::Point2f dstTri[3];
float src_w = input_size.x;
float dst_w = output_size.x;
float dst_h = output_size.y;
// Direction vectors pointing "up" by half the reference width.
cv::Point2f src_dir(0, -0.5 * src_w);
cv::Point2f dst_dir(0, -0.5 * dst_w);
srcTri[0] = center;
srcTri[1] = center + src_dir;
cv::Point2f src_d = srcTri[0] - srcTri[1];
// Third point: rotate (srcTri[0] - srcTri[1]) by 90 degrees around srcTri[1].
srcTri[2] = srcTri[1] + cv::Point2f(-src_d.y, src_d.x);
// Destination triangle is anchored at the output image center.
dstTri[0] = cv::Point2f(dst_w * 0.5, dst_h * 0.5);
dstTri[1] = cv::Point2f(dst_w * 0.5, dst_h * 0.5) + dst_dir;
cv::Point2f dst_d = dstTri[0] - dstTri[1];
dstTri[2] = dstTri[1] + cv::Point2f(-dst_d.y, dst_d.x);
*trans = cv::getAffineTransform(srcTri, dstTri);
}
// Warp the image into the network input frame (CenterNet-style). In
// keep_res_ mode the input size is the original size rounded up with a
// bitwise OR against pad_ (e.g. pad_ == 31 rounds up to the next multiple of
// 32); otherwise the image is embedded in a square of side max(h, w) and
// warped to the fixed (input_w_, input_h_).
void WarpAffine::Run(cv::Mat* im, ImageBlob* data) {
  cv::cvtColor(*im, *im, cv::COLOR_RGB2BGR);
  cv::Mat trans(2, 3, CV_32FC1);
  cv::Point2f center;
  cv::Point2f input_size;
  int h = im->rows;
  int w = im->cols;
  if (keep_res_) {
    input_h_ = (h | pad_) + 1;
    // BUGFIX: was (w + pad_) + 1. The width must be rounded up the same way
    // as the height — bitwise OR with pad_ — not shifted by pad_ + 1 pixels.
    input_w_ = (w | pad_) + 1;
    input_size = cv::Point2f(input_w_, input_h_);
    center = cv::Point2f(w / 2, h / 2);
  } else {
    float s = std::max(h, w) * 1.0;
    input_size = cv::Point2f(s, s);
    center = cv::Point2f(w / 2., h / 2.);
  }
  cv::Point2f output_size(input_w_, input_h_);
  GetAffineTrans(center, input_size, output_size, &trans);
  cv::warpAffine(*im, *im, trans, cv::Size(input_w_, input_h_));
  data->in_net_shape_ = {
      static_cast<float>(input_h_), static_cast<float>(input_w_),
  };
}
// Pad the bottom/right with the value 114 until the image reaches size_
// ({h, w}); when the image already matches, only clone it into in_net_im_.
void Pad::Run(cv::Mat* im, ImageBlob* data) {
  const int target_h = size_[0];
  const int target_w = size_[1];
  const int cur_h = im->rows;
  const int cur_w = im->cols;
  if (target_h == cur_h && target_w == cur_w) {
    data->in_net_im_ = im->clone();
    return;
  }
  cv::copyMakeBorder(*im,
                     *im,
                     0,
                     target_h - cur_h,
                     0,
                     target_w - cur_w,
                     cv::BORDER_CONSTANT,
                     cv::Scalar(114));
  data->in_net_im_ = im->clone();
  data->in_net_shape_ = {
      static_cast<float>(im->rows), static_cast<float>(im->cols),
  };
}
// Preprocessor op running order. Preprocessor::Run walks this list and
// executes only the ops actually present in ops_, so the list is a fixed
// superset: geometry ops (resize/affine variants) first, then value
// normalization, then padding, with the HWC->CHW Permute always last.
const std::vector<std::string> Preprocessor::RUN_ORDER = {"InitInfo",
"TopDownEvalAffine",
"Resize",
"LetterBoxResize",
"WarpAffine",
"NormalizeImage",
"PadStride",
"Pad",
"Permute"};
// Apply every configured op to the image, in the canonical RUN_ORDER.
void Preprocessor::Run(cv::Mat* im, ImageBlob* data) {
  for (const auto& op_name : RUN_ORDER) {
    auto it = ops_.find(op_name);
    if (it != ops_.end()) {
      it->second->Run(im, data);
    }
  }
}
// Crop a (slightly expanded) region around `area` = {x1, y1, x2, y2} out of
// img into crop_img, forcing the crop to a 3:4 (w:h) aspect ratio. Outputs
// the crop center (x, y) into `center` and its size (w, h) into `scale`,
// both overwriting any previous contents.
void CropImg(cv::Mat& img,
cv::Mat& crop_img,
std::vector<int>& area,
std::vector<float>& center,
std::vector<float>& scale,
float expandratio) {
// Clamp the requested box to the image bounds.
int crop_x1 = std::max(0, area[0]);
int crop_y1 = std::max(0, area[1]);
int crop_x2 = std::min(img.cols - 1, area[2]);
int crop_y2 = std::min(img.rows - 1, area[3]);
int center_x = (crop_x1 + crop_x2) / 2.;
int center_y = (crop_y1 + crop_y2) / 2.;
int half_h = (crop_y2 - crop_y1) / 2.;
int half_w = (crop_x2 - crop_x1) / 2.;
// adjust h or w to keep image ratio, expand the shorter edge
// Target ratio is w:h = 3:4; the comparison half_h * 3 > half_w * 4 decides
// which edge is short, and integer truncation makes the result approximate.
if (half_h * 3 > half_w * 4) {
half_w = static_cast<int>(half_h * 0.75);
} else {
half_h = static_cast<int>(half_w * 4 / 3);
}
// Grow the box by expandratio around the center, re-clamping to the image.
crop_x1 =
std::max(0, center_x - static_cast<int>(half_w * (1 + expandratio)));
crop_y1 =
std::max(0, center_y - static_cast<int>(half_h * (1 + expandratio)));
crop_x2 = std::min(img.cols - 1,
static_cast<int>(center_x + half_w * (1 + expandratio)));
crop_y2 = std::min(img.rows - 1,
static_cast<int>(center_y + half_h * (1 + expandratio)));
// cv::Range end is exclusive, hence the +1 to include crop_x2/crop_y2.
crop_img =
img(cv::Range(crop_y1, crop_y2 + 1), cv::Range(crop_x1, crop_x2 + 1));
center.clear();
center.emplace_back((crop_x1 + crop_x2) / 2);
center.emplace_back((crop_y1 + crop_y2) / 2);
scale.clear();
scale.emplace_back((crop_x2 - crop_x1));
scale.emplace_back((crop_y2 - crop_y1));
}
// Returns true when the batch contains images of differing sizes (which
// requires dynamic-shape handling / padding downstream).
bool CheckDynamicInput(const std::vector<cv::Mat>& imgs) {
  if (imgs.size() == 1) return false;
  const int ref_h = imgs.at(0).rows;
  const int ref_w = imgs.at(0).cols;
  for (size_t i = 1; i < imgs.size(); ++i) {
    if (imgs.at(i).rows != ref_h || imgs.at(i).cols != ref_w) {
      return true;
    }
  }
  return false;
}
// Zero-pad every image in the batch (bottom/right) to the maximum height and
// width found in the batch, so all outputs share one shape.
std::vector<cv::Mat> PadBatch(const std::vector<cv::Mat>& imgs) {
  int max_h = 0;
  int max_w = 0;
  for (const auto& img : imgs) {
    max_h = std::max(max_h, img.rows);
    max_w = std::max(max_w, img.cols);
  }
  std::vector<cv::Mat> out_imgs;
  out_imgs.reserve(imgs.size());
  for (const auto& img : imgs) {
    cv::Mat padded = img;
    cv::copyMakeBorder(padded,
                       padded,
                       0,
                       max_h - img.rows,
                       0,
                       max_w - img.cols,
                       cv::BORDER_CONSTANT,
                       cv::Scalar(0));
    out_imgs.push_back(padded);
  }
  return out_imgs;
}
} // namespace PaddleDetection
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// The code is based on:
// https://github.com/CnybTseng/JDE/blob/master/platforms/common/jdetracker.cpp
// Ths copyright of CnybTseng/JDE is as follows:
// MIT License
#include <map>
#include <stdio.h>
#include <limits.h>
#include <algorithm>
#include "include/lapjv.h"
#include "include/tracker.h"
#define mat2vec4f(m) cv::Vec4f(*m.ptr<float>(0,0), *m.ptr<float>(0,1), *m.ptr<float>(0,2), *m.ptr<float>(0,3))
namespace PaddleDetection {
// 0.95 quantile of the chi-square distribution, keyed by degrees of freedom
// (1..9). Used as a gating threshold on Mahalanobis distances in
// motion_distance (4 DOF for the xyah measurement).
static std::map<int, float> chi2inv95 = {
{1, 3.841459f},
{2, 5.991465f},
{3, 7.814728f},
{4, 9.487729f},
{5, 11.070498f},
{6, 12.591587f},
{7, 14.067140f},
{8, 15.507313f},
{9, 16.918978f}
};
// Process-wide singleton instance; intentionally never deleted (lives for
// the lifetime of the program).
JDETracker *JDETracker::me = new JDETracker;
JDETracker *JDETracker::instance(void)
{
return me;
}
// Defaults: max_lost_time — frames a lost track is kept before removal;
// lambda — weight of appearance (embedding) cost vs. motion cost in
// motion_distance; det_thresh — minimum detection score to start a new track.
JDETracker::JDETracker(void) : timestamp(0), max_lost_time(30), lambda(0.98f), det_thresh(0.3f)
{
}
// One tracking step (JDE/DeepSORT-style cascade). `dets` holds one detection
// per row (score at column 1, ltrb box at columns 2..5 — see the cv::Rect
// reads below), `emb` the matching embedding rows. Matched/activated tracks
// are written into `tracks`.
// NOTE(review): the function is declared bool but ends with `return 0;`,
// i.e. it always reports false — confirm callers ignore the return value.
bool JDETracker::update(const cv::Mat &dets, const cv::Mat &emb, std::vector<Track> &tracks)
{
++timestamp;
// Wrap every detection row into a candidate Trajectory.
TrajectoryPool candidates(dets.rows);
for (int i = 0; i < dets.rows; ++i)
{
float score = *dets.ptr<float>(i, 1);
const cv::Mat &ltrb_ = dets(cv::Rect(2, i, 4, 1));
cv::Vec4f ltrb = mat2vec4f(ltrb_);
const cv::Mat &embedding = emb(cv::Rect(0, i, emb.cols, 1));
candidates[i] = Trajectory(ltrb, score, embedding);
}
// Split current tracks into activated vs. not-yet-confirmed.
TrajectoryPtrPool tracked_trajectories;
TrajectoryPtrPool unconfirmed_trajectories;
for (size_t i = 0; i < this->tracked_trajectories.size(); ++i)
{
if (this->tracked_trajectories[i].is_activated)
tracked_trajectories.push_back(&this->tracked_trajectories[i]);
else
unconfirmed_trajectories.push_back(&this->tracked_trajectories[i]);
}
// Predict all (tracked + lost) tracks one step forward.
TrajectoryPtrPool trajectory_pool = tracked_trajectories + this->lost_trajectories;
for (size_t i = 0; i < trajectory_pool.size(); ++i)
trajectory_pool[i]->predict();
// --- Stage 1: match on fused embedding + motion cost. ---
Match matches;
std::vector<int> mismatch_row;
std::vector<int> mismatch_col;
cv::Mat cost = motion_distance(trajectory_pool, candidates);
linear_assignment(cost, 0.7f, matches, mismatch_row, mismatch_col);
MatchIterator miter;
TrajectoryPtrPool activated_trajectories;
TrajectoryPtrPool retrieved_trajectories;
for (miter = matches.begin(); miter != matches.end(); miter++)
{
Trajectory *pt = trajectory_pool[miter->first];
Trajectory &ct = candidates[miter->second];
if (pt->state == Tracked)
{
pt->update(ct, timestamp);
activated_trajectories.push_back(pt);
}
else
{
// A lost track matched again: reactivate it.
pt->reactivate(ct, timestamp);
retrieved_trajectories.push_back(pt);
}
}
// --- Stage 2: IoU matching for leftovers (tracked tracks only). ---
TrajectoryPtrPool next_candidates(mismatch_col.size());
for (size_t i = 0; i < mismatch_col.size(); ++i)
next_candidates[i] = &candidates[mismatch_col[i]];
TrajectoryPtrPool next_trajectory_pool;
for (size_t i = 0; i < mismatch_row.size(); ++i)
{
int j = mismatch_row[i];
if (trajectory_pool[j]->state == Tracked)
next_trajectory_pool.push_back(trajectory_pool[j]);
}
cost = iou_distance(next_trajectory_pool, next_candidates);
linear_assignment(cost, 0.5f, matches, mismatch_row, mismatch_col);
for (miter = matches.begin(); miter != matches.end(); miter++)
{
Trajectory *pt = next_trajectory_pool[miter->first];
Trajectory *ct = next_candidates[miter->second];
if (pt->state == Tracked)
{
pt->update(*ct, timestamp);
activated_trajectories.push_back(pt);
}
else
{
pt->reactivate(*ct, timestamp);
retrieved_trajectories.push_back(pt);
}
}
// Tracked tracks that still found no detection become lost.
TrajectoryPtrPool lost_trajectories;
for (size_t i = 0; i < mismatch_row.size(); ++i)
{
Trajectory *pt = next_trajectory_pool[mismatch_row[i]];
if (pt->state != Lost)
{
pt->mark_lost();
lost_trajectories.push_back(pt);
}
}
// --- Stage 3: match remaining detections to unconfirmed tracks by IoU. ---
TrajectoryPtrPool nnext_candidates(mismatch_col.size());
for (size_t i = 0; i < mismatch_col.size(); ++i)
nnext_candidates[i] = next_candidates[mismatch_col[i]];
cost = iou_distance(unconfirmed_trajectories, nnext_candidates);
linear_assignment(cost, 0.7f, matches, mismatch_row, mismatch_col);
for (miter = matches.begin(); miter != matches.end(); miter++)
{
unconfirmed_trajectories[miter->first]->update(*nnext_candidates[miter->second], timestamp);
activated_trajectories.push_back(unconfirmed_trajectories[miter->first]);
}
// Unconfirmed tracks with no match are discarded outright.
TrajectoryPtrPool removed_trajectories;
for (size_t i = 0; i < mismatch_row.size(); ++i)
{
unconfirmed_trajectories[mismatch_row[i]]->mark_removed();
removed_trajectories.push_back(unconfirmed_trajectories[mismatch_row[i]]);
}
// Remaining high-score detections start brand-new tracks.
for (size_t i = 0; i < mismatch_col.size(); ++i)
{
if (nnext_candidates[mismatch_col[i]]->score < det_thresh) continue;
nnext_candidates[mismatch_col[i]]->activate(timestamp);
activated_trajectories.push_back(nnext_candidates[mismatch_col[i]]);
}
// Expire lost tracks that have been missing for too long.
for (size_t i = 0; i < this->lost_trajectories.size(); ++i)
{
Trajectory &lt = this->lost_trajectories[i];
if (timestamp - lt.timestamp > max_lost_time)
{
lt.mark_removed();
removed_trajectories.push_back(&lt);
}
}
// Rebuild the bookkeeping pools (the overloaded +/-/+= operators merge and
// subtract pools by trajectory id).
TrajectoryPoolIterator piter;
for (piter = this->tracked_trajectories.begin(); piter != this->tracked_trajectories.end(); )
{
if (piter->state != Tracked)
piter = this->tracked_trajectories.erase(piter);
else
++piter;
}
this->tracked_trajectories += activated_trajectories;
this->tracked_trajectories += retrieved_trajectories;
this->lost_trajectories -= this->tracked_trajectories;
this->lost_trajectories += lost_trajectories;
this->lost_trajectories -= this->removed_trajectories;
this->removed_trajectories += removed_trajectories;
remove_duplicate_trajectory(this->tracked_trajectories, this->lost_trajectories);
// Export the activated tracks.
tracks.clear();
for (size_t i = 0; i < this->tracked_trajectories.size(); ++i)
{
if (this->tracked_trajectories[i].is_activated)
{
Track track = {
.id = this->tracked_trajectories[i].id,
.score = this->tracked_trajectories[i].score,
.ltrb = this->tracked_trajectories[i].ltrb};
tracks.push_back(track);
}
}
return 0;
}
// Fused assignment cost: lambda * embedding distance + (1 - lambda) *
// Mahalanobis motion distance. Entries whose motion distance exceeds the
// chi-square 0.95 gate (4 DOF) are set to FLT_MAX so the solver never picks
// them.
cv::Mat JDETracker::motion_distance(const TrajectoryPtrPool &a, const TrajectoryPool &b)
{
if (0 == a.size() || 0 == b.size())
// One dimension is 0, so the (uninitialized) matrix holds no elements.
return cv::Mat(a.size(), b.size(), CV_32F);
cv::Mat edists = embedding_distance(a, b);
cv::Mat mdists = mahalanobis_distance(a, b);
cv::Mat fdists = lambda * edists + (1 - lambda) * mdists;
const float gate_thresh = chi2inv95[4];
for (int i = 0; i < fdists.rows; ++i)
{
for (int j = 0; j < fdists.cols; ++j)
{
if (*mdists.ptr<float>(i, j) > gate_thresh)
*fdists.ptr<float>(i, j) = FLT_MAX;
}
}
return fdists;
}
// Solve the rectangular assignment problem on `cost` with the LAPJV solver.
// Row i assigned to column j (>= 0) is recorded in `matches`; unassigned
// rows/columns are reported in mismatch_row / mismatch_col. An empty cost
// matrix degenerates to "every row and column mismatched".
// (Removed an unused local `float opt = 0;` from the original.)
void JDETracker::linear_assignment(const cv::Mat &cost, float cost_limit, Match &matches,
    std::vector<int> &mismatch_row, std::vector<int> &mismatch_col)
{
    matches.clear();
    mismatch_row.clear();
    mismatch_col.clear();
    if (cost.empty())
    {
        for (int i = 0; i < cost.rows; ++i)
            mismatch_row.push_back(i);
        for (int i = 0; i < cost.cols; ++i)
            mismatch_col.push_back(i);
        return;
    }
    // x[i] = column assigned to row i (-1 if none); y[j] likewise for columns.
    cv::Mat x(cost.rows, 1, CV_32S);
    cv::Mat y(cost.cols, 1, CV_32S);
    lapjv_internal(cost, true, cost_limit,
        (int *)x.data, (int *)y.data);
    for (int i = 0; i < x.rows; ++i)
    {
        int j = *x.ptr<int>(i);
        if (j >= 0)
            matches.insert({i, j});
        else
            mismatch_row.push_back(i);
    }
    for (int i = 0; i < y.rows; ++i)
    {
        int j = *y.ptr<int>(i);
        if (j < 0)
            mismatch_col.push_back(i);
    }
}
// For each pair (a[y], b[x]) whose boxes overlap with IoU distance below
// iou_thresh, keep the track that has lived longer and schedule the other
// for removal; then erase the scheduled indices from each pool.
void JDETracker::remove_duplicate_trajectory(TrajectoryPool &a, TrajectoryPool &b, float iou_thresh)
{
if (0 == a.size() || 0 == b.size())
return;
cv::Mat dist = iou_distance(a, b);
// mask is non-zero where the two boxes are near-duplicates.
cv::Mat mask = dist < iou_thresh;
std::vector<cv::Point> idx;
cv::findNonZero(mask, idx);
std::vector<int> da;
std::vector<int> db;
for (size_t i = 0; i < idx.size(); ++i)
{
// idx[i].y indexes pool a (rows), idx[i].x indexes pool b (cols).
int ta = a[idx[i].y].timestamp - a[idx[i].y].starttime;
int tb = b[idx[i].x].timestamp - b[idx[i].x].starttime;
if (ta > tb)
db.push_back(idx[i].x);
else
da.push_back(idx[i].y);
}
// Erase by original index: `id` counts positions as we walk each pool.
int id = 0;
TrajectoryPoolIterator piter;
for (piter = a.begin(); piter != a.end(); )
{
std::vector<int>::iterator iter = find(da.begin(), da.end(), id++);
if (iter != da.end())
piter = a.erase(piter);
else
++piter;
}
id = 0;
for (piter = b.begin(); piter != b.end(); )
{
std::vector<int>::iterator iter = find(db.begin(), db.end(), id++);
if (iter != db.end())
piter = b.erase(piter);
else
++piter;
}
}
} // namespace PaddleDetection
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// The code is based on:
// https://github.com/CnybTseng/JDE/blob/master/platforms/common/trajectory.cpp
// Ths copyright of CnybTseng/JDE is as follows:
// MIT License
#include <algorithm>
#include "include/trajectory.h"
namespace PaddleDetection {
// Initialize the 8-state Kalman filter from a 4x1 measurement (presumably
// xyah: center x/y, aspect, height — see Trajectory's calls with traj.xyah;
// confirm against trajectory.h): position block <- measurement, velocity
// block <- 0. Initial covariance is scaled by the measured height
// (measurement row 3), following the DeepSORT std_weight convention.
void TKalmanFilter::init(const cv::Mat &measurement)
{
measurement.copyTo(statePost(cv::Rect(0, 0, 1, 4)));
statePost(cv::Rect(0, 4, 1, 4)).setTo(0);
statePost.copyTo(statePre);
float varpos = 2 * std_weight_position * (*measurement.ptr<float>(3));
varpos *= varpos;
float varvel = 10 * std_weight_velocity * (*measurement.ptr<float>(3));
varvel *= varvel;
errorCovPost.setTo(0);
// Diagonal covariance: position terms, a small fixed variance for the
// aspect-ratio state (index 2) and its velocity (index 6).
*errorCovPost.ptr<float>(0, 0) = varpos;
*errorCovPost.ptr<float>(1, 1) = varpos;
*errorCovPost.ptr<float>(2, 2) = 1e-4f;
*errorCovPost.ptr<float>(3, 3) = varpos;
*errorCovPost.ptr<float>(4, 4) = varvel;
*errorCovPost.ptr<float>(5, 5) = varvel;
*errorCovPost.ptr<float>(6, 6) = 1e-10f;
*errorCovPost.ptr<float>(7, 7) = varvel;
errorCovPost.copyTo(errorCovPre);
}
// Rebuild the process noise from the current predicted height (statePre row
// 3) before delegating to the stock cv::KalmanFilter::predict().
const cv::Mat &TKalmanFilter::predict()
{
float varpos = std_weight_position * (*statePre.ptr<float>(3));
varpos *= varpos;
float varvel = std_weight_velocity * (*statePre.ptr<float>(3));
varvel *= varvel;
processNoiseCov.setTo(0);
*processNoiseCov.ptr<float>(0, 0) = varpos;
*processNoiseCov.ptr<float>(1, 1) = varpos;
*processNoiseCov.ptr<float>(2, 2) = 1e-4f;
*processNoiseCov.ptr<float>(3, 3) = varpos;
*processNoiseCov.ptr<float>(4, 4) = varvel;
*processNoiseCov.ptr<float>(5, 5) = varvel;
*processNoiseCov.ptr<float>(6, 6) = 1e-10f;
*processNoiseCov.ptr<float>(7, 7) = varvel;
return cv::KalmanFilter::predict();
}
// Rebuild the measurement noise from the measured height (row 3) before
// delegating to the stock cv::KalmanFilter::correct().
const cv::Mat &TKalmanFilter::correct(const cv::Mat &measurement)
{
float varpos = std_weight_position * (*measurement.ptr<float>(3));
varpos *= varpos;
measurementNoiseCov.setTo(0);
*measurementNoiseCov.ptr<float>(0, 0) = varpos;
*measurementNoiseCov.ptr<float>(1, 1) = varpos;
*measurementNoiseCov.ptr<float>(2, 2) = 1e-2f;
*measurementNoiseCov.ptr<float>(3, 3) = varpos;
return cv::KalmanFilter::correct(measurement);
}
// Project the filter state into measurement space:
// mean = H * statePost, covariance = H * P * H^T + R, where R is the
// height-scaled measurement noise built here. Used for gating (Mahalanobis
// distance) without mutating the filter.
void TKalmanFilter::project(cv::Mat &mean, cv::Mat &covariance) const
{
float varpos = std_weight_position * (*statePost.ptr<float>(3));
varpos *= varpos;
cv::Mat measurementNoiseCov_ = cv::Mat::eye(4, 4, CV_32F);
*measurementNoiseCov_.ptr<float>(0, 0) = varpos;
*measurementNoiseCov_.ptr<float>(1, 1) = varpos;
*measurementNoiseCov_.ptr<float>(2, 2) = 1e-2f;
*measurementNoiseCov_.ptr<float>(3, 3) = varpos;
mean = measurementMatrix * statePost;
cv::Mat temp = measurementMatrix * errorCovPost;
// gemm computes covariance = temp * H^T + 1 * measurementNoiseCov_.
gemm(temp, measurementMatrix, 1, measurementNoiseCov_, 1, covariance, cv::GEMM_2_T);
}
// Global id counter backing Trajectory::next_id().
int Trajectory::count = 0;
// Kalman predict step; for non-tracked trajectories the last velocity
// component (state index 7, the height velocity) is zeroed first so a lost
// track does not keep growing/shrinking while unobserved.
const cv::Mat &Trajectory::predict(void)
{
if (state != Tracked)
*cv::KalmanFilter::statePost.ptr<float>(7) = 0;
return TKalmanFilter::predict();
}
// Absorb a matched detection: copy its box, run the Kalman correction on its
// xyah measurement, mark the track as tracked/activated and optionally fold
// the detection embedding into the smoothed embedding.
void Trajectory::update(Trajectory &traj, int timestamp_, bool update_embedding_)
{
timestamp = timestamp_;
++length;
ltrb = traj.ltrb;
xyah = traj.xyah;
TKalmanFilter::correct(cv::Mat(traj.xyah));
state = Tracked;
is_activated = true;
score = traj.score;
if (update_embedding_)
update_embedding(traj.current_embedding);
}
// Start a brand-new track: assign a fresh id, seed the Kalman filter from
// the current xyah box. Only tracks born on the very first frame
// (timestamp_ == 1) are activated immediately; later ones must be
// re-confirmed before becoming activated.
void Trajectory::activate(int timestamp_)
{
id = next_id();
TKalmanFilter::init(cv::Mat(xyah));
length = 0;
state = Tracked;
if (timestamp_ == 1) {
is_activated = true;
}
timestamp = timestamp_;
starttime = timestamp_;
}
// Revive a lost track from a matched detection: Kalman-correct with the
// detection's xyah, refresh the embedding, and mark it tracked/activated.
// With newid, the track is given a fresh identity.
void Trajectory::reactivate(Trajectory &traj, int timestamp_, bool newid)
{
TKalmanFilter::correct(cv::Mat(traj.xyah));
update_embedding(traj.current_embedding);
length = 0;
state = Tracked;
is_activated = true;
timestamp = timestamp_;
if (newid)
id = next_id();
}
// Maintain an exponential moving average of the (L2-normalized) appearance
// embedding; eta is the smoothing factor. The result is re-normalized so
// smooth_embedding stays a unit vector.
// NOTE(review): a zero-norm embedding would divide by zero here — presumed
// impossible upstream; verify the reid model never emits an all-zero vector.
void Trajectory::update_embedding(const cv::Mat &embedding)
{
current_embedding = embedding / cv::norm(embedding);
if (smooth_embedding.empty())
{
smooth_embedding = current_embedding;
}
else
{
smooth_embedding = eta * smooth_embedding + (1 - eta) * current_embedding;
}
smooth_embedding = smooth_embedding / cv::norm(smooth_embedding);
}
// Union of two trajectory pools keyed by trajectory id: all of `a`, then the
// elements of `b` whose id was not already present. Order is preserved.
TrajectoryPool operator+(const TrajectoryPool &a, const TrajectoryPool &b)
{
    TrajectoryPool merged(a.begin(), a.end());
    std::vector<int> seen_ids;
    seen_ids.reserve(a.size() + b.size());
    for (const Trajectory &t : a)
        seen_ids.push_back(t.id);
    for (const Trajectory &t : b)
    {
        if (find(seen_ids.begin(), seen_ids.end(), t.id) == seen_ids.end())
        {
            merged.push_back(t);
            seen_ids.push_back(t.id);
        }
    }
    return merged;
}
// Union of a value pool with a pointer pool, keyed by id: all of `a`, then
// copies of the pointees in `b` whose id was not already present.
TrajectoryPool operator+(const TrajectoryPool &a, const TrajectoryPtrPool &b)
{
    TrajectoryPool merged(a.begin(), a.end());
    std::vector<int> seen_ids;
    seen_ids.reserve(a.size() + b.size());
    for (const Trajectory &t : a)
        seen_ids.push_back(t.id);
    for (Trajectory *t : b)
    {
        if (find(seen_ids.begin(), seen_ids.end(), t->id) == seen_ids.end())
        {
            merged.push_back(*t);
            seen_ids.push_back(t->id);
        }
    }
    return merged;
}
// Append copies of the trajectories in `b` whose id is not already in `a`.
// Trajectories without a smooth embedding are deliberately skipped — they
// cannot participate in embedding-distance computations later.
TrajectoryPool &operator+=(TrajectoryPool &a, const TrajectoryPtrPool &b)
{
std::vector<int> ids(a.size());
for (size_t i = 0; i < a.size(); ++i)
ids[i] = a[i].id;
for (size_t i = 0; i < b.size(); ++i)
{
if (b[i]->smooth_embedding.empty())
continue;
std::vector<int>::iterator iter = find(ids.begin(), ids.end(), b[i]->id);
if (iter == ids.end())
{
a.push_back(*b[i]);
ids.push_back(b[i]->id);
}
}
return a;
}
// Set difference by trajectory id: elements of `a` whose id is not in `b`.
TrajectoryPool operator-(const TrajectoryPool &a, const TrajectoryPool &b)
{
    std::vector<int> removed_ids;
    removed_ids.reserve(b.size());
    for (const Trajectory &t : b)
        removed_ids.push_back(t.id);
    TrajectoryPool result;
    for (const Trajectory &t : a)
    {
        if (find(removed_ids.begin(), removed_ids.end(), t.id) == removed_ids.end())
            result.push_back(t);
    }
    return result;
}
// In-place set difference by trajectory id: erase from `a` every element
// whose id also appears in `b`.
TrajectoryPool &operator-=(TrajectoryPool &a, const TrajectoryPool &b)
{
    std::vector<int> removed_ids;
    removed_ids.reserve(b.size());
    for (const Trajectory &t : b)
        removed_ids.push_back(t.id);
    for (TrajectoryPoolIterator it = a.begin(); it != a.end(); )
    {
        if (find(removed_ids.begin(), removed_ids.end(), it->id) != removed_ids.end())
            it = a.erase(it);
        else
            ++it;
    }
    return a;
}
// Union of two pointer pools keyed by id: all of `a`, then pointers from `b`
// whose id was not already present.
TrajectoryPtrPool operator+(const TrajectoryPtrPool &a, const TrajectoryPtrPool &b)
{
    TrajectoryPtrPool merged(a.begin(), a.end());
    std::vector<int> seen_ids;
    seen_ids.reserve(a.size() + b.size());
    for (Trajectory *t : a)
        seen_ids.push_back(t->id);
    for (Trajectory *t : b)
    {
        if (find(seen_ids.begin(), seen_ids.end(), t->id) == seen_ids.end())
        {
            merged.push_back(t);
            seen_ids.push_back(t->id);
        }
    }
    return merged;
}
// Union of a pointer pool with a value pool, keyed by id: all of `a`, then
// addresses of the elements of `b` whose id was not already present. The
// result aliases `b`'s storage, so `b` must outlive it.
TrajectoryPtrPool operator+(const TrajectoryPtrPool &a, TrajectoryPool &b)
{
    TrajectoryPtrPool merged(a.begin(), a.end());
    std::vector<int> seen_ids;
    seen_ids.reserve(a.size() + b.size());
    for (Trajectory *t : a)
        seen_ids.push_back(t->id);
    for (Trajectory &t : b)
    {
        if (find(seen_ids.begin(), seen_ids.end(), t.id) == seen_ids.end())
        {
            merged.push_back(&t);
            seen_ids.push_back(t.id);
        }
    }
    return merged;
}
// Set difference by id: pointers from `a` whose id does not occur in `b`.
TrajectoryPtrPool operator-(const TrajectoryPtrPool &a, const TrajectoryPtrPool &b)
{
    std::vector<int> removed_ids;
    removed_ids.reserve(b.size());
    for (Trajectory *t : b)
        removed_ids.push_back(t->id);
    TrajectoryPtrPool result;
    for (Trajectory *t : a)
    {
        if (find(removed_ids.begin(), removed_ids.end(), t->id) == removed_ids.end())
            result.push_back(t);
    }
    return result;
}
// Cosine-distance matrix between the smooth embeddings of two pools:
// d = |1 - cos(u, v)|, clamped to [0, 2].
cv::Mat embedding_distance(const TrajectoryPool &a, const TrajectoryPool &b)
{
    cv::Mat dists(a.size(), b.size(), CV_32F);
    for (size_t i = 0; i < a.size(); ++i)
    {
        float *row = dists.ptr<float>(i);
        for (size_t j = 0; j < b.size(); ++j)
        {
            const cv::Mat &u = a[i].smooth_embedding;
            const cv::Mat &v = b[j].smooth_embedding;
            const double cosine = u.dot(v) / std::sqrt(u.dot(u) * v.dot(v));
            const double dist = std::abs(1. - cosine);
            row[j] = static_cast<float>(std::max(std::min(dist, 2.), 0.));
        }
    }
    return dists;
}
// Cosine-distance matrix, pointer-pool overload: d = |1 - cos(u, v)|,
// clamped to [0, 2].
cv::Mat embedding_distance(const TrajectoryPtrPool &a, const TrajectoryPtrPool &b)
{
    cv::Mat dists(a.size(), b.size(), CV_32F);
    for (size_t i = 0; i < a.size(); ++i)
    {
        float *row = dists.ptr<float>(i);
        for (size_t j = 0; j < b.size(); ++j)
        {
            const cv::Mat &u = a[i]->smooth_embedding;
            const cv::Mat &v = b[j]->smooth_embedding;
            const double cosine = u.dot(v) / std::sqrt(u.dot(u) * v.dot(v));
            const double dist = std::abs(1. - cosine);
            row[j] = static_cast<float>(std::max(std::min(dist, 2.), 0.));
        }
    }
    return dists;
}
// Cosine-distance matrix, mixed pointer/value overload: d = |1 - cos(u, v)|,
// clamped to [0, 2].
cv::Mat embedding_distance(const TrajectoryPtrPool &a, const TrajectoryPool &b)
{
    cv::Mat dists(a.size(), b.size(), CV_32F);
    for (size_t i = 0; i < a.size(); ++i)
    {
        float *row = dists.ptr<float>(i);
        for (size_t j = 0; j < b.size(); ++j)
        {
            const cv::Mat &u = a[i]->smooth_embedding;
            const cv::Mat &v = b[j].smooth_embedding;
            const double cosine = u.dot(v) / std::sqrt(u.dot(u) * v.dot(v));
            const double dist = std::abs(1. - cosine);
            row[j] = static_cast<float>(std::max(std::min(dist, 2.), 0.));
        }
    }
    return dists;
}
// Squared Mahalanobis distance between each track's projected Kalman state
// (mean + inverted covariance) and each candidate's xyah measurement.
cv::Mat mahalanobis_distance(const TrajectoryPool &a, const TrajectoryPool &b)
{
std::vector<cv::Mat> means(a.size());
std::vector<cv::Mat> icovariances(a.size());
for (size_t i = 0; i < a.size(); ++i)
{
cv::Mat covariance;
// Project into measurement space, then invert once per track.
a[i].project(means[i], covariance);
cv::invert(covariance, icovariances[i]);
}
cv::Mat dists(a.size(), b.size(), CV_32F);
for (size_t i = 0; i < a.size(); ++i)
{
float *distsi = dists.ptr<float>(i);
for (size_t j = 0; j < b.size(); ++j)
{
const cv::Mat x(b[j].xyah);
// cv::Mahalanobis returns the distance; we store its square.
float dist = static_cast<float>(cv::Mahalanobis(x, means[i], icovariances[i]));
distsi[j] = dist * dist;
}
}
return dists;
}
// Squared Mahalanobis distance, pointer-pool overload (see the value-pool
// overload above for details).
cv::Mat mahalanobis_distance(const TrajectoryPtrPool &a, const TrajectoryPtrPool &b)
{
std::vector<cv::Mat> means(a.size());
std::vector<cv::Mat> icovariances(a.size());
for (size_t i = 0; i < a.size(); ++i)
{
cv::Mat covariance;
a[i]->project(means[i], covariance);
cv::invert(covariance, icovariances[i]);
}
cv::Mat dists(a.size(), b.size(), CV_32F);
for (size_t i = 0; i < a.size(); ++i)
{
float *distsi = dists.ptr<float>(i);
for (size_t j = 0; j < b.size(); ++j)
{
const cv::Mat x(b[j]->xyah);
float dist = static_cast<float>(cv::Mahalanobis(x, means[i], icovariances[i]));
distsi[j] = dist * dist;
}
}
return dists;
}
// Squared Mahalanobis distance, mixed pointer/value overload: tracks (a,
// pointers) vs. detection candidates (b, values).
cv::Mat mahalanobis_distance(const TrajectoryPtrPool &a, const TrajectoryPool &b)
{
std::vector<cv::Mat> means(a.size());
std::vector<cv::Mat> icovariances(a.size());
for (size_t i = 0; i < a.size(); ++i)
{
cv::Mat covariance;
a[i]->project(means[i], covariance);
cv::invert(covariance, icovariances[i]);
}
cv::Mat dists(a.size(), b.size(), CV_32F);
for (size_t i = 0; i < a.size(); ++i)
{
float *distsi = dists.ptr<float>(i);
for (size_t j = 0; j < b.size(); ++j)
{
const cv::Mat x(b[j].xyah);
float dist = static_cast<float>(cv::Mahalanobis(x, means[i], icovariances[i]));
distsi[j] = dist * dist;
}
}
return dists;
}
// Intersection area of two ltrb boxes; 0 when they do not overlap.
static inline float calc_inter_area(const cv::Vec4f &a, const cv::Vec4f &b)
{
    const float left = std::max(a[0], b[0]);
    const float top = std::max(a[1], b[1]);
    const float right = std::min(a[2], b[2]);
    const float bottom = std::min(a[3], b[3]);
    if (right < left || bottom < top)
        return 0.f;
    return (right - left) * (bottom - top);
}
// 1 - IoU matrix between the ltrb boxes of two trajectory pools.
cv::Mat iou_distance(const TrajectoryPool &a, const TrajectoryPool &b)
{
    std::vector<float> area_a(a.size());
    for (size_t i = 0; i < a.size(); ++i)
        area_a[i] = (a[i].ltrb[2] - a[i].ltrb[0]) * (a[i].ltrb[3] - a[i].ltrb[1]);
    std::vector<float> area_b(b.size());
    for (size_t j = 0; j < b.size(); ++j)
        area_b[j] = (b[j].ltrb[2] - b[j].ltrb[0]) * (b[j].ltrb[3] - b[j].ltrb[1]);
    cv::Mat dists(a.size(), b.size(), CV_32F);
    for (size_t i = 0; i < a.size(); ++i)
    {
        float *row = dists.ptr<float>(i);
        for (size_t j = 0; j < b.size(); ++j)
        {
            const float inter = calc_inter_area(a[i].ltrb, b[j].ltrb);
            row[j] = 1.f - inter / (area_a[i] + area_b[j] - inter);
        }
    }
    return dists;
}
// 1 - IoU matrix, pointer-pool overload.
cv::Mat iou_distance(const TrajectoryPtrPool &a, const TrajectoryPtrPool &b)
{
    std::vector<float> area_a(a.size());
    for (size_t i = 0; i < a.size(); ++i)
        area_a[i] = (a[i]->ltrb[2] - a[i]->ltrb[0]) * (a[i]->ltrb[3] - a[i]->ltrb[1]);
    std::vector<float> area_b(b.size());
    for (size_t j = 0; j < b.size(); ++j)
        area_b[j] = (b[j]->ltrb[2] - b[j]->ltrb[0]) * (b[j]->ltrb[3] - b[j]->ltrb[1]);
    cv::Mat dists(a.size(), b.size(), CV_32F);
    for (size_t i = 0; i < a.size(); ++i)
    {
        float *row = dists.ptr<float>(i);
        for (size_t j = 0; j < b.size(); ++j)
        {
            const float inter = calc_inter_area(a[i]->ltrb, b[j]->ltrb);
            row[j] = 1.f - inter / (area_a[i] + area_b[j] - inter);
        }
    }
    return dists;
}
// 1 - IoU matrix, mixed pointer/value overload (tracks vs. candidates).
cv::Mat iou_distance(const TrajectoryPtrPool &a, const TrajectoryPool &b)
{
    std::vector<float> area_a(a.size());
    for (size_t i = 0; i < a.size(); ++i)
        area_a[i] = (a[i]->ltrb[2] - a[i]->ltrb[0]) * (a[i]->ltrb[3] - a[i]->ltrb[1]);
    std::vector<float> area_b(b.size());
    for (size_t j = 0; j < b.size(); ++j)
        area_b[j] = (b[j].ltrb[2] - b[j].ltrb[0]) * (b[j].ltrb[3] - b[j].ltrb[1]);
    cv::Mat dists(a.size(), b.size(), CV_32F);
    for (size_t i = 0; i < a.size(); ++i)
    {
        float *row = dists.ptr<float>(i);
        for (size_t j = 0; j < b.size(); ++j)
        {
            const float inter = calc_inter_area(a[i]->ltrb, b[j].ltrb);
            row[j] = 1.f - inter / (area_a[i] + area_b[j] - inter);
        }
    }
    return dists;
}
} // namespace PaddleDetection
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/utils.h"
namespace PaddleDetection {
void nms(std::vector<ObjectResult> &input_boxes, float nms_threshold) {
std::sort(input_boxes.begin(),
input_boxes.end(),
[](ObjectResult a, ObjectResult b) { return a.confidence > b.confidence; });
std::vector<float> vArea(input_boxes.size());
for (int i = 0; i < int(input_boxes.size()); ++i) {
vArea[i] = (input_boxes.at(i).rect[2] - input_boxes.at(i).rect[0] + 1)
* (input_boxes.at(i).rect[3] - input_boxes.at(i).rect[1] + 1);
}
for (int i = 0; i < int(input_boxes.size()); ++i) {
for (int j = i + 1; j < int(input_boxes.size());) {
float xx1 = (std::max)(input_boxes[i].rect[0], input_boxes[j].rect[0]);
float yy1 = (std::max)(input_boxes[i].rect[1], input_boxes[j].rect[1]);
float xx2 = (std::min)(input_boxes[i].rect[2], input_boxes[j].rect[2]);
float yy2 = (std::min)(input_boxes[i].rect[3], input_boxes[j].rect[3]);
float w = (std::max)(float(0), xx2 - xx1 + 1);
float h = (std::max)(float(0), yy2 - yy1 + 1);
float inter = w * h;
float ovr = inter / (vArea[i] + vArea[j] - inter);
if (ovr >= nms_threshold) {
input_boxes.erase(input_boxes.begin() + j);
vArea.erase(vArea.begin() + j);
}
else {
j++;
}
}
}
}
} // namespace PaddleDetection
# Export ONNX Model
## Download pretrain paddle models
* [ppyoloe-s](https://paddledet.bj.bcebos.com/models/ppyoloe_crn_s_300e_coco.pdparams)
* [ppyoloe-m](https://paddledet.bj.bcebos.com/models/ppyoloe_crn_m_300e_coco.pdparams)
* [ppyoloe-l](https://paddledet.bj.bcebos.com/models/ppyoloe_crn_l_300e_coco.pdparams)
* [ppyoloe-x](https://paddledet.bj.bcebos.com/models/ppyoloe_crn_x_300e_coco.pdparams)
* [ppyoloe-s-400e](https://paddledet.bj.bcebos.com/models/ppyoloe_crn_s_400e_coco.pdparams)
## Export paddle model for deploying
```shell
python ./tools/export_model.py \
-c configs/ppyoloe/ppyoloe_crn_s_300e_coco.yml \
-o weights=ppyoloe_crn_s_300e_coco.pdparams \
trt=True \
exclude_nms=True \
TestReader.inputs_def.image_shape=[3,640,640] \
--output_dir ./
# if you want to try ppyoloe-s-400e model
python ./tools/export_model.py \
-c configs/ppyoloe/ppyoloe_crn_s_400e_coco.yml \
-o weights=ppyoloe_crn_s_400e_coco.pdparams \
trt=True \
exclude_nms=True \
TestReader.inputs_def.image_shape=[3,640,640] \
--output_dir ./
```
## Check requirements
```shell
pip install onnx>=1.10.0
pip install paddle2onnx
pip install onnx-simplifier
pip install onnx-graphsurgeon --index-url https://pypi.ngc.nvidia.com
# if use cuda-python infer, please install it
pip install cuda-python
# if use cupy infer, please install it
pip install cupy-cuda117 # cuda110-cuda117 are all available
```
## Export script
```shell
python ./deploy/end2end_ppyoloe/end2end.py \
--model-dir ppyoloe_crn_s_300e_coco \
--save-file ppyoloe_crn_s_300e_coco.onnx \
--opset 11 \
--batch-size 1 \
--topk-all 100 \
--iou-thres 0.6 \
--conf-thres 0.4
# if you want to try ppyoloe-s-400e model
python ./deploy/end2end_ppyoloe/end2end.py \
--model-dir ppyoloe_crn_s_400e_coco \
--save-file ppyoloe_crn_s_400e_coco.onnx \
--opset 11 \
--batch-size 1 \
--topk-all 100 \
--iou-thres 0.6 \
--conf-thres 0.4
```
#### Description of all arguments
- `--model-dir` : the path of ppyoloe export dir.
- `--save-file` : the path of export onnx.
- `--opset` : onnx opset version.
- `--img-size` : image size for exporting ppyoloe.
- `--batch-size` : batch size for exporting ppyoloe.
- `--topk-all` : topk objects for every image.
- `--iou-thres` : iou threshold for NMS algorithm.
- `--conf-thres` : confidence threshold for NMS algorithm.
### TensorRT backend (TensorRT version>= 8.0.0)
#### TensorRT engine export
``` shell
/path/to/trtexec \
--onnx=ppyoloe_crn_s_300e_coco.onnx \
--saveEngine=ppyoloe_crn_s_300e_coco.engine \
--fp16 # if export TensorRT fp16 model
# if you want to try ppyoloe-s-400e model
/path/to/trtexec \
--onnx=ppyoloe_crn_s_400e_coco.onnx \
--saveEngine=ppyoloe_crn_s_400e_coco.engine \
--fp16 # if export TensorRT fp16 model
```
#### TensorRT image infer
``` shell
# cuda-python infer script
python ./deploy/end2end_ppyoloe/cuda-python.py ppyoloe_crn_s_300e_coco.engine
# cupy infer script
python ./deploy/end2end_ppyoloe/cupy-python.py ppyoloe_crn_s_300e_coco.engine
# if you want to try ppyoloe-s-400e model
python ./deploy/end2end_ppyoloe/cuda-python.py ppyoloe_crn_s_400e_coco.engine
# or cupy infer script
python ./deploy/end2end_ppyoloe/cupy-python.py ppyoloe_crn_s_400e_coco.engine
```
\ No newline at end of file
import sys
import requests
import cv2
import random
import time
import numpy as np
import tensorrt as trt
from cuda import cudart
from pathlib import Path
from collections import OrderedDict, namedtuple
def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleup=True, stride=32):
    """Resize `im` to fit inside `new_shape` and pad the borders with `color`.

    Returns the padded image, the scale ratio applied, and the per-side
    (dw, dh) padding. With `auto=True` the padding only grows the image up to
    the next stride multiple (minimum rectangle).
    """
    height, width = im.shape[:2]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    # Scale factor from the current size to the target size (new / old).
    ratio = min(new_shape[0] / height, new_shape[1] / width)
    if not scaleup:
        # Only shrink, never enlarge (gives better val mAP).
        ratio = min(ratio, 1.0)
    unpadded = (int(round(width * ratio)), int(round(height * ratio)))
    pad_w = new_shape[1] - unpadded[0]
    pad_h = new_shape[0] - unpadded[1]
    if auto:
        # Minimum rectangle: pad only to the next stride multiple.
        pad_w, pad_h = np.mod(pad_w, stride), np.mod(pad_h, stride)
    # Split the padding evenly between the two sides.
    pad_w /= 2
    pad_h /= 2
    if (width, height) != unpadded:
        im = cv2.resize(im, unpadded, interpolation=cv2.INTER_LINEAR)
    top, bottom = int(round(pad_h - 0.1)), int(round(pad_h + 0.1))
    left, right = int(round(pad_w - 0.1)), int(round(pad_w + 0.1))
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    return im, ratio, (pad_w, pad_h)
# ---- CLI argument: path to a serialized TensorRT engine ----
w = Path(sys.argv[1])
assert w.exists() and w.suffix in ('.engine', '.plan'), 'Wrong engine path'
# COCO class names, indexed by the class id the engine's NMS plugin outputs.
names = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
         'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
         'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
         'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
         'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
         'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
         'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
         'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
         'hair drier', 'toothbrush']
# One random color per class for drawing boxes.
colors = {name: [random.randint(0, 255) for _ in range(3)] for i, name in enumerate(names)}
# Demo image fetched over HTTP and decoded to a BGR uint8 array.
url = 'https://oneflow-static.oss-cn-beijing.aliyuncs.com/tripleMu/image1.jpg'
file = requests.get(url)
img = cv2.imdecode(np.frombuffer(file.content, np.uint8), 1)
# CUDA stream shared by all async allocations and copies below.
_, stream = cudart.cudaStreamCreate()
# ImageNet mean/std reshaped for NCHW broadcasting.
mean = np.array([0.485, 0.456, 0.406], dtype=np.float32).reshape(1, 3, 1, 1)
std = np.array([0.229, 0.224, 0.225], dtype=np.float32).reshape(1, 3, 1, 1)
# Infer TensorRT Engine
Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
logger = trt.Logger(trt.Logger.ERROR)
trt.init_libnvinfer_plugins(logger, namespace="")
# Deserialize the engine from disk.
with open(w, 'rb') as f, trt.Runtime(logger) as runtime:
    model = runtime.deserialize_cuda_engine(f.read())
# For every binding keep a host-side numpy buffer plus a device allocation.
bindings = OrderedDict()
fp16 = False  # default updated below
for index in range(model.num_bindings):
    name = model.get_binding_name(index)
    dtype = trt.nptype(model.get_binding_dtype(index))
    shape = tuple(model.get_binding_shape(index))
    data = np.empty(shape, dtype=np.dtype(dtype))
    _, data_ptr = cudart.cudaMallocAsync(data.nbytes, stream)
    bindings[name] = Binding(name, dtype, shape, data, data_ptr)
    if model.binding_is_input(index) and dtype == np.float16:
        fp16 = True
binding_addrs = OrderedDict((n, d.ptr) for n, d in bindings.items())
context = model.create_execution_context()
# Preprocess: letterbox to 640x640, BGR->RGB, HWC->NCHW, normalize.
image = img.copy()
image, ratio, dwdh = letterbox(image, auto=False)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image_copy = image.copy()  # kept unnormalized for drawing the results later
image = image.transpose((2, 0, 1))
image = np.expand_dims(image, 0)
image = np.ascontiguousarray(image)
im = image.astype(np.float32)
im /= 255
im -= mean
im /= std
# Upload the preprocessed tensor to the device.
_, image_ptr = cudart.cudaMallocAsync(im.nbytes, stream)
cudart.cudaMemcpyAsync(image_ptr, im.ctypes.data, im.nbytes,
                       cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
# warmup for 10 times
# NOTE(review): each warmup iteration allocates a fresh device buffer that is
# never freed, and overwriting binding_addrs['image'] drops the original
# allocation -- acceptable for a one-shot demo, leaks if this loop is reused.
for _ in range(10):
    tmp = np.random.randn(1, 3, 640, 640).astype(np.float32)
    _, tmp_ptr = cudart.cudaMallocAsync(tmp.nbytes, stream)
    binding_addrs['image'] = tmp_ptr
    context.execute_v2(list(binding_addrs.values()))
# Timed inference on the real image.
start = time.perf_counter()
binding_addrs['image'] = image_ptr
context.execute_v2(list(binding_addrs.values()))
print(f'Cost {(time.perf_counter() - start) * 1000}ms')
# Host buffers that receive the EfficientNMS outputs.
nums = bindings['num_dets'].data
boxes = bindings['det_boxes'].data
scores = bindings['det_scores'].data
classes = bindings['det_classes'].data
cudart.cudaMemcpyAsync(nums.ctypes.data,
                       bindings['num_dets'].ptr,
                       nums.nbytes,
                       cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                       stream)
cudart.cudaMemcpyAsync(boxes.ctypes.data,
                       bindings['det_boxes'].ptr,
                       boxes.nbytes,
                       cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                       stream)
cudart.cudaMemcpyAsync(scores.ctypes.data,
                       bindings['det_scores'].ptr,
                       scores.nbytes,
                       cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                       stream)
# NOTE(review): 'classes.data.nbytes' reads nbytes off the memoryview; for a
# contiguous array it equals 'classes.nbytes', which the sibling copies use
# directly -- worth unifying.
cudart.cudaMemcpyAsync(classes.ctypes.data,
                       bindings['det_classes'].ptr,
                       classes.data.nbytes,
                       cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                       stream)
# Wait for all copies to land, then release the stream and device buffers.
cudart.cudaStreamSynchronize(stream)
cudart.cudaStreamDestroy(stream)
for i in binding_addrs.values():
    cudart.cudaFree(i)
# Draw the detections kept by the in-engine NMS on the letterboxed RGB image.
num = int(nums[0][0])
box_img = boxes[0, :num].round().astype(np.int32)
score_img = scores[0, :num]
clss_img = classes[0, :num]
for i, (box, score, clss) in enumerate(zip(box_img, score_img, clss_img)):
    name = names[int(clss)]
    color = colors[name]
    cv2.rectangle(image_copy, box[:2].tolist(), box[2:].tolist(), color, 2)
    cv2.putText(image_copy, name, (int(box[0]), int(box[1]) - 2), cv2.FONT_HERSHEY_SIMPLEX,
                0.75, [225, 255, 255], thickness=2)
cv2.imshow('Result', cv2.cvtColor(image_copy, cv2.COLOR_RGB2BGR))
cv2.waitKey(0)
import sys
import requests
import cv2
import random
import time
import numpy as np
import cupy as cp
import tensorrt as trt
from PIL import Image
from collections import OrderedDict, namedtuple
from pathlib import Path
def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleup=True, stride=32):
    """Resize `im` to fit inside `new_shape` and pad the borders with `color`.

    Returns the padded image, the scale ratio applied, and the per-side
    (dw, dh) padding. With `auto=True` the padding only grows the image up to
    the next stride multiple (minimum rectangle).
    """
    height, width = im.shape[:2]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    # Scale factor from the current size to the target size (new / old).
    ratio = min(new_shape[0] / height, new_shape[1] / width)
    if not scaleup:
        # Only shrink, never enlarge (gives better val mAP).
        ratio = min(ratio, 1.0)
    unpadded = (int(round(width * ratio)), int(round(height * ratio)))
    pad_w = new_shape[1] - unpadded[0]
    pad_h = new_shape[0] - unpadded[1]
    if auto:
        # Minimum rectangle: pad only to the next stride multiple.
        pad_w, pad_h = np.mod(pad_w, stride), np.mod(pad_h, stride)
    # Split the padding evenly between the two sides.
    pad_w /= 2
    pad_h /= 2
    if (width, height) != unpadded:
        im = cv2.resize(im, unpadded, interpolation=cv2.INTER_LINEAR)
    top, bottom = int(round(pad_h - 0.1)), int(round(pad_h + 0.1))
    left, right = int(round(pad_w - 0.1)), int(round(pad_w + 0.1))
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    return im, ratio, (pad_w, pad_h)
# COCO class names, indexed by the class id the engine's NMS plugin outputs.
names = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
         'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
         'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
         'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
         'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
         'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
         'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
         'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
         'hair drier', 'toothbrush']
# One random color per class for drawing boxes.
colors = {name: [random.randint(0, 255) for _ in range(3)] for i, name in enumerate(names)}
# Demo image fetched over HTTP and decoded to a BGR uint8 array.
url = 'https://oneflow-static.oss-cn-beijing.aliyuncs.com/tripleMu/image1.jpg'
file = requests.get(url)
img = cv2.imdecode(np.frombuffer(file.content, np.uint8), 1)
# CLI argument: path to a serialized TensorRT engine.
w = Path(sys.argv[1])
assert w.exists() and w.suffix in ('.engine', '.plan'), 'Wrong engine path'
# ImageNet mean/std reshaped for NCHW broadcasting, then moved to the device.
mean = np.array([0.485, 0.456, 0.406], dtype=np.float32).reshape(1, 3, 1, 1)
std = np.array([0.229, 0.224, 0.225], dtype=np.float32).reshape(1, 3, 1, 1)
mean = cp.asarray(mean)
std = cp.asarray(std)
# Infer TensorRT Engine
Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
logger = trt.Logger(trt.Logger.INFO)
trt.init_libnvinfer_plugins(logger, namespace="")
# Deserialize the engine from disk.
with open(w, 'rb') as f, trt.Runtime(logger) as runtime:
    model = runtime.deserialize_cuda_engine(f.read())
# One cupy device array per binding; cupy owns (and later frees) the memory.
bindings = OrderedDict()
fp16 = False  # default updated below
for index in range(model.num_bindings):
    name = model.get_binding_name(index)
    dtype = trt.nptype(model.get_binding_dtype(index))
    shape = tuple(model.get_binding_shape(index))
    data = cp.empty(shape, dtype=cp.dtype(dtype))
    bindings[name] = Binding(name, dtype, shape, data, int(data.data.ptr))
    if model.binding_is_input(index) and dtype == np.float16:
        fp16 = True
binding_addrs = OrderedDict((n, d.ptr) for n, d in bindings.items())
context = model.create_execution_context()
# Preprocess: letterbox to 640x640, BGR->RGB, HWC->NCHW, normalize.
image = img.copy()
image, ratio, dwdh = letterbox(image, auto=False)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image_copy = image.copy()  # kept unnormalized for drawing the results later
image = image.transpose((2, 0, 1))
image = np.expand_dims(image, 0)
image = np.ascontiguousarray(image)
im = cp.asarray(image)  # host -> device upload
im = im.astype(cp.float32)
im /= 255
im -= mean
im /= std
# warmup for 10 times
for _ in range(10):
    tmp = cp.random.randn(1, 3, 640, 640).astype(cp.float32)
    binding_addrs['image'] = int(tmp.data.ptr)
    context.execute_v2(list(binding_addrs.values()))
# Timed inference on the real image.
start = time.perf_counter()
binding_addrs['image'] = int(im.data.ptr)
context.execute_v2(list(binding_addrs.values()))
print(f'Cost {(time.perf_counter() - start) * 1000}ms')
# Outputs stay on the device as cupy arrays; the indexing below transfers the
# needed values to the host.
nums = bindings['num_dets'].data
boxes = bindings['det_boxes'].data
scores = bindings['det_scores'].data
classes = bindings['det_classes'].data
# Draw the detections kept by the in-engine NMS on the letterboxed RGB image.
num = int(nums[0][0])
box_img = boxes[0, :num].round().astype(cp.int32)
score_img = scores[0, :num]
clss_img = classes[0, :num]
for i, (box, score, clss) in enumerate(zip(box_img, score_img, clss_img)):
    name = names[int(clss)]
    color = colors[name]
    cv2.rectangle(image_copy, box[:2].tolist(), box[2:].tolist(), color, 2)
    cv2.putText(image_copy, name, (int(box[0]), int(box[1]) - 2), cv2.FONT_HERSHEY_SIMPLEX,
                0.75, [225, 255, 255], thickness=2)
cv2.imshow('Result', cv2.cvtColor(image_copy, cv2.COLOR_RGB2BGR))
cv2.waitKey(0)
import argparse
import onnx
import onnx_graphsurgeon as gs
import numpy as np
from pathlib import Path
from paddle2onnx.legacy.command import program2onnx
from collections import OrderedDict
def main(opt):
    """Convert an exported PaddleDetection PP-YOLOE model to ONNX and graft
    an EfficientNMS_TRT node onto the graph so NMS runs inside the engine.

    Args:
        opt: argparse.Namespace from parse_opt(); uses model_dir, save_file,
            batch_size, img_size, opset, topk_all, iou_thres, conf_thres.
    """
    model_dir = Path(opt.model_dir)
    save_file = Path(opt.save_file)
    assert model_dir.exists() and model_dir.is_dir()
    # Normalize the save path to a concrete .onnx file.
    if save_file.is_dir():
        save_file = (save_file / model_dir.stem).with_suffix('.onnx')
    elif save_file.is_file() and save_file.suffix != '.onnx':
        save_file = save_file.with_suffix('.onnx')
    # The exported Paddle graph takes a fixed image plus a scale_factor input.
    input_shape_dict = {'image': [opt.batch_size, 3, *opt.img_size],
                        'scale_factor': [opt.batch_size, 2]}
    program2onnx(str(model_dir), str(save_file),
                 'model.pdmodel', 'model.pdiparams',
                 opt.opset, input_shape_dict=input_shape_dict)
    onnx_model = onnx.load(save_file)
    # Simplification is best-effort; the unsimplified model is still usable.
    try:
        import onnxsim
        onnx_model, check = onnxsim.simplify(onnx_model)
        assert check, 'assert check failed'
    except Exception as e:
        print(f'Simplifier failure: {e}')
    onnx.checker.check_model(onnx_model)
    graph = gs.import_onnx(onnx_model)
    graph.fold_constants()
    graph.cleanup().toposort()
    # Pattern-match the graph: the decoded boxes come from a Mul feeding a
    # Div; the raw scores come from a Concat feeding Reshape -> ReduceSum.
    mul = concat = None
    for node in graph.nodes:
        if node.op == 'Div' and node.i(0).op == 'Mul':
            mul = node.i(0)
        if node.op == 'Concat' and node.o().op == 'Reshape' and node.o().o().op == 'ReduceSum':
            concat = node
    # Fail with a clear message instead of an AttributeError on None when the
    # export layout changes and the pattern match finds nothing.
    assert mul is not None and concat is not None, \
        'Could not locate the box/score nodes in the exported graph'
    assert mul.outputs[0].shape[1] == concat.outputs[0].shape[2], 'Something wrong in outputs shape'
    anchors = mul.outputs[0].shape[1]
    classes = concat.outputs[0].shape[1]
    # EfficientNMS_TRT expects scores shaped [batch, anchors, classes].
    scores = gs.Variable(name='scores', shape=[opt.batch_size, anchors, classes], dtype=np.float32)
    graph.layer(op='Transpose', name='lastTranspose',
                inputs=[concat.outputs[0]],
                outputs=[scores],
                attrs=OrderedDict(perm=[0, 2, 1]))
    # Drop the scale_factor input; only the image input remains.
    graph.inputs = [graph.inputs[0]]
    attrs = OrderedDict(
        plugin_version="1",
        background_class=-1,
        max_output_boxes=opt.topk_all,
        score_threshold=opt.conf_thres,
        iou_threshold=opt.iou_thres,
        score_activation=False,
        box_coding=0, )
    outputs = [gs.Variable("num_dets", np.int32, [opt.batch_size, 1]),
               gs.Variable("det_boxes", np.float32, [opt.batch_size, opt.topk_all, 4]),
               gs.Variable("det_scores", np.float32, [opt.batch_size, opt.topk_all]),
               gs.Variable("det_classes", np.int32, [opt.batch_size, opt.topk_all])]
    graph.layer(op='EfficientNMS_TRT', name="batched_nms",
                inputs=[mul.outputs[0], scores],
                outputs=outputs,
                attrs=attrs)
    graph.outputs = outputs
    graph.cleanup().toposort()
    onnx.save(gs.export_onnx(graph), save_file)
def parse_opt():
    """Build and parse the command-line options for the export script."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-dir', type=str, default=None,
                        help='paddle static model')
    parser.add_argument('--save-file', type=str, default=None,
                        help='onnx model save path')
    parser.add_argument('--opset', type=int, default=11, help='opset version')
    parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='image size')
    parser.add_argument('--batch-size', type=int, default=1, help='batch size')
    parser.add_argument('--topk-all', type=int, default=100, help='topk objects for every images')
    parser.add_argument('--iou-thres', type=float, default=0.45, help='iou threshold for NMS')
    parser.add_argument('--conf-thres', type=float, default=0.25, help='conf threshold for NMS')
    opt = parser.parse_args()
    if len(opt.img_size) == 1:
        # A single value means a square input: duplicate it to (h, w).
        opt.img_size *= 2
    return opt
# Script entry point: parse the CLI options and run the ONNX export.
if __name__ == '__main__':
    opt = parse_opt()
    main(opt)
# Build the PaddleDetection Paddle-Lite demo executable (`main`) for Android.
# Select the ABI with `make ARM_ABI=arm8` (default) or `make ARM_ABI=arm7`.
ARM_ABI = arm8#[arm7/arm8]
export ARM_ABI
# Map the ABI name to the Android platform directory used by the prebuilts.
ifeq ($(ARM_ABI), arm8)
ARM_PLAT=arm64-v8a
else
ARM_PLAT=armeabi-v7a
endif
${info ARM_ABI: ${ARM_ABI}}
${info ARM_PLAT: ${ARM_PLAT}; option[arm7/arm8]}
# Shared toolchain variables (CC, CXX_FLAGS, SYSROOT_*, ...) come from here.
include ../Makefile.def
LITE_ROOT=../../../
${info LITE_ROOT: $(abspath ${LITE_ROOT})}
THIRD_PARTY_DIR=third_party
${info THIRD_PARTY_DIR: $(abspath ${THIRD_PARTY_DIR})}
# Static OpenCV archives (plus its bundled 3rdparty deps) for the chosen ABI.
OPENCV_VERSION=opencv4.1.0
OPENCV_LIBS = ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/libs/libopencv_imgcodecs.a \
              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/libs/libopencv_imgproc.a \
              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/libs/libopencv_core.a \
              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/libtegra_hal.a \
              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/liblibjpeg-turbo.a \
              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/liblibwebp.a \
              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/liblibpng.a \
              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/liblibjasper.a \
              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/liblibtiff.a \
              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/libIlmImf.a \
              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/libtbb.a \
              ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/libcpufeatures.a
# Link against the light (shared) Paddle-Lite API by default.
LITE_LIBS = -L${LITE_ROOT}/cxx/lib/ -lpaddle_light_api_shared
###############################################################
# How to use one of static library:                           #
#  `libpaddle_api_full_bundled.a`                             #
#  `libpaddle_api_light_bundled.a`                            #
###############################################################
# Note: default use lite's shared library.                    #
###############################################################
# 1. Comment above line using `libpaddle_light_api_shared.so`
# 2. Undo comment below line using `libpaddle_api_light_bundled.a`
# LITE_LIBS = ${LITE_ROOT}/cxx/lib/libpaddle_api_light_bundled.a
CXX_LIBS = $(LITE_LIBS) ${OPENCV_LIBS} $(SYSTEM_LIBS)
# All C++ sources under src/ compile to objects in the current directory.
LOCAL_DIRSRCS=$(wildcard src/*.cc)
LOCAL_SRCS=$(notdir $(LOCAL_DIRSRCS))
LOCAL_OBJS=$(patsubst %.cpp, %.o, $(patsubst %.cc, %.o, $(LOCAL_SRCS)))
# Objects built from the jsoncpp sources fetched by fetch_json_code.
JSON_OBJS = json_reader.o json_value.o json_writer.o
# Final executable: project objects + jsoncpp, linked with Lite and OpenCV.
main: $(LOCAL_OBJS) $(JSON_OBJS) fetch_opencv
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) $(LOCAL_OBJS) $(JSON_OBJS) -o main $(CXX_LIBS) $(LDFLAGS)
# Download and unpack the prebuilt OpenCV archive on first use.
fetch_opencv:
	@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
	@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
	(echo "fetch opencv libs" && \
	wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
	@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
	tar -zxf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
# Download and unpack the jsoncpp sources on first use.
fetch_json_code:
	@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
	@ test -e ${THIRD_PARTY_DIR}/jsoncpp_code.tar.gz || \
	(echo "fetch jsoncpp_code.tar.gz" && \
	wget -P ${THIRD_PARTY_DIR} https://bj.bcebos.com/v1/paddledet/deploy/jsoncpp_code.tar.gz )
	@ test -d ${THIRD_PARTY_DIR}/jsoncpp_code || \
	tar -zxf ${THIRD_PARTY_DIR}/jsoncpp_code.tar.gz -C ${THIRD_PARTY_DIR}
# Include paths for project headers, OpenCV, jsoncpp and the Lite API.
LOCAL_INCLUDES = -I./ -Iinclude
OPENCV_INCLUDE = -I${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/include
JSON_INCLUDE = -I${THIRD_PARTY_DIR}/jsoncpp_code/include
CXX_INCLUDES = ${LOCAL_INCLUDES} ${INCLUDES} ${OPENCV_INCLUDE} ${JSON_INCLUDE} -I$(LITE_ROOT)/cxx/include
# Pattern rules: compile project sources and the fetched jsoncpp sources.
$(LOCAL_OBJS): %.o: src/%.cc fetch_opencv fetch_json_code
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -c $< -o $@
$(JSON_OBJS): %.o: ${THIRD_PARTY_DIR}/jsoncpp_code/%.cpp fetch_json_code
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -c $< -o $@
.PHONY: clean fetch_opencv fetch_json_code
clean:
	rm -rf $(LOCAL_OBJS) $(JSON_OBJS)
	rm -f main
# Paddle-Lite端侧部署
[Paddle-Lite](https://github.com/PaddlePaddle/Paddle-Lite)是飞桨轻量化推理引擎,为手机、IOT端提供高效推理能力,并广泛整合跨平台硬件,为端侧部署及应用落地问题提供轻量化的部署方案。
本目录提供了PaddleDetection中主要模型在Paddle-Lite上的端到端部署代码。用户可以通过本教程了解如何使用该部分代码,基于Paddle-Lite实现在移动端部署PaddleDetection模型。
## 1. 准备环境
### 运行准备
- 电脑(编译Paddle Lite)
- 安卓手机(armv7或armv8)
### 1.1 准备交叉编译环境
交叉编译环境用于编译 Paddle Lite 和 PaddleDetection 的C++ demo。
支持多种开发环境,不同开发环境的编译流程请参考对应文档,请确保安装完成Java jdk、Android NDK(R17 < version < R21,其他版本以上未做测试)。
设置NDK_ROOT命令:
```shell
export NDK_ROOT=[YOUR_NDK_PATH]/android-ndk-r17c
```
1. [Docker](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_env.html#docker)
2. [Linux](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_env.html#linux)
3. [MAC OS](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_env.html#mac-os)
### 1.2 准备预测库
预测库有两种获取方式:
1. [**建议**]直接从[Paddle-Lite Release](https://github.com/PaddlePaddle/Paddle-Lite/releases)中, 根据设备类型与架构选择对应的预编译库,请注意使用模型FP32/16版本需要与库相对应,库文件的说明请参考[官方文档](https://paddle-lite.readthedocs.io/zh/latest/quick_start/release_lib.html#android-toolchain-gcc)
**注意**:(1) 如果是从 Paddle-Lite [官方文档](https://paddle-lite.readthedocs.io/zh/latest/quick_start/release_lib.html#android-toolchain-gcc)下载的预测库,注意选择`with_extra=ON,with_cv=ON`的下载链接。2. 目前只提供Android端demo,IOS端demo可以参考[Paddle-Lite IOS demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-ios-demo)
(2)PP-PicoDet部署需要Paddle Lite 2.11以上版本。
2. 编译Paddle-Lite得到预测库,Paddle-Lite的编译方式如下(Lite库在不断更新,如若下列命令无效,请以Lite官方repo为主):
```shell
git clone https://github.com/PaddlePaddle/Paddle-Lite.git
cd Paddle-Lite
# 如果使用编译方式,建议使用develop分支编译预测库
git checkout develop
# FP32
./lite/tools/build_android.sh --arch=armv8 --toolchain=clang --with_cv=ON --with_extra=ON
# FP16
./lite/tools/build_android.sh --arch=armv8 --toolchain=clang --with_cv=ON --with_extra=ON --with_arm82_fp16=ON
```
**注意**:编译Paddle-Lite获得预测库时,需要打开`--with_cv=ON --with_extra=ON`两个选项,`--arch`表示`arm`版本,这里指定为armv8,更多编译命令介绍请参考[链接](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_options.html)
直接下载预测库并解压后,可以得到`inference_lite_lib.android.armv8.clang.c++_static.with_extra.with_cv/`文件夹,通过编译Paddle-Lite得到的预测库位于`Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/`文件夹下。
预测库的文件目录如下:
```
inference_lite_lib.android.armv8/
|-- cxx C++ 预测库和头文件
| |-- include C++ 头文件
| | |-- paddle_api.h
| | |-- paddle_image_preprocess.h
| | |-- paddle_lite_factory_helper.h
| | |-- paddle_place.h
| | |-- paddle_use_kernels.h
| | |-- paddle_use_ops.h
| | `-- paddle_use_passes.h
| `-- lib C++预测库
| |-- libpaddle_api_light_bundled.a C++静态库
| `-- libpaddle_light_api_shared.so C++动态库
|-- java Java预测库
| |-- jar
| | `-- PaddlePredictor.jar
| |-- so
| | `-- libpaddle_lite_jni.so
| `-- src
|-- demo C++和Java示例代码
| |-- cxx C++ 预测库demo, 请将本文档目录下的PaddleDetection相关代码拷贝至该文件夹下执行交叉编译。
| `-- java Java 预测库demo
```
## 2 开始运行
### 2.1 模型转换
Paddle-Lite 提供了多种策略来自动优化原始的模型,其中包括量化、子图融合、混合调度、Kernel优选等方法,使用Paddle-Lite的`opt`工具可以自动对inference模型进行优化,并转换为推理所使用的文件格式。目前支持两种优化方式,优化后的模型更轻量,模型运行速度更快。
**注意**:如果已经准备好了 `.nb` 结尾的模型文件,可以跳过此步骤。
#### 2.1.1 安装paddle_lite_opt工具
安装`paddle_lite_opt`工具有如下两种方法, **请注意**,无论使用哪种方法,请尽量保证`paddle_lite_opt`工具和预测库的版本一致,以避免未知的Bug。
1. [**建议**]pip安装paddlelite并进行转换
```shell
pip install paddlelite
```
2. 源码编译Paddle-Lite生成`paddle_lite_opt`工具
模型优化需要Paddle-Lite的`opt`可执行文件,可以通过编译Paddle-Lite源码获得,编译步骤如下:
```shell
# 如果准备环境时已经clone了Paddle-Lite,则不用重新clone Paddle-Lite
git clone https://github.com/PaddlePaddle/Paddle-Lite.git
cd Paddle-Lite
git checkout develop
# 启动编译
./lite/tools/build.sh build_optimize_tool
```
编译完成后,`opt`文件位于`build.opt/lite/api/`下,可通过如下方式查看`opt`的运行选项和使用方式;
```shell
cd build.opt/lite/api/
./opt
```
`opt`的使用方式与参数与上面的`paddle_lite_opt`完全一致。
之后使用`paddle_lite_opt`工具可以进行inference模型的转换。`paddle_lite_opt`的部分参数如下:
|选项|说明|
|-|-|
|--model_file|待优化的PaddlePaddle模型(combined形式)的网络结构文件路径|
|--param_file|待优化的PaddlePaddle模型(combined形式)的权重文件路径|
|--optimize_out_type|输出模型类型,目前支持两种类型:protobuf和naive_buffer,其中naive_buffer是一种更轻量级的序列化/反序列化实现,默认为naive_buffer|
|--optimize_out|优化模型的输出路径|
|--valid_targets|指定模型可执行的backend,默认为arm。目前可支持x86、arm、opencl、npu、xpu,可以同时指定多个backend(以空格分隔),Model Optimize Tool将会自动选择最佳方式。如果需要支持华为NPU(Kirin 810/990 Soc搭载的达芬奇架构NPU),应当设置为npu, arm|
| --enable_fp16| true/false,是否使用fp16进行推理。如果开启,需要使用对应fp16的预测库|
更详细的`paddle_lite_opt`工具使用说明请参考[使用opt转化模型文档](https://paddle-lite.readthedocs.io/zh/latest/user_guides/opt/opt_bin.html)
`--model_file`表示inference模型的model文件地址,`--param_file`表示inference模型的param文件地址;`optimize_out`用于指定输出文件的名称(不需要添加`.nb`的后缀)。直接在命令行中运行`paddle_lite_opt`,也可以查看所有参数及其说明。
#### 2.1.2 转换示例
下面以PaddleDetection中的 `PicoDet` 模型为例,介绍使用`paddle_lite_opt`完成预训练模型到inference模型,再到Paddle-Lite优化模型的转换。
```shell
# 进入PaddleDetection根目录
cd PaddleDetection_root_path
# 将预训练模型导出为inference模型
python tools/export_model.py -c configs/picodet/picodet_s_320_coco.yml \
-o weights=https://paddledet.bj.bcebos.com/models/picodet_s_320_coco.pdparams --output_dir=output_inference
# 将inference模型转化为Paddle-Lite优化模型
# FP32
paddle_lite_opt --valid_targets=arm --model_file=output_inference/picodet_s_320_coco/model.pdmodel --param_file=output_inference/picodet_s_320_coco/model.pdiparams --optimize_out=output_inference/picodet_s_320_coco/model
# FP16
paddle_lite_opt --valid_targets=arm --model_file=output_inference/picodet_s_320_coco/model.pdmodel --param_file=output_inference/picodet_s_320_coco/model.pdiparams --optimize_out=output_inference/picodet_s_320_coco/model --enable_fp16=true
# 将inference模型配置转化为json格式
python deploy/lite/convert_yml_to_json.py output_inference/picodet_s_320_coco/infer_cfg.yml
```
最终在output_inference/picodet_s_320_coco/文件夹下生成`model.nb` 和 `infer_cfg.json` 两个文件。
**注意**:`--optimize_out` 参数为优化后模型的保存路径,无需加后缀`.nb`;`--model_file` 参数为模型结构信息文件的路径,`--param_file` 参数为模型权重信息文件的路径,请注意文件名。
### 2.2 与手机联调
首先需要进行一些准备工作。
1. 准备一台arm8的安卓手机,如果编译的预测库是armv7,则需要arm7的手机,并修改Makefile中`ARM_ABI=arm7`
2. 电脑上安装ADB工具,用于调试。 ADB安装方式如下:
2.1. MAC电脑安装ADB:
```shell
brew cask install android-platform-tools
```
2.2. Linux安装ADB
```shell
sudo apt update
sudo apt install -y wget adb
```
2.3. Window安装ADB
win上安装需要去谷歌的安卓平台下载ADB软件包进行安装:[链接](https://developer.android.com/studio)
3. 手机连接电脑后,开启手机`USB调试`选项,选择`文件传输`模式,在电脑终端中输入:
```shell
adb devices
```
如果有device输出,则表示安装成功,如下所示:
```
List of devices attached
744be294 device
```
4. 编译lite部署代码生成移动端可执行文件
```shell
cd {PadddleDetection_Root}
cd deploy/lite/
inference_lite_path=/{lite prediction library path}/inference_lite_lib.android.armv8.gcc.c++_static.with_extra.with_cv/
mkdir $inference_lite_path/demo/cxx/lite
cp -r Makefile src/ include/ *runtime_config.json $inference_lite_path/demo/cxx/lite
cd $inference_lite_path/demo/cxx/lite
# 执行编译,等待完成后得到可执行文件main
make ARM_ABI=arm8
#如果是arm7,则执行 make ARM_ABI = arm7 (或者在Makefile中修改该项)
```
5. 准备优化后的模型、预测库文件、测试图像。
```shell
mkdir deploy
cp main *runtime_config.json deploy/
cd deploy
mkdir model_det
mkdir model_keypoint
# 将优化后的模型、预测库文件、测试图像放置在预测库中的demo/cxx/detection文件夹下
cp {PadddleDetection_Root}/output_inference/picodet_s_320_coco/model.nb ./model_det/
cp {PadddleDetection_Root}/output_inference/picodet_s_320_coco/infer_cfg.json ./model_det/
# 如果需要关键点模型,则只需操作:
cp {PadddleDetection_Root}/output_inference/hrnet_w32_256x192/model.nb ./model_keypoint/
cp {PadddleDetection_Root}/output_inference/hrnet_w32_256x192/infer_cfg.json ./model_keypoint/
# 将测试图像复制到deploy文件夹中
cp [your_test_img].jpg ./demo.jpg
# 将C++预测动态库so文件复制到deploy文件夹中
cp ../../../cxx/lib/libpaddle_light_api_shared.so ./
```
执行完成后,deploy文件夹下将有如下文件格式:
```
deploy/
|-- model_det/
| |--model.nb 优化后的检测模型文件
| |--infer_cfg.json 检测器模型配置文件
|-- model_keypoint/
| |--model.nb 优化后的关键点模型文件
| |--infer_cfg.json 关键点模型配置文件
|-- main 生成的移动端执行文件
|-- det_runtime_config.json 目标检测执行时参数配置文件
|-- keypoint_runtime_config.json 关键点检测执行时参数配置文件
|-- libpaddle_light_api_shared.so Paddle-Lite库文件
```
**注意:**
* `det_runtime_config.json` 包含了目标检测的超参数,请按需进行修改:
```shell
{
"model_dir_det": "./model_det/", #检测器模型路径
"batch_size_det": 1, #检测预测时batchsize
"threshold_det": 0.5, #检测器输出阈值
"image_file": "demo.jpg", #测试图片
"image_dir": "", #测试图片文件夹
"run_benchmark": true, #性能测试开关
"cpu_threads": 4 #线程数
}
```
* `keypoint_runtime_config.json` 同时包含了目标检测和关键点检测的超参数,支持Top-Down方案的推理流程,请按需进行修改:
```shell
{
"model_dir_det": "./model_det/", #检测模型路径
"batch_size_det": 1, #检测模型预测时batchsize, 存在关键点模型时只能为1
"threshold_det": 0.5, #检测器输出阈值
"model_dir_keypoint": "./model_keypoint/", #关键点模型路径(不使用需为空字符)
"batch_size_keypoint": 8, #关键点预测时batchsize
"threshold_keypoint": 0.5, #关键点输出阈值
"image_file": "demo.jpg", #测试图片
"image_dir": "", #测试图片文件夹
"run_benchmark": true, #性能测试开关
  "cpu_threads": 4, #线程数
  "use_dark_decode": true #是否使用DARK解码关键点坐标
}
```
6. 启动调试,上述步骤完成后就可以使用ADB将文件夹 `deploy/` push到手机上运行,步骤如下:
```shell
# 将上述deploy文件夹push到手机上
adb push deploy /data/local/tmp/
adb shell
cd /data/local/tmp/deploy
export LD_LIBRARY_PATH=/data/local/tmp/deploy:$LD_LIBRARY_PATH
# 修改权限为可执行
chmod 777 main
# 以检测为例,执行程序
./main det_runtime_config.json
```
如果对代码做了修改,则需要重新编译并push到手机上。
运行效果如下:
<div align="center">
<img src="../../docs/images/lite_demo.jpg" width="600">
</div>
## FAQ
Q1:如果想更换模型怎么办,需要重新按照流程走一遍吗?
A1:如果已经走通了上述步骤,更换模型只需要替换 `.nb` 模型文件及其对应模型配置文件`infer_cfg.json`,同时要注意修改下配置文件中的 `.nb` 文件路径以及类别映射文件(如有必要)。
Q2:换一个图测试怎么做?
A2:替换 deploy 下的测试图像为你想要测试的图像,使用 ADB 再次 push 到手机上即可。
import yaml
import json
import sys

# Convert an exported YAML config to JSON, written next to the input as
# <name>.json. Usage: python yaml2json.py path/to/infer_cfg.yml
yamlf = sys.argv[1]
# Accept both common YAML extensions (the original only allowed ".yml").
assert yamlf.endswith((".yml", ".yaml")), "input must be a .yml or .yaml file"
with open(yamlf, 'r') as rf:
    yaml_data = yaml.safe_load(rf)
# Strip whatever extension was used rather than assuming 4 characters.
jsonf = yamlf.rsplit(".", 1)[0] + ".json"
with open(jsonf, 'w') as wf:
    json.dump(yaml_data, wf, indent=4)
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <vector>
#include "json/json.h"
#ifdef _WIN32
#define OS_PATH_SEP "\\"
#else
#define OS_PATH_SEP "/"
#endif
namespace PaddleDetection {
void load_jsonf(std::string jsonfile, Json::Value& jsondata);
// Inference model configuration parser
// Parser for the exported model's configuration file (`infer_cfg.json`).
// load_config() fills the public members below; missing mandatory fields
// are reported on stderr and make it return false.
class ConfigPaser {
 public:
  ConfigPaser() {}

  ~ConfigPaser() {}

  // Reads `<model_dir>/<cfg>.json`. Mandatory fields: arch, draw_threshold,
  // Preprocess, label_list. Optional: NMS, fpn_stride (PicoDet).
  bool load_config(const std::string& model_dir,
                   const std::string& cfg = "infer_cfg") {
    Json::Value config;
    load_jsonf(model_dir + OS_PATH_SEP + cfg + ".json", config);

    // Get model arch : YOLO, SSD, RetinaNet, RCNN, Face, PicoDet, HRNet
    if (config.isMember("arch")) {
      arch_ = config["arch"].as<std::string>();
    } else {
      std::cerr
          << "Please set model arch,"
          << "support value : YOLO, SSD, RetinaNet, RCNN, Face, PicoDet, HRNet."
          << std::endl;
      return false;
    }

    // Get draw_threshold for visualization
    if (config.isMember("draw_threshold")) {
      draw_threshold_ = config["draw_threshold"].as<float>();
    } else {
      std::cerr << "Please set draw_threshold." << std::endl;
      return false;
    }

    // Get Preprocess for preprocessing
    if (config.isMember("Preprocess")) {
      preprocess_info_ = config["Preprocess"];
    } else {
      std::cerr << "Please set Preprocess." << std::endl;
      return false;
    }

    // Get label_list for visualization
    if (config.isMember("label_list")) {
      label_list_.clear();
      // Iterate by const reference: copying a Json::Value per element is a
      // needless allocation.
      for (const auto& item : config["label_list"]) {
        label_list_.emplace_back(item.as<std::string>());
      }
    } else {
      std::cerr << "Please set label_list." << std::endl;
      return false;
    }

    // Get NMS for postprocess (optional)
    if (config.isMember("NMS")) {
      nms_info_ = config["NMS"];
    }
    // Get fpn_stride in PicoDet (optional)
    if (config.isMember("fpn_stride")) {
      fpn_stride_.clear();
      for (const auto& item : config["fpn_stride"]) {
        fpn_stride_.emplace_back(item.as<int>());
      }
    }

    return true;
  }

  float draw_threshold_;                 // visualization score threshold
  std::string arch_;                     // model architecture name
  Json::Value preprocess_info_;          // raw "Preprocess" op list
  Json::Value nms_info_;                 // raw "NMS" parameters (may be null)
  std::vector<std::string> label_list_;  // class id -> label name
  std::vector<int> fpn_stride_;          // PicoDet FPN strides (may be empty)
};
} // namespace PaddleDetection
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <ctime>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include "paddle_api.h" // NOLINT
#include "include/config_parser.h"
#include "include/keypoint_postprocess.h"
#include "include/preprocess_op.h"
using namespace paddle::lite_api; // NOLINT
namespace PaddleDetection {
// Result of keypoint detection for a single image (one batch item).
struct KeyPointResult {
  // Keypoints: shape (N x 3); N: number of joints; 3: [conf, x, y] per joint
  // (order as consumed by VisualizeKptsResult).
  std::vector<float> keypoints;
  // Number of joints stored in `keypoints`; -1 until filled by postprocess.
  int num_joints = -1;
};

// Visualize keypoint results: draws joints and skeleton limbs onto a copy of
// `img`; joints with confidence <= threshold are skipped.
cv::Mat VisualizeKptsResult(const cv::Mat& img,
                            const std::vector<KeyPointResult>& results,
                            const std::vector<int>& colormap,
                            float threshold = 0.2);
// Top-down keypoint detector backed by a Paddle-Lite predictor.
// Loads `<model_dir>/model.nb` and its `infer_cfg.json` configuration.
class KeyPointDetector {
 public:
  // use_dark: enable DARK sub-pixel decoding during postprocess.
  explicit KeyPointDetector(const std::string& model_dir,
                            int cpu_threads = 1,
                            const int batch_size = 1,
                            bool use_dark = true) {
    config_.load_config(model_dir);
    threshold_ = config_.draw_threshold_;
    use_dark_ = use_dark;
    preprocessor_.Init(config_.preprocess_info_);
    printf("before keypoint detector\n");
    LoadModel(model_dir, cpu_threads);
    printf("create keypoint detector\n");
  }

  // Load Paddle inference model from `<model_file>/model.nb`.
  void LoadModel(std::string model_file, int num_theads);

  // Run predictor on a batch of cropped person images.
  // `center`/`scale` carry each crop's affine parameters so postprocess can
  // map heatmap coordinates back to the original image.
  // `warmup`/`repeats` are benchmark controls; `times` receives
  // [preprocess_ms, avg_inference_ms, postprocess_ms].
  void Predict(const std::vector<cv::Mat> imgs,
               std::vector<std::vector<float>>& center,
               std::vector<std::vector<float>>& scale,
               const int warmup = 0,
               const int repeats = 1,
               std::vector<KeyPointResult>* result = nullptr,
               std::vector<double>* times = nullptr);

  // Get model label list (from infer_cfg.json).
  const std::vector<std::string>& GetLabelList() const {
    return config_.label_list_;
  }

  // Whether DARK decoding is applied in postprocess.
  bool use_dark(){return this->use_dark_;}

  // Score threshold, taken from the config's draw_threshold.
  inline float get_threshold() {return threshold_;};

 private:
  // Preprocess image and copy data to input buffer (`inputs_`).
  void Preprocess(const cv::Mat& image_mat);

  // Postprocess raw heatmap + argmax outputs into KeyPointResult entries.
  void Postprocess(std::vector<float>& output,
                   std::vector<int64_t>& output_shape,
                   std::vector<int64_t>& idxout,
                   std::vector<int64_t>& idx_shape,
                   std::vector<KeyPointResult>* result,
                   std::vector<std::vector<float>>& center,
                   std::vector<std::vector<float>>& scale);

  std::shared_ptr<PaddlePredictor> predictor_;
  Preprocessor preprocessor_;
  ImageBlob inputs_;                // preprocessed input for the current image
  std::vector<float> output_data_;  // heatmap output buffer
  std::vector<int64_t> idx_data_;   // argmax-index output buffer
  float threshold_;
  ConfigPaser config_;
  bool use_dark_;
};
} // namespace PaddleDetection
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <vector>
// Geometry helpers for top-down keypoint postprocessing (affine crop math
// and heatmap decoding). Implementations live in keypoint_postprocess.cc.

// Third corner completing the triangle (a, b, .) used when constructing an
// affine transform from two reference points.
std::vector<float> get_3rd_point(std::vector<float>& a, std::vector<float>& b);

// Direction vector of (src_point_x, src_point_y) rotated by rot_rad radians.
std::vector<float> get_dir(float src_point_x, float src_point_y, float rot_rad);

// Apply the 2x3 matrix `trans` to point (pt_x, pt_y), writing the result
// into `x` for joint index `p` (element layout governed by `num` —
// TODO confirm against the implementation).
void affine_tranform(
    float pt_x, float pt_y, cv::Mat& trans, std::vector<float>& x, int p, int num);

// Build the 2x3 affine matrix mapping a (center, scale, rot) region to
// output_size; `inv` presumably selects the inverse mapping — confirm in impl.
cv::Mat get_affine_transform(std::vector<float>& center,
                             std::vector<float>& scale,
                             float rot,
                             std::vector<int>& output_size,
                             int inv);

// Map heatmap-space `coords` back toward original-image space into
// `target_coords`, using the crop's center/scale; `affine` toggles the
// transform path.
void transform_preds(std::vector<float>& coords,
                     std::vector<float>& center,
                     std::vector<float>& scale,
                     std::vector<int>& output_size,
                     std::vector<int>& dim,
                     std::vector<float>& target_coords,
                     bool affine);

// Convert a detection box into the (center, scale) crop parameters used by
// the keypoint model. NOTE(review): exact `box` layout (xyxy vs xywh) must
// be confirmed against callers.
void box_to_center_scale(std::vector<int>& box,
                         int width,
                         int height,
                         std::vector<float>& center,
                         std::vector<float>& scale);

// Arg-max over one joint's heatmap: peak location into `preds`, peak value
// into `maxvals`, for batch item `batchid`.
void get_max_preds(std::vector<float>& heatmap,
                   std::vector<int64_t>& dim,
                   std::vector<float>& preds,
                   std::vector<float>& maxvals,
                   int batchid,
                   int joint_idx);

// Full decoding for one batch item: heatmap plus precomputed argmax indices
// -> keypoints in `preds`; DARK enables distribution-aware sub-pixel
// refinement of the peak location.
void get_final_preds(std::vector<float>& heatmap,
                     std::vector<int64_t>& dim,
                     std::vector<int64_t>& idxout,
                     std::vector<int64_t>& idxdim,
                     std::vector<float>& center,
                     std::vector<float> scale,
                     std::vector<float>& preds,
                     int batchid,
                     bool DARK = true);
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <ctime>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include "paddle_api.h" // NOLINT
#include "include/config_parser.h"
#include "include/preprocess_op.h"
#include "include/utils.h"
#include "include/picodet_postprocess.h"
using namespace paddle::lite_api; // NOLINT
namespace PaddleDetection {
// Generate a visualization colormap with one entry per class.
std::vector<int> GenerateColorMap(int num_class);

// Draw detection results (boxes; rotated boxes when is_rbox) with class
// labels from `lables` onto a copy of `img`.
cv::Mat VisualizeResult(const cv::Mat& img,
                        const std::vector<PaddleDetection::ObjectResult>& results,
                        const std::vector<std::string>& lables,
                        const std::vector<int>& colormap,
                        const bool is_rbox);
// Object detector (e.g. PicoDet) backed by a Paddle-Lite predictor.
// Loads `<model_dir>/model.nb` and its `infer_cfg.json` configuration.
class ObjectDetector {
 public:
  explicit ObjectDetector(const std::string& model_dir,
                          int cpu_threads = 1,
                          const int batch_size = 1) {
    config_.load_config(model_dir);
    printf("config created\n");
    threshold_ = config_.draw_threshold_;
    preprocessor_.Init(config_.preprocess_info_);
    printf("before object detector\n");
    LoadModel(model_dir, cpu_threads);
    printf("create object detector\n");
  }

  // Load Paddle inference model from `<model_file>/model.nb`.
  void LoadModel(std::string model_file, int num_theads);

  // Run predictor on a batch of images.
  // `threshold` filters low-score boxes; `warmup`/`repeats` are benchmark
  // controls; `bbox_num` receives per-image box counts; `times` receives
  // stage timings (see the .cc implementation for exact layout).
  void Predict(const std::vector<cv::Mat>& imgs,
               const double threshold = 0.5,
               const int warmup = 0,
               const int repeats = 1,
               std::vector<PaddleDetection::ObjectResult>* result = nullptr,
               std::vector<int>* bbox_num = nullptr,
               std::vector<double>* times = nullptr);

  // Get model label list (from infer_cfg.json).
  const std::vector<std::string>& GetLabelList() const {
    return config_.label_list_;
  }

 private:
  // Preprocess image and copy data to input buffer (`inputs_`).
  void Preprocess(const cv::Mat& image_mat);

  // Postprocess raw outputs into ObjectResult entries.
  void Postprocess(const std::vector<cv::Mat> mats,
                   std::vector<PaddleDetection::ObjectResult>* result,
                   std::vector<int> bbox_num,
                   bool is_rbox);

  std::shared_ptr<PaddlePredictor> predictor_;
  Preprocessor preprocessor_;
  ImageBlob inputs_;
  std::vector<float> output_data_;      // raw model output buffer
  std::vector<int> out_bbox_num_data_;  // per-image box counts
  float threshold_;                     // default threshold from config
  ConfigPaser config_;
};
} // namespace PaddleDetection
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include <memory>
#include <utility>
#include <ctime>
#include <numeric>
#include <math.h>
#include "include/utils.h"
namespace PaddleDetection {
// Decode PicoDet raw head outputs into final detections (into `results`).
// outs: raw output pointers, one per head tensor (layout follows the
//   exported model — TODO confirm against the .cc implementation).
// fpn_stride: stride of each detection level (from infer_cfg.json).
// im_shape / scale_factor: preprocessing metadata used to map boxes back to
//   the original image.
// reg_max: DFL distribution size minus one (default 7, i.e. 8 bins).
void PicoDetPostProcess(std::vector<PaddleDetection::ObjectResult>* results,
                        std::vector<const float *> outs,
                        std::vector<int> fpn_stride,
                        std::vector<float> im_shape,
                        std::vector<float> scale_factor,
                        float score_threshold = 0.3,
                        float nms_threshold = 0.5,
                        int num_class = 80,
                        int reg_max = 7);
} // namespace PaddleDetection
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include "json/json.h"
namespace PaddleDetection {
// Object for storing all preprocessed data of one image.
class ImageBlob {
 public:
  // Original image {height, width} as floats (set by InitInfo::Run).
  std::vector<float> im_shape_;
  // Buffer for image data after preprocessing (CHW floats after Permute).
  std::vector<float> im_data_;
  // Shape the data has when fed to the network (after any padding).
  std::vector<float> in_net_shape_;
  // Evaluation image width and height
  // std::vector<float> eval_im_size_f_;
  // Scale factor from network input size back to the original image size;
  // initialized to {1, 1} by InitInfo::Run.
  std::vector<float> scale_factor_;
};
// Abstraction of a preprocessing operation: configured once from its JSON
// config node, then applied in place to an image and its metadata blob.
class PreprocessOp {
 public:
  virtual void Init(const Json::Value& item) = 0;
  virtual void Run(cv::Mat* im, ImageBlob* data) = 0;
};

// Seeds the ImageBlob with the raw image's shape and a unit scale factor
// (see InitInfo::Run in preprocess_op.cc); takes no configuration.
class InitInfo : public PreprocessOp {
 public:
  virtual void Init(const Json::Value& item) {}
  virtual void Run(cv::Mat* im, ImageBlob* data);
};
// Pixel normalization op: optional 1/255 scaling, then per-channel
// (x - mean_) / scale_ (see NormalizeImage::Run in preprocess_op.cc).
class NormalizeImage : public PreprocessOp {
 public:
  virtual void Init(const Json::Value& item) {
    mean_.clear();
    scale_.clear();
    // Iterate by const reference to avoid copying a Json::Value per element.
    for (const auto& tmp : item["mean"]) {
      mean_.emplace_back(tmp.as<float>());
    }
    for (const auto& tmp : item["std"]) {
      scale_.emplace_back(tmp.as<float>());
    }
    is_scale_ = item["is_scale"].as<bool>();
  }

  virtual void Run(cv::Mat* im, ImageBlob* data);

 private:
  // Per-channel normalization parameters (order matches image channels).
  std::vector<float> mean_;
  std::vector<float> scale_;
  bool is_scale_;  // divide by 255 before normalizing
};
// Layout op: converts the HWC cv::Mat into planar CHW floats inside
// ImageBlob::im_data_ (see Permute::Run in preprocess_op.cc).
class Permute : public PreprocessOp {
 public:
  virtual void Init(const Json::Value& item) {}
  virtual void Run(cv::Mat* im, ImageBlob* data);
};
// Resize op: scales the image toward target_size_, optionally preserving
// aspect ratio (exact policy in Resize::Run / GenerateScale in the .cc).
class Resize : public PreprocessOp {
 public:
  virtual void Init(const Json::Value& item) {
    interp_ = item["interp"].as<int>();
    // max_size_ = item["target_size"].as<int>();
    keep_ratio_ = item["keep_ratio"].as<bool>();
    target_size_.clear();
    // Iterate by const reference to avoid copying a Json::Value per element.
    for (const auto& tmp : item["target_size"]) {
      target_size_.emplace_back(tmp.as<int>());
    }
  }

  // Compute best resize scale for x-dimension, y-dimension
  std::pair<float, float> GenerateScale(const cv::Mat& im);

  virtual void Run(cv::Mat* im, ImageBlob* data);

 private:
  int interp_;                    // OpenCV interpolation flag
  bool keep_ratio_;               // preserve aspect ratio when resizing
  std::vector<int> target_size_;  // target size from config
  std::vector<int> in_net_shape_;
};
// Models with FPN need input shape % stride == 0, so this op pads the image
// accordingly (see PadStride::Run in preprocess_op.cc).
class PadStride : public PreprocessOp {
 public:
  virtual void Init(const Json::Value& item) {
    stride_ = item["stride"].as<int>();
  }
  virtual void Run(cv::Mat* im, ImageBlob* data);

 private:
  int stride_;  // pad dimensions up to a multiple of this stride
};
// Affine op for top-down keypoint models: presumably warps the crop to the
// fixed training input size `trainsize_` — see Run in preprocess_op.cc.
class TopDownEvalAffine : public PreprocessOp {
 public:
  virtual void Init(const Json::Value& item) {
    trainsize_.clear();
    // Iterate by const reference to avoid copying a Json::Value per element.
    for (const auto& tmp : item["trainsize"]) {
      trainsize_.emplace_back(tmp.as<int>());
    }
  }

  virtual void Run(cv::Mat* im, ImageBlob* data);

 private:
  int interp_ = 1;              // interpolation flag for the warp
  std::vector<int> trainsize_;  // model input size from config
};
// Crop `area` (expanded by `expandratio`) out of `img` into `crop_img`,
// also returning the crop's (center, scale) for later coordinate mapping.
void CropImg(cv::Mat& img,
             cv::Mat& crop_img,
             std::vector<int>& area,
             std::vector<float>& center,
             std::vector<float>& scale,
             float expandratio = 0.15);
// Owns the configured chain of preprocess ops and applies them to an image.
class Preprocessor {
 public:
  // Builds the op map from the config's "Preprocess" array; each entry's
  // "type" field selects the op. Unknown types are skipped (CreateOp logs).
  void Init(const Json::Value& config_node) {
    // initialize image info at first
    ops_["InitInfo"] = std::make_shared<InitInfo>();
    for (const auto& item : config_node) {
      auto op_name = item["type"].as<std::string>();
      auto op = CreateOp(op_name);
      if (op == nullptr) {
        // The original stored and dereferenced a null op for unrecognized
        // types, which is undefined behavior; skip the entry instead.
        continue;
      }
      op->Init(item);
      ops_[op_name] = std::move(op);
    }
  }

  // Factory mapping a config "type" string to an op instance;
  // returns nullptr (after logging) for unknown names.
  std::shared_ptr<PreprocessOp> CreateOp(const std::string& name) {
    if (name == "Resize") {
      return std::make_shared<Resize>();
    } else if (name == "Permute") {
      return std::make_shared<Permute>();
    } else if (name == "NormalizeImage") {
      return std::make_shared<NormalizeImage>();
    } else if (name == "PadStride") {
      // use PadStride instead of PadBatch
      return std::make_shared<PadStride>();
    } else if (name == "TopDownEvalAffine") {
      return std::make_shared<TopDownEvalAffine>();
    }
    std::cerr << "can not find function of OP: " << name
              << " and return: nullptr" << std::endl;
    return nullptr;
  }

  // Runs the configured ops on `im`, filling `data` (impl in .cc).
  void Run(cv::Mat* im, ImageBlob* data);

 public:
  static const std::vector<std::string> RUN_ORDER;

 private:
  std::unordered_map<std::string, std::shared_ptr<PreprocessOp>> ops_;
};
} // namespace PaddleDetection
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include <memory>
#include <utility>
#include <ctime>
#include <numeric>
#include <algorithm>
namespace PaddleDetection {
// Object Detection Result
struct ObjectResult {
  // Rectangle coordinates of detected object: left, right, top, down
  std::vector<int> rect;
  // Class id of detected object
  int class_id;
  // Confidence of detected object
  float confidence;
};

// In-place non-maximum suppression over `input_boxes` with the given
// overlap threshold (exact IoU policy in the .cc implementation).
void nms(std::vector<ObjectResult> &input_boxes, float nms_threshold);
} // namespace PaddleDetection
\ No newline at end of file
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/config_parser.h"
namespace PaddleDetection {
// Parse `jsonfile` into `jsondata`. On failure (file missing/unreadable or
// malformed JSON) the error is reported on the console and `jsondata` is
// left unmodified.
void load_jsonf(std::string jsonfile, Json::Value &jsondata) {
  std::ifstream ifs(jsonfile);
  // The original passed an unopened stream straight to the parser; report
  // a missing/unreadable file explicitly instead.
  if (!ifs.is_open()) {
    std::cerr << "Failed to open json file: " << jsonfile << std::endl;
    return;
  }
  Json::CharReaderBuilder builder;
  builder["collectComments"] = true;
  JSONCPP_STRING errs;
  if (!parseFromStream(builder, ifs, &jsondata, &errs)) {
    std::cerr << errs << std::endl;
    return;
  }
}
} // namespace PaddleDetection
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <sstream>
// for setprecision
#include <chrono>
#include <iomanip>
#include "include/keypoint_detector.h"
namespace PaddleDetection {
// Load Model and create model predictor
void KeyPointDetector::LoadModel(std::string model_file, int num_theads) {
MobileConfig config;
config.set_threads(num_theads);
config.set_model_from_file(model_file + "/model.nb");
config.set_power_mode(LITE_POWER_HIGH);
predictor_ = std::move(CreatePaddlePredictor<MobileConfig>(config));
}
// Visualiztion MaskDetector results
// Draw keypoint results on a copy of `img`: a red dot per confident joint,
// then skeleton limbs between pairs of confident joints.
// `results[b].keypoints` layout is [conf, x, y] per joint; joints with
// conf <= threshold are skipped.
cv::Mat VisualizeKptsResult(const cv::Mat& img,
                            const std::vector<KeyPointResult>& results,
                            const std::vector<int>& colormap,
                            float threshold) {
  // COCO-17 skeleton: each entry connects two joint indices.
  const int edge[][2] = {{0, 1},
                         {0, 2},
                         {1, 3},
                         {2, 4},
                         {3, 5},
                         {4, 6},
                         {5, 7},
                         {6, 8},
                         {7, 9},
                         {8, 10},
                         {5, 11},
                         {6, 12},
                         {11, 13},
                         {12, 14},
                         {13, 15},
                         {14, 16},
                         {11, 12}};
  const int num_edges = static_cast<int>(sizeof(edge) / sizeof(edge[0]));
  cv::Mat vis_img = img.clone();
  for (size_t batchid = 0; batchid < results.size(); batchid++) {
    const KeyPointResult& res = results[batchid];
    // Joints above the confidence threshold.
    for (int i = 0; i < res.num_joints; i++) {
      if (res.keypoints[i * 3] > threshold) {
        int x_coord = int(res.keypoints[i * 3 + 1]);
        int y_coord = int(res.keypoints[i * 3 + 2]);
        cv::circle(vis_img,
                   cv::Point2d(x_coord, y_coord),
                   1,
                   cv::Scalar(0, 0, 255),
                   2);
      }
    }
    // Limbs. NOTE: the original iterated `num_joints` times over the
    // 17-entry `edge` table, reading out of bounds whenever num_joints != 17.
    // Iterate over the edge table and skip edges whose joints don't exist.
    for (int i = 0; i < num_edges; i++) {
      if (edge[i][0] >= res.num_joints || edge[i][1] >= res.num_joints) {
        continue;
      }
      if (res.keypoints[edge[i][0] * 3] > threshold &&
          res.keypoints[edge[i][1] * 3] > threshold) {
        int x_start = int(res.keypoints[edge[i][0] * 3 + 1]);
        int y_start = int(res.keypoints[edge[i][0] * 3 + 2]);
        int x_end = int(res.keypoints[edge[i][1] * 3 + 1]);
        int y_end = int(res.keypoints[edge[i][1] * 3 + 2]);
        cv::line(vis_img,
                 cv::Point2d(x_start, y_start),
                 cv::Point2d(x_end, y_end),
                 colormap[i],
                 1);
      }
    }
  }
  return vis_img;
}
// Convert the input image to RGB and run the configured preprocess pipeline,
// filling `inputs_`. Operates on a clone so the caller's mat is untouched
// and remains available for postprocessing.
void KeyPointDetector::Preprocess(const cv::Mat& ori_im) {
  cv::Mat rgb_im = ori_im.clone();
  cv::cvtColor(rgb_im, rgb_im, cv::COLOR_BGR2RGB);
  preprocessor_.Run(&rgb_im, &inputs_);
}
// Decode the model's heatmap and argmax-index outputs into one
// KeyPointResult per batch item (appended to `result`).
// `output`/`output_shape`: flattened heatmap tensor, batch x joints x ...
// `idxout`/`idx_shape`: precomputed argmax indices for each joint.
// `center_bs`/`scale_bs`: per-image affine parameters used to map heatmap
// coordinates back to the original image.
void KeyPointDetector::Postprocess(std::vector<float>& output,
                                   std::vector<int64_t>& output_shape,
                                   std::vector<int64_t>& idxout,
                                   std::vector<int64_t>& idx_shape,
                                   std::vector<KeyPointResult>* result,
                                   std::vector<std::vector<float>>& center_bs,
                                   std::vector<std::vector<float>>& scale_bs) {
  // One (conf, x, y) triple per joint; buffer reused across batch items.
  std::vector<float> preds(output_shape[1] * 3, 0);

  for (int batchid = 0; batchid < output_shape[0]; batchid++) {
    // Decode this batch item into `preds` (DARK refinement per use_dark()).
    get_final_preds(output,
                    output_shape,
                    idxout,
                    idx_shape,
                    center_bs[batchid],
                    scale_bs[batchid],
                    preds,
                    batchid,
                    this->use_dark());
    KeyPointResult result_item;
    result_item.num_joints = output_shape[1];
    result_item.keypoints.clear();
    // Copy joint triples straight through, preserving the preds layout
    // ([conf, x, y] per joint, as consumed by VisualizeKptsResult).
    for (int i = 0; i < output_shape[1]; i++) {
      result_item.keypoints.emplace_back(preds[i * 3]);
      result_item.keypoints.emplace_back(preds[i * 3 + 1]);
      result_item.keypoints.emplace_back(preds[i * 3 + 2]);
    }
    result->push_back(result_item);
  }
}
// Run the keypoint model on a batch of (already cropped) person images.
// center_bs/scale_bs: per-image affine parameters, forwarded to Postprocess.
// warmup/repeats: benchmark controls — only the last repeat's output is
// decoded. times (out): [preprocess_ms, avg_inference_ms, postprocess_ms].
void KeyPointDetector::Predict(const std::vector<cv::Mat> imgs,
                               std::vector<std::vector<float>>& center_bs,
                               std::vector<std::vector<float>>& scale_bs,
                               const int warmup,
                               const int repeats,
                               std::vector<KeyPointResult>* result,
                               std::vector<double>* times) {
  auto preprocess_start = std::chrono::steady_clock::now();
  int batch_size = imgs.size();
  // in_data_batch
  std::vector<float> in_data_all;
  // Preprocess each image and concatenate the CHW buffers into one batch.
  for (int bs_idx = 0; bs_idx < batch_size; bs_idx++) {
    cv::Mat im = imgs.at(bs_idx);
    Preprocess(im);
    // TODO: reduce cost time
    in_data_all.insert(
        in_data_all.end(), inputs_.im_data_.begin(), inputs_.im_data_.end());
  }
  // Prepare input tensor: only the tensor named "image" is filled.
  // NOTE(review): in_net_shape_ is assumed identical for every image in the
  // batch — it holds the shape of the last one preprocessed.
  auto input_names = predictor_->GetInputNames();
  for (const auto& tensor_name : input_names) {
    auto in_tensor = predictor_->GetInputByName(tensor_name);
    if (tensor_name == "image") {
      int rh = inputs_.in_net_shape_[0];
      int rw = inputs_.in_net_shape_[1];
      in_tensor->Resize({batch_size, 3, rh, rw});
      auto* inptr = in_tensor->mutable_data<float>();
      std::copy_n(in_data_all.data(), in_data_all.size(), inptr);
    }
  }
  auto preprocess_end = std::chrono::steady_clock::now();
  std::vector<int64_t> output_shape, idx_shape;
  // Run predictor
  // Warmup passes: outputs are fetched but discarded, and not timed.
  for (int i = 0; i < warmup; i++) {
    predictor_->Run();
    // Get output tensor
    auto output_names = predictor_->GetOutputNames();
    auto out_tensor = predictor_->GetTensor(output_names[0]);
    auto idx_tensor = predictor_->GetTensor(output_names[1]);
  }
  auto inference_start = std::chrono::steady_clock::now();
  // Timed passes: each iteration overwrites output_data_/idx_data_, so only
  // the final repeat's outputs reach Postprocess.
  for (int i = 0; i < repeats; i++) {
    predictor_->Run();
    // Output 0: heatmap tensor.
    auto output_names = predictor_->GetOutputNames();
    auto out_tensor = predictor_->GetTensor(output_names[0]);
    output_shape = out_tensor->shape();
    // Calculate output length
    int output_size = 1;
    for (int j = 0; j < output_shape.size(); ++j) {
      output_size *= output_shape[j];
    }
    if (output_size < 6) {
      std::cerr << "[WARNING] No object detected." << std::endl;
    }
    output_data_.resize(output_size);
    std::copy_n(
        out_tensor->mutable_data<float>(), output_size, output_data_.data());
    // Output 1: argmax indices for each joint's heatmap.
    auto idx_tensor = predictor_->GetTensor(output_names[1]);
    idx_shape = idx_tensor->shape();
    // Calculate output length
    output_size = 1;
    for (int j = 0; j < idx_shape.size(); ++j) {
      output_size *= idx_shape[j];
    }
    idx_data_.resize(output_size);
    std::copy_n(
        idx_tensor->mutable_data<int64_t>(), output_size, idx_data_.data());
  }
  auto inference_end = std::chrono::steady_clock::now();
  auto postprocess_start = std::chrono::steady_clock::now();
  // Decode heatmaps into keypoint results.
  Postprocess(output_data_,
              output_shape,
              idx_data_,
              idx_shape,
              result,
              center_bs,
              scale_bs);
  auto postprocess_end = std::chrono::steady_clock::now();
  // Record stage timings in milliseconds; inference is averaged over repeats.
  std::chrono::duration<float> preprocess_diff =
      preprocess_end - preprocess_start;
  times->push_back(double(preprocess_diff.count() * 1000));
  std::chrono::duration<float> inference_diff = inference_end - inference_start;
  times->push_back(double(inference_diff.count() / repeats * 1000));
  std::chrono::duration<float> postprocess_diff =
      postprocess_end - postprocess_start;
  times->push_back(double(postprocess_diff.count() * 1000));
}
} // namespace PaddleDetection
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment