Commit 546b4279 authored by limm's avatar limm
Browse files

add csrc and mmdeploy module

parent 502f4fb9
Pipeline #2810 canceled with stages
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_CODEBASE_MMDET_BASE_DENSE_HEAD_H_
#define MMDEPLOY_CODEBASE_MMDET_BASE_DENSE_HEAD_H_
#include "mmdeploy/codebase/mmdet/mmdet.h"
#include "mmdeploy/core/tensor.h"
namespace mmdeploy::mmdet {
// Post-processing head for single-stage dense detectors: converts raw
// (dets, scores) tensors into Detections in original-image coordinates.
// Implementation lives in the corresponding .cpp (not shown here).
class BaseDenseHead : public MMDetection {
 public:
  explicit BaseDenseHead(const Value& cfg);

  // prep_res: preprocess metadata; infer_res: network outputs.
  Result<Value> operator()(const Value& prep_res, const Value& infer_res);

  Result<Detections> GetBBoxes(const Value& prep_res, const Tensor& dets,
                               const Tensor& scores) const;

 private:
  float score_thr_{0.4f};       // minimum score kept
  int nms_pre_{1000};           // candidate cap before NMS — semantics in the .cpp (not shown)
  float iou_threshold_{0.45f};  // NMS IoU threshold
  int min_bbox_size_{0};        // boxes smaller than this are dropped
};
} // namespace mmdeploy::mmdet
#endif // MMDEPLOY_CODEBASE_MMDET_BASE_DENSE_HEAD_H_
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/core/registry.h"
#include "mmdeploy/core/utils/device_utils.h"
#include "mmdeploy/experimental/module_adapter.h"
#include "mmdeploy/operation/managed.h"
#include "mmdeploy/operation/vision.h"
#include "object_detection.h"
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv_utils.h"
namespace mmdeploy::mmdet {
// Post-processing component for instance-segmentation detectors.
// Extends ResizeBBox: in addition to mapping boxes back to the original image
// frame, it warps/resizes every predicted instance mask and binarizes it.
// Handles both Mask R-CNN style per-ROI masks and RTMDet-Inst style
// full-image masks (selected via `is_rcnn_`).
class ResizeInstanceMask : public ResizeBBox {
 public:
  explicit ResizeInstanceMask(const Value& cfg) : ResizeBBox(cfg) {
    if (cfg.contains("params")) {
      mask_thr_binary_ = cfg["params"].value("mask_thr_binary", mask_thr_binary_);
      // Presence of an "rcnn" sub-config is the marker for a two-stage
      // (Mask R-CNN style) model; otherwise the RTMDet-Inst path is taken.
      is_rcnn_ = cfg["params"].contains("rcnn");
      is_resize_mask_ = cfg["params"].value("is_resize_mask", is_resize_mask_);
    }
    // Create the device-bound warp/permute operations inside an operation
    // context so they are associated with this component's device and stream.
    operation::Context ctx(device_, stream_);
    warp_affine_ = operation::Managed<operation::WarpAffine>::Create("bilinear");
    permute_ = operation::Managed<::mmdeploy::operation::Permute>::Create();
  }

  // TODO: remove duplication (shares most of its validation with ResizeBBox)
  //
  // Validates the inference outputs ("dets", "labels", "masks"), restores the
  // bounding boxes to original-image coordinates, then attaches a binary mask
  // to each surviving detection.
  Result<Value> operator()(const Value& prep_res, const Value& infer_res) {
    MMDEPLOY_DEBUG("prep_res: {}\ninfer_res: {}", prep_res, infer_res);
    try {
      DeviceGuard guard(device_);
      auto dets = infer_res["dets"].get<Tensor>();
      auto labels = infer_res["labels"].get<Tensor>();
      auto masks = infer_res["masks"].get<Tensor>();
      MMDEPLOY_DEBUG("dets.shape: {}", dets.shape());
      MMDEPLOY_DEBUG("labels.shape: {}", labels.shape());
      MMDEPLOY_DEBUG("masks.shape: {}", masks.shape());
      // `dets` is supposed to have 3 dims. They are 'batch', 'bboxes_number'
      // and 'channels' respectively
      if (!(dets.shape().size() == 3 && dets.data_type() == DataType::kFLOAT)) {
        MMDEPLOY_ERROR("unsupported `dets` tensor, shape: {}, dtype: {}", dets.shape(),
                       (int)dets.data_type());
        return Status(eNotSupported);
      }
      // `labels` is supposed to have 2 dims, which are 'batch' and
      // 'bboxes_number'
      if (labels.shape().size() != 2) {
        MMDEPLOY_ERROR("unsupported `labels`, tensor, shape: {}, dtype: {}", labels.shape(),
                       (int)labels.data_type());
        return Status(eNotSupported);
      }
      // `masks` must be a 4-D float tensor.
      if (!(masks.shape().size() == 4 && masks.data_type() == DataType::kFLOAT)) {
        MMDEPLOY_ERROR("unsupported `mask` tensor, shape: {}, dtype: {}", masks.shape(),
                       (int)masks.data_type());
        return Status(eNotSupported);
      }
      OUTCOME_TRY(auto _dets, MakeAvailableOnDevice(dets, kHost, stream()));
      OUTCOME_TRY(auto _labels, MakeAvailableOnDevice(labels, kHost, stream()));
      // Note: `masks` are kept on device to avoid data copy overhead from device to host.
      // refer to https://github.com/open-mmlab/mmdeploy/issues/1849
      // OUTCOME_TRY(auto _masks, MakeAvailableOnDevice(masks, kHost, stream()));
      // OUTCOME_TRY(stream().Wait());
      OUTCOME_TRY(auto result, DispatchGetBBoxes(prep_res["img_metas"], _dets, _labels));
      auto ori_w = prep_res["img_metas"]["ori_shape"][2].get<int>();
      auto ori_h = prep_res["img_metas"]["ori_shape"][1].get<int>();
      from_value(prep_res["img_metas"]["scale_factor"], scale_factor_);
      // NOTE(review): the Result<void> returned by ProcessMasks is discarded,
      // so mask-processing failures are silently ignored here.
      ProcessMasks(result, masks, _dets, ori_w, ori_h);
      return to_value(result);
    } catch (const std::exception& e) {
      MMDEPLOY_ERROR("{}", e.what());
      return Status(eFail);
    }
  }

 protected:
  // Converts raw mask predictions into per-detection binary masks attached to
  // `result`. `d_mask` stays on the inference device; `cpu_dets` are the
  // host-side boxes. `img_w`/`img_h` are the original image width/height.
  Result<void> ProcessMasks(Detections& result, Tensor d_mask, Tensor cpu_dets, int img_w,
                            int img_h) {
    d_mask.Squeeze(0);
    cpu_dets.Squeeze(0);
    ::mmdeploy::operation::Context ctx(device_, stream_);
    std::vector<Tensor> warped_masks;
    warped_masks.reserve(result.size());
    std::vector<Tensor> h_warped_masks;
    h_warped_masks.reserve(result.size());
    if (is_rcnn_) {  // mask r-cnn
      for (auto& det : result) {
        // Each detection indexes its own (H, W) mask slice.
        auto mask = d_mask.Slice(det.index);
        auto mask_height = (int)mask.shape(1);
        auto mask_width = (int)mask.shape(2);
        mask.Reshape({1, mask_height, mask_width, 1});
        // resize masks to origin image shape instead of input image shape
        // default is true
        if (is_resize_mask_) {
          auto& bbox = det.bbox;
          // same as mmdet with skip_empty = True
          auto x0 = std::max(std::floor(bbox[0]) - 1, 0.f);
          auto y0 = std::max(std::floor(bbox[1]) - 1, 0.f);
          auto x1 = std::min(std::ceil(bbox[2]) + 1, (float)img_w);
          auto y1 = std::min(std::ceil(bbox[3]) + 1, (float)img_h);
          auto width = static_cast<int>(x1 - x0);
          auto height = static_cast<int>(y1 - y0);
          // params align_corners = False
          float fx;
          float fy;
          float tx;
          float ty;
          // Scale from bbox space to mask space; the +/-0.5 terms sample
          // pixel centres (emulates align_corners=False).
          fx = (float)mask_width / (bbox[2] - bbox[0]);
          fy = (float)mask_height / (bbox[3] - bbox[1]);
          tx = (x0 + .5f - bbox[0]) * fx - .5f;
          ty = (y0 + .5f - bbox[1]) * fy - .5f;
          float affine_matrix[] = {fx, 0, tx, 0, fy, ty};
          // `m` wraps `affine_matrix`, so the in-place inversion below
          // rewrites the array that is handed to warp_affine_.
          // NOTE(review): relies on cv::invertAffineTransform tolerating
          // src == dst — confirm against the OpenCV version in use.
          cv::Mat_<float> m(2, 3, affine_matrix);
          cv::invertAffineTransform(m, m);
          Tensor& warped_mask = warped_masks.emplace_back();
          OUTCOME_TRY(warp_affine_.Apply(mask, warped_mask, affine_matrix, height, width));
          OUTCOME_TRY(CopyToHost(warped_mask, h_warped_masks.emplace_back()));
        } else {
          OUTCOME_TRY(CopyToHost(mask, h_warped_masks.emplace_back()));
        }
      }
    } else {  // rtmdet-inst
      auto mask_channel = (int)d_mask.shape(0);
      auto mask_height = (int)d_mask.shape(1);
      auto mask_width = (int)d_mask.shape(2);
      // (C, H, W) -> (H, W, C)
      std::vector<int> axes = {1, 2, 0};
      OUTCOME_TRY(permute_.Apply(d_mask, d_mask, axes));
      Device host{"cpu"};
      OUTCOME_TRY(auto cpu_mask, MakeAvailableOnDevice(d_mask, host, stream_));
      OUTCOME_TRY(stream().Wait());
      // Wrap the host tensor as one multi-channel cv::Mat (one channel per
      // instance) so resize/crop run once for all instances.
      cv::Mat mask_mat(mask_height, mask_width, CV_32FC(mask_channel), cpu_mask.data());
      int resize_height = int(mask_height / scale_factor_[1] + 0.5);
      int resize_width = int(mask_width / scale_factor_[0] + 0.5);
      // skip resize if scale_factor is 1.0
      if (resize_height != mask_height || resize_width != mask_width) {
        cv::resize(mask_mat, mask_mat, cv::Size(resize_width, resize_height), cv::INTER_LINEAR);
      }
      // crop masks
      // assumes the (possibly resized) mask covers at least img_h x img_w —
      // TODO confirm for aspect-ratio-changing pipelines
      mask_mat = mask_mat(cv::Range(0, img_h), cv::Range(0, img_w)).clone();
      for (int i = 0; i < (int)result.size(); i++) {
        cv::Mat mask_;
        cv::extractChannel(mask_mat, mask_, i);
        Tensor mask_t = cpu::CVMat2Tensor(mask_);
        h_warped_masks.emplace_back(mask_t);
      }
    }
    OUTCOME_TRY(stream_.Wait());
    // Binarize on host and attach each mask to its detection.
    for (size_t i = 0; i < h_warped_masks.size(); ++i) {
      result[i].mask = ThresholdMask(h_warped_masks[i]);
    }
    return success();
  }

  // Copies `src` into a newly allocated host tensor `dst`; shallow assignment
  // when `src` is already on the host. The copy is asynchronous on `stream_`,
  // so callers must wait on the stream before reading `dst`.
  Result<void> CopyToHost(const Tensor& src, Tensor& dst) {
    if (src.device() == kHost) {
      dst = src;
      return success();
    }
    dst = TensorDesc{kHost, src.data_type(), src.shape()};
    OUTCOME_TRY(stream_.Copy(src.buffer(), dst.buffer(), dst.byte_size()));
    return success();
  }

  // Thresholds a float mask into an 8-bit {0, 255} grayscale Mat.
  // The returned Mat aliases `warped_mat`'s buffer; capturing the cv::Mat in
  // the deleter keeps the buffer alive for the lifetime of the shared_ptr.
  Mat ThresholdMask(const Tensor& h_mask) const {
    cv::Mat warped_mat = cpu::Tensor2CVMat(h_mask);
    warped_mat = warped_mat > mask_thr_binary_;
    return {warped_mat.rows, warped_mat.cols, PixelFormat::kGRAYSCALE, DataType::kINT8,
            std::shared_ptr<void>(warped_mat.data, [mat = warped_mat](void*) {})};
  }

 private:
  operation::Managed<operation::WarpAffine> warp_affine_;
  ::mmdeploy::operation::Managed<::mmdeploy::operation::Permute> permute_;
  float mask_thr_binary_{.5f};  // binarization threshold for mask scores
  bool is_rcnn_{true};          // two-stage per-ROI masks vs rtmdet-inst full-image masks
  bool is_resize_mask_{true};   // warp ROI masks to the original image frame
  std::vector<float> scale_factor_{1.0, 1.0, 1.0, 1.0};  // refreshed per request from img_metas
};

MMDEPLOY_REGISTER_CODEBASE_COMPONENT(MMDetection, ResizeInstanceMask);
} // namespace mmdeploy::mmdet
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/codebase/mmdet/mmdet.h"
namespace mmdeploy::mmdet {

// Registers the MMDetection codebase so its post-processing components can be
// looked up by name through the mmdeploy registry.
MMDEPLOY_REGISTER_CODEBASE(MMDetection);

}  // namespace mmdeploy::mmdet
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_SRC_CODEBASE_MMDET_MMDET_H_
#define MMDEPLOY_SRC_CODEBASE_MMDET_MMDET_H_
#include <array>
#include "mmdeploy/codebase/common.h"
#include "mmdeploy/core/device.h"
#include "mmdeploy/core/mat.h"
#include "mmdeploy/core/module.h"
#include "mmdeploy/core/registry.h"
#include "mmdeploy/core/serialization.h"
namespace mmdeploy::mmdet {
// A single detected instance produced by the post-processing heads.
struct Detection {
  int index;     // position of this detection in the head's candidate list
                 // (raw box index for ResizeBBox, sorted rank for the YOLO/RTMDet heads)
  int label_id;  // class index
  float score;   // confidence score
  std::array<float, 4> bbox;  // left, top, right, bottom
  Mat mask;      // optional instance mask; empty for pure bbox detection
  MMDEPLOY_ARCHIVE_MEMBERS(index, label_id, score, bbox, mask);
};
using Detections = std::vector<Detection>;
MMDEPLOY_DECLARE_CODEBASE(MMDetection, mmdet);
} // namespace mmdeploy::mmdet
#endif // MMDEPLOY_SRC_CODEBASE_MMDET_MMDET_H_
// Copyright (c) OpenMMLab. All rights reserved.
#include "object_detection.h"

#include <exception>

#include "mmdeploy/core/registry.h"
#include "mmdeploy/core/utils/device_utils.h"
#include "mmdeploy/core/utils/formatter.h"
#include "mmdeploy/experimental/module_adapter.h"
#include "utils.h"
using namespace std;
namespace mmdeploy::mmdet {
// Reads the score threshold and minimum bbox size from the deploy config.
// mobilev2yolov3 exports its confidence threshold under "conf_thr"; every
// other model uses "score_thr".
ResizeBBox::ResizeBBox(const Value& cfg) : MMDetection(cfg) {
  if (!cfg.contains("params")) {
    return;
  }
  if (cfg["params"].contains("conf_thr")) {
    // for mobilev2yolov3
    score_thr_ = cfg["params"].value("conf_thr", 0.f);
  } else {
    score_thr_ = cfg["params"].value("score_thr", 0.f);
  }
  min_bbox_size_ = cfg["params"].value("min_bbox_size", 0.f);
}
// Extracts {dets, labels} tensors from the inference output.
// Two output layouts are supported:
//  * native "dets" + "labels" tensors, returned as-is;
//  * SSD-style "detection_output" of shape (1, num_det, 6) with rows
//    [label, score, x0, y0, x1, y1] in relative coordinates, which is split
//    here: labels are shifted by -1 (drops the background class) and
//    coordinates are scaled to input-image pixels.
// Returns an empty vector when neither layout is present.
std::vector<Tensor> ResizeBBox::GetDetsLabels(const Value& prep_res, const Value& infer_res) {
  std::vector<Tensor> results;
  if (infer_res.contains("dets") && infer_res.contains("labels")) {
    results.push_back(infer_res["dets"].get<Tensor>());
    results.push_back(infer_res["labels"].get<Tensor>());
    return results;
  } else if (infer_res.contains("detection_output") && (!infer_res.contains("dets")) &&
             (!infer_res.contains("labels"))) {
    int img_width = prep_res["img_metas"]["img_shape"][2].get<int>();
    int img_height = prep_res["img_metas"]["img_shape"][1].get<int>();
    auto detection_output = infer_res["detection_output"].get<Tensor>();
    auto* detection_output_ptr = detection_output.data<float>();
    // detection_output: (1, num_det, 6)
    TensorDesc labeldesc = detection_output.desc();
    int batch_size = detection_output.shape()[0];
    int num_det = detection_output.shape()[1];
    labeldesc.shape = {batch_size, num_det};
    Tensor labels(labeldesc);
    TensorDesc detdesc = detection_output.desc();
    detdesc.shape = {batch_size, num_det, 5};
    Tensor dets(detdesc);
    auto* dets_ptr = dets.data<float>();
    auto* labels_ptr = labels.data<float>();
    for (int i = 0; i < batch_size * num_det; ++i) {
      // input row [label, score, x0, y0, x1, y1] -> dets row [x0, y0, x1, y1, score]
      *labels_ptr++ = detection_output_ptr[0] - 1;
      dets_ptr[4] = detection_output_ptr[1];
      dets_ptr[0] = detection_output_ptr[2] * img_width;
      dets_ptr[1] = detection_output_ptr[3] * img_height;
      dets_ptr[2] = detection_output_ptr[4] * img_width;
      dets_ptr[3] = detection_output_ptr[5] * img_height;
      dets_ptr += 5;
      detection_output_ptr += 6;
    }
    results.push_back(dets);
    results.push_back(labels);
    return results;
  } else {
    MMDEPLOY_ERROR("No support for another key of detection results!");
    return results;
  }
}
// Validates the raw (dets, labels) tensors, moves them to host and converts
// them to Detections in original-image coordinates.
// prep_res: preprocess result (expects "img_metas"); infer_res: network output.
Result<Value> ResizeBBox::operator()(const Value& prep_res, const Value& infer_res) {
  MMDEPLOY_DEBUG("prep_res: {}\ninfer_res: {}", prep_res, infer_res);
  try {
    // GetDetsLabels returns an empty vector when it cannot find a supported
    // output layout; guard before indexing (previously `outputs[0]` on an
    // empty vector was undefined behavior).
    vector<Tensor> outputs = GetDetsLabels(prep_res, infer_res);
    if (outputs.size() < 2) {
      return Status(eNotSupported);
    }
    Tensor dets = outputs[0];
    Tensor labels = outputs[1];
    MMDEPLOY_DEBUG("dets.shape: {}", dets.shape());
    MMDEPLOY_DEBUG("labels.shape: {}", labels.shape());
    // `dets` is supposed to have 3 dims. They are 'batch', 'bboxes_number'
    // and 'channels' respectively
    if (!(dets.shape().size() == 3 && dets.data_type() == DataType::kFLOAT)) {
      MMDEPLOY_ERROR("unsupported `dets` tensor, shape: {}, dtype: {}", dets.shape(),
                     (int)dets.data_type());
      return Status(eNotSupported);
    }
    // `labels` is supposed to have 2 dims, which are 'batch' and
    // 'bboxes_number'
    if (labels.shape().size() != 2) {
      MMDEPLOY_ERROR("unsupported `labels` tensor, shape: {}, dtype: {}", labels.shape(),
                     (int)labels.data_type());
      return Status(eNotSupported);
    }
    OUTCOME_TRY(auto _dets, MakeAvailableOnDevice(dets, kHost, stream()));
    OUTCOME_TRY(auto _labels, MakeAvailableOnDevice(labels, kHost, stream()));
    OUTCOME_TRY(stream().Wait());
    OUTCOME_TRY(auto result, DispatchGetBBoxes(prep_res["img_metas"], _dets, _labels));
    return to_value(result);
  } catch (const std::exception& e) {
    // Log the failure instead of silently swallowing it (consistent with
    // ResizeInstanceMask::operator()).
    MMDEPLOY_ERROR("{}", e.what());
    return Status(eFail);
  } catch (...) {
    return Status(eFail);
  }
}
// Labels may be exported as float or as 32/64-bit integers depending on the
// inference backend; dispatch to the matching GetBBoxes instantiation.
Result<Detections> ResizeBBox::DispatchGetBBoxes(const Value& prep_res, const Tensor& dets,
                                                 const Tensor& labels) {
  const auto data_type = labels.data_type();
  if (data_type == DataType::kFLOAT) {
    return GetBBoxes<float>(prep_res, dets, labels);
  }
  if (data_type == DataType::kINT32) {
    return GetBBoxes<int32_t>(prep_res, dets, labels);
  }
  if (data_type == DataType::kINT64) {
    return GetBBoxes<int64_t>(prep_res, dets, labels);
  }
  return Status(eNotSupported);
}
// Converts raw (dets, labels) tensors into Detections in original-image
// coordinates. T is the label element type (see DispatchGetBBoxes).
// Optional img_metas entries consumed here:
//  * "scale_factor" (4 floats)    - resize scaling to undo;
//  * "pad_param" [top, left, ...] - padding to subtract;
//  * "border" [y, x, ...]         - crop offset (negated);
//  * "ori_shape" [_, H, W, ...]   - original image size used for clipping.
template <typename T>
Result<Detections> ResizeBBox::GetBBoxes(const Value& prep_res, const Tensor& dets,
                                         const Tensor& labels) {
  Detections objs;
  auto* dets_ptr = dets.data<float>();
  auto* labels_ptr = labels.data<T>();
  vector<float> scale_factor;
  if (prep_res.contains("scale_factor")) {
    from_value(prep_res["scale_factor"], scale_factor);
  } else {
    scale_factor = {1.f, 1.f, 1.f, 1.f};
  }
  int top_padding = 0;
  int left_padding = 0;
  if (prep_res.contains("pad_param")) {
    top_padding = prep_res["pad_param"][0].get<int>();
    left_padding = prep_res["pad_param"][1].get<int>();
  }
  float w_offset = 0.f;
  float h_offset = 0.f;
  if (prep_res.contains("border")) {
    w_offset = -prep_res["border"][1].get<int>();
    h_offset = -prep_res["border"][0].get<int>();
  }
  int ori_width = prep_res["ori_shape"][2].get<int>();
  int ori_height = prep_res["ori_shape"][1].get<int>();
  // `dets` has shape(1, n, 4) or shape(1, n, 5). The latter one has `score`
  auto bboxes_number = dets.shape()[1];
  auto channels = dets.shape()[2];
  for (auto i = 0; i < bboxes_number; ++i, dets_ptr += channels, ++labels_ptr) {
    float score = 0.f;
    // score filtering only applies when a score channel exists
    if (channels > 4 && dets_ptr[4] <= score_thr_) {
      continue;
    }
    score = channels > 4 ? dets_ptr[4] : score;
    auto left = dets_ptr[0];
    auto top = dets_ptr[1];
    auto right = dets_ptr[2];
    auto bottom = dets_ptr[3];
    MMDEPLOY_DEBUG("ori left {}, top {}, right {}, bottom {}, label {}", left, top, right, bottom,
                   *labels_ptr);
    auto rect = MapToOriginImage(left, top, right, bottom, scale_factor.data(), w_offset, h_offset,
                                 ori_width, ori_height, top_padding, left_padding);
    if (rect[2] - rect[0] < min_bbox_size_ || rect[3] - rect[1] < min_bbox_size_) {
      MMDEPLOY_DEBUG("ignore small bbox with width '{}' and height '{}", rect[2] - rect[0],
                     rect[3] - rect[1]);
      continue;
    }
    MMDEPLOY_DEBUG("remap left {}, top {}, right {}, bottom {}", rect[0], rect[1], rect[2],
                   rect[3]);
    Detection det{};
    det.index = i;
    det.label_id = static_cast<int>(*labels_ptr);
    det.score = score;
    det.bbox = rect;
    objs.push_back(std::move(det));
  }
  return objs;
}
MMDEPLOY_REGISTER_CODEBASE_COMPONENT(MMDetection, ResizeBBox);
} // namespace mmdeploy::mmdet
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_SRC_CODEBASE_MMDET_OBJECT_DETECTION_H_
#define MMDEPLOY_SRC_CODEBASE_MMDET_OBJECT_DETECTION_H_
#include "mmdeploy/codebase/mmdet/mmdet.h"
#include "mmdeploy/core/tensor.h"
#include "mmdeploy/core/utils/formatter.h"
namespace mmdeploy::mmdet {
// Maps raw detector output ("dets"/"labels", or SSD-style "detection_output")
// back to original-image coordinates and filters by score / bbox size.
class ResizeBBox : public MMDetection {
 public:
  explicit ResizeBBox(const Value& cfg);

  // prep_res: preprocess result (expects "img_metas"); infer_res: network output.
  Result<Value> operator()(const Value& prep_res, const Value& infer_res);

 protected:
  // Picks the GetBBoxes instantiation that matches the label tensor's dtype.
  Result<Detections> DispatchGetBBoxes(const Value& prep_res, const Tensor& dets,
                                       const Tensor& labels);
  template <typename T>
  Result<Detections> GetBBoxes(const Value& prep_res, const Tensor& dets, const Tensor& labels);
  // Returns {dets, labels}; converts "detection_output" to that pair if needed.
  std::vector<Tensor> GetDetsLabels(const Value& prep_res, const Value& infer_res);

 protected:
  constexpr static Device kHost{0, 0};
  float score_thr_{0.f};      // minimum confidence kept
  float min_bbox_size_{0.f};  // boxes narrower/shorter than this are dropped
};
} // namespace mmdeploy::mmdet
#endif // MMDEPLOY_SRC_CODEBASE_MMDET_OBJECT_DETECTION_H_
// Copyright (c) OpenMMLab. All rights reserved.
#include "rtmdet_head.h"
#include <math.h>
#include <algorithm>
#include <numeric>
#include "mmdeploy/core/model.h"
#include "mmdeploy/core/utils/device_utils.h"
#include "mmdeploy/core/utils/formatter.h"
#include "utils.h"
namespace mmdeploy::mmdet {
// Reads post-processing hyper-parameters and the anchor/point generator
// configuration (offset, per-level strides) from the deploy config.
// Throws (via .value()) if the config cannot be read.
RTMDetSepBNHead::RTMDetSepBNHead(const Value& cfg) : MMDetection(cfg) {
  auto init = [&]() -> Result<void> {
    auto model = cfg["context"]["model"].get<Model>();
    if (cfg.contains("params")) {
      nms_pre_ = cfg["params"].value("nms_pre", -1);
      score_thr_ = cfg["params"].value("score_thr", 0.02f);
      min_bbox_size_ = cfg["params"].value("min_bbox_size", 0);
      max_per_img_ = cfg["params"].value("max_per_img", 100);
      iou_threshold_ = cfg["params"].contains("nms")
                           ? cfg["params"]["nms"].value("iou_threshold", 0.45f)
                           : 0.45f;
      if (cfg["params"].contains("anchor_generator")) {
        // Use a float default: the previous int default (`0`) made `value`
        // deduce int, truncating fractional offsets (e.g. 0.5) before they
        // reached the float member `offset_`.
        offset_ = cfg["params"]["anchor_generator"].value("offset", 0.f);
        from_value(cfg["params"]["anchor_generator"]["strides"], strides_);
      }
    }
    return success();
  };
  init().value();
}
// Splits the flat list of network outputs into per-level cls_scores (first
// half) and bbox_preds (second half), moves everything to host and decodes.
// Assumes outputs are ordered [cls_0..cls_{k-1}, bbox_0..bbox_{k-1}].
Result<Value> RTMDetSepBNHead::operator()(const Value& prep_res, const Value& infer_res) {
  MMDEPLOY_DEBUG("prep_res: {}\ninfer_res: {}", prep_res, infer_res);
  try {
    std::vector<Tensor> cls_scores;
    std::vector<Tensor> bbox_preds;
    const Device kHost{0, 0};
    int i = 0;
    int divisor = infer_res.size() / 2;
    for (auto iter = infer_res.begin(); iter != infer_res.end(); iter++) {
      auto pred_map = iter->get<Tensor>();
      OUTCOME_TRY(auto _pred_map, MakeAvailableOnDevice(pred_map, kHost, stream()));
      if (i < divisor)
        cls_scores.push_back(_pred_map);
      else
        bbox_preds.push_back(_pred_map);
      i++;
    }
    OUTCOME_TRY(stream().Wait());
    OUTCOME_TRY(auto result, GetBBoxes(prep_res["img_metas"], bbox_preds, cls_scores));
    return to_value(result);
  } catch (...) {
    // NOTE(review): exceptions are swallowed without logging here.
    return Status(eFail);
  }
}
// Logistic function; computed in double precision then narrowed to float.
static float sigmoid(float x) { return 1.0 / (1.0 + expf(-x)); }
// Decodes all FPN levels into candidate boxes, sorts by score, runs NMS, and
// maps the survivors back to original-image coordinates.
// prep_res: img_metas (uses "scale_factor" and "ori_shape").
Result<Detections> RTMDetSepBNHead::GetBBoxes(const Value& prep_res,
                                              const std::vector<Tensor>& bbox_preds,
                                              const std::vector<Tensor>& cls_scores) const {
  // Guard the debug peeks below against empty inputs.
  if (bbox_preds.empty() || cls_scores.empty()) {
    return Detections{};
  }
  // Fix: these debug statements previously referenced `dets[0]` (declared
  // only further down) and a non-existent `scores`, which cannot compile when
  // MMDEPLOY_DEBUG evaluates its arguments.
  MMDEPLOY_DEBUG("bbox_pred: {}, {}", bbox_preds[0].shape(), bbox_preds[0].data_type());
  MMDEPLOY_DEBUG("cls_score: {}, {}", cls_scores[0].shape(), cls_scores[0].data_type());
  // Decode every level into flat candidate lists.
  std::vector<float> filter_boxes;
  std::vector<float> obj_probs;
  std::vector<int> class_ids;
  for (size_t i = 0; i < bbox_preds.size(); i++) {
    RTMDetFeatDeocde(bbox_preds[i], cls_scores[i], strides_[i], offset_, filter_boxes, obj_probs,
                     class_ids);
  }
  // Sort candidates by descending score; indexArray tracks each candidate's
  // row in `dets` and is consumed/marked by NMS.
  std::vector<int> indexArray;
  for (int i = 0; i < (int)obj_probs.size(); ++i) {
    indexArray.push_back(i);
  }
  Sort(obj_probs, class_ids, indexArray);
  Tensor dets(TensorDesc{Device{0, 0}, DataType::kFLOAT,
                         TensorShape{int(filter_boxes.size() / 4), 4}, "dets"});
  std::copy(filter_boxes.begin(), filter_boxes.end(), dets.data<float>());
  NMS(dets, iou_threshold_, indexArray);
  Detections objs;
  std::vector<float> scale_factor;
  if (prep_res.contains("scale_factor")) {
    from_value(prep_res["scale_factor"], scale_factor);
  } else {
    scale_factor = {1.f, 1.f, 1.f, 1.f};
  }
  int ori_width = prep_res["ori_shape"][2].get<int>();
  int ori_height = prep_res["ori_shape"][1].get<int>();
  auto det_ptr = dets.data<float>();
  for (size_t i = 0; i < indexArray.size(); ++i) {
    if (indexArray[i] == -1) {
      continue;  // suppressed by NMS
    }
    int j = indexArray[i];
    auto x1 = det_ptr[j * 4 + 0];
    auto y1 = det_ptr[j * 4 + 1];
    auto x2 = det_ptr[j * 4 + 2];
    auto y2 = det_ptr[j * 4 + 3];
    int label_id = class_ids[i];
    float score = obj_probs[i];
    MMDEPLOY_DEBUG("{}-th box: ({}, {}, {}, {}), {}, {}", i, x1, y1, x2, y2, label_id, score);
    auto rect =
        MapToOriginImage(x1, y1, x2, y2, scale_factor.data(), 0, 0, ori_width, ori_height, 0, 0);
    if (rect[2] - rect[0] < min_bbox_size_ || rect[3] - rect[1] < min_bbox_size_) {
      MMDEPLOY_DEBUG("ignore small bbox with width '{}' and height '{}", rect[2] - rect[0],
                     rect[3] - rect[1]);
      continue;
    }
    Detection det{};
    det.index = i;
    det.label_id = label_id;
    det.score = score;
    det.bbox = rect;
    objs.push_back(std::move(det));
  }
  return objs;
}
// Decodes one FPN level: for every grid cell, takes the argmax class, applies
// sigmoid + score threshold, and converts the four predicted side distances
// into corner coordinates via RTMDetdecode.
// Layouts inferred from the indexing below: cls_score (b, num_classes, h, w),
// bbox_pred (b, 4, h, w), batch size 1 — TODO confirm against the exporter.
// Appends to the three output vectors; returns the number of kept candidates.
int RTMDetSepBNHead::RTMDetFeatDeocde(const Tensor& bbox_pred, const Tensor& cls_score,
                                      const float stride, const float offset,
                                      std::vector<float>& filter_boxes,
                                      std::vector<float>& obj_probs,
                                      std::vector<int>& class_ids) const {
  int cls_param_num = cls_score.shape(1);
  int feat_h = bbox_pred.shape(2);
  int feat_w = bbox_pred.shape(3);
  int feat_size = feat_h * feat_w;
  auto bbox_ptr = bbox_pred.data<float>();
  auto score_ptr = cls_score.data<float>();  // (b, c, h, w)
  int valid_count = 0;
  for (int i = 0; i < feat_h; i++) {
    for (int j = 0; j < feat_w; j++) {
      // argmax over class logits at this cell (seeded with class 0's score)
      float max_score = score_ptr[i * feat_w + j];
      int class_id = 0;
      for (int k = 0; k < cls_param_num; k++) {
        float score = score_ptr[k * feat_size + i * feat_w + j];
        if (score > max_score) {
          max_score = score;
          class_id = k;
        }
      }
      max_score = sigmoid(max_score);
      if (max_score < score_thr_) continue;
      obj_probs.push_back(max_score);
      class_ids.push_back(class_id);
      // predicted distances from the cell point to the four box sides
      float tl_x = bbox_ptr[0 * feat_size + i * feat_w + j];
      float tl_y = bbox_ptr[1 * feat_size + i * feat_w + j];
      float br_x = bbox_ptr[2 * feat_size + i * feat_w + j];
      float br_y = bbox_ptr[3 * feat_size + i * feat_w + j];
      auto box = RTMDetdecode(tl_x, tl_y, br_x, br_y, stride, offset, j, i);
      tl_x = box[0];
      tl_y = box[1];
      br_x = box[2];
      br_y = box[3];
      filter_boxes.push_back(tl_x);
      filter_boxes.push_back(tl_y);
      filter_boxes.push_back(br_x);
      filter_boxes.push_back(br_y);
      valid_count++;
    }
  }
  return valid_count;
}
// Converts the four side distances predicted at grid cell (j, i) into
// absolute corner coordinates: the cell's (offset-shifted) point is scaled by
// the stride, then the distances are subtracted/added.
std::array<float, 4> RTMDetSepBNHead::RTMDetdecode(float tl_x, float tl_y, float br_x, float br_y,
                                                   float stride, float offset, int j, int i) const {
  const float px = (offset + j) * stride;
  const float py = (offset + i) * stride;
  return {px - tl_x, py - tl_y, px + br_x, py + br_y};
}
MMDEPLOY_REGISTER_CODEBASE_COMPONENT(MMDetection, RTMDetSepBNHead);
} // namespace mmdeploy::mmdet
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_CODEBASE_MMDET_RTMDET_HEAD_H_
#define MMDEPLOY_CODEBASE_MMDET_RTMDET_HEAD_H_
#include "mmdeploy/codebase/mmdet/mmdet.h"
#include "mmdeploy/core/tensor.h"
namespace mmdeploy::mmdet {
// Post-processing head for RTMDet: decodes per-level side-distance
// predictions, applies score thresholding + NMS, and maps the surviving boxes
// back to original-image coordinates.
class RTMDetSepBNHead : public MMDetection {
 public:
  explicit RTMDetSepBNHead(const Value& cfg);

  Result<Value> operator()(const Value& prep_res, const Value& infer_res);

  // Decodes all levels, sorts by score, runs NMS and rescales to ori_shape.
  Result<Detections> GetBBoxes(const Value& prep_res, const std::vector<Tensor>& bbox_preds,
                               const std::vector<Tensor>& cls_scores) const;
  // Decodes one feature level; returns the number of candidates kept.
  int RTMDetFeatDeocde(const Tensor& bbox_pred, const Tensor& cls_score, const float stride,
                       const float offset, std::vector<float>& filter_boxes,
                       std::vector<float>& obj_probs, std::vector<int>& class_ids) const;
  // Converts side distances at grid cell (j, i) into corner coordinates.
  std::array<float, 4> RTMDetdecode(float tl_x, float tl_y, float br_x, float br_y, float stride,
                                    float offset, int j, int i) const;

 private:
  float score_thr_{0.4f};       // minimum sigmoid class score kept
  int nms_pre_{1000};           // NOTE(review): read from config but unused in this TU
  float iou_threshold_{0.45f};  // NMS IoU threshold
  int min_bbox_size_{0};        // boxes smaller than this are dropped
  int max_per_img_{100};        // NOTE(review): read from config but unused in this TU
  float offset_{0.0f};          // point offset within a grid cell
  std::vector<float> strides_;  // per-level downsample strides
};
} // namespace mmdeploy::mmdet
#endif // MMDEPLOY_CODEBASE_MMDET_RTMDET_HEAD_H_
// Copyright (c) OpenMMLab. All rights reserved.
#include "utils.h"
#include <algorithm>
#include <numeric>
using mmdeploy::framework::Tensor;
namespace mmdeploy::mmdet {
// Maps a box from network-input coordinates back to the original image:
// removes padding, undoes per-axis resize scaling, applies the crop/border
// offset, then clips to [0, ori_width-1] x [0, ori_height-1].
std::array<float, 4> MapToOriginImage(float left, float top, float right, float bottom,
                                      const float* scale_factor, float x_offset, float y_offset,
                                      int ori_width, int ori_height, int top_padding,
                                      int left_padding) {
  const float x0 = (left - left_padding) / scale_factor[0] + x_offset;
  const float y0 = (top - top_padding) / scale_factor[1] + y_offset;
  const float x1 = (right - left_padding) / scale_factor[2] + x_offset;
  const float y1 = (bottom - top_padding) / scale_factor[3] + y_offset;
  return {std::max(x0, 0.f), std::max(y0, 0.f),
          std::min(x1, static_cast<float>(ori_width) - 1.f),
          std::min(y1, static_cast<float>(ori_height) - 1.f)};
}
// Filters per-anchor class scores: for each of the shape(1) candidate rows,
// keeps the best of the shape(2) class scores when it reaches `score_thr`,
// recording the score, the class index and the row index.
// NOTE(review): `topk` is accepted for API compatibility but is not applied
// here — candidates are only filtered by `score_thr`. Confirm whether top-k
// truncation is expected to happen in the caller.
void FilterScoresAndTopk(const Tensor& scores, float score_thr, int topk, std::vector<float>& probs,
                         std::vector<int>& label_ids, std::vector<int>& anchor_idxs) {
  auto kDets = scores.shape(1);
  auto kClasses = scores.shape(2);
  auto score_ptr = scores.data<float>();
  for (auto i = 0; i < kDets; ++i, score_ptr += kClasses) {
    // best class for this candidate (removed the previously unused copy of *iter)
    auto iter = std::max_element(score_ptr, score_ptr + kClasses);
    if (*iter < score_thr) {
      continue;
    }
    probs.push_back(*iter);
    label_ids.push_back(static_cast<int>(iter - score_ptr));
    anchor_idxs.push_back(static_cast<int>(i));
  }
}
// Intersection-over-union of two corner-format boxes.
// Returns 0 for non-overlapping boxes and for degenerate input (zero-area or
// inverted boxes), where the old code divided 0/0 and returned NaN — the NaN
// then slipped through the `iou <= 0` guard because NaN comparisons are false.
float IOU(float xmin0, float ymin0, float xmax0, float ymax0, float xmin1, float ymin1, float xmax1,
          float ymax1) {
  auto w = std::max(0.f, std::min(xmax0, xmax1) - std::max(xmin0, xmin1));
  auto h = std::max(0.f, std::min(ymax0, ymax1) - std::max(ymin0, ymin1));
  auto area = w * h;
  auto sum = (xmax0 - xmin0) * (ymax0 - ymin0) + (xmax1 - xmin1) * (ymax1 - ymin1);
  auto denom = sum - area;  // union area
  if (denom <= 0.f) {
    return 0.f;
  }
  auto iou = area / denom;
  return iou <= 0.f ? 0.f : iou;
}
// Greedy non-maximum suppression.
// `keep_idxs` holds row indices into `dets` (n, 4) in descending score order;
// suppressed entries are overwritten with -1.
// Fix: entries already set to -1 were previously still dereferenced as row
// indices, reading det_ptr[-4..-1] out of bounds and suppressing boxes
// against garbage data. Suppressed entries are now skipped in both loops.
void NMS(const Tensor& dets, float iou_threshold, std::vector<int>& keep_idxs) {
  auto det_ptr = dets.data<float>();
  for (size_t i = 0; i < keep_idxs.size(); ++i) {
    auto n = keep_idxs[i];
    if (n == -1) {
      continue;  // already suppressed
    }
    for (size_t j = i + 1; j < keep_idxs.size(); ++j) {
      auto m = keep_idxs[j];
      if (m == -1) {
        continue;  // already suppressed
      }
      // `delta_xywh_bbox_coder` decode return tl_x, tl_y, br_x, br_y
      float xmin0 = det_ptr[n * 4 + 0];
      float ymin0 = det_ptr[n * 4 + 1];
      float xmax0 = det_ptr[n * 4 + 2];
      float ymax0 = det_ptr[n * 4 + 3];
      float xmin1 = det_ptr[m * 4 + 0];
      float ymin1 = det_ptr[m * 4 + 1];
      float xmax1 = det_ptr[m * 4 + 2];
      float ymax1 = det_ptr[m * 4 + 3];
      float iou = IOU(xmin0, ymin0, xmax0, ymax0, xmin1, ymin1, xmax1, ymax1);
      if (iou > iou_threshold) {
        keep_idxs[j] = -1;
      }
    }
  }
}
// Reorders the three parallel arrays in place by descending probability.
// A permutation is computed once over `probs` and then applied to all arrays.
void Sort(std::vector<float>& probs, std::vector<int>& label_ids, std::vector<int>& anchor_idxs) {
  const size_t n = probs.size();
  std::vector<int> order(n);
  std::iota(order.begin(), order.end(), 0);
  std::sort(order.begin(), order.end(),
            [&probs](int a, int b) { return probs[a] > probs[b]; });
  std::vector<float> sorted_probs;
  std::vector<int> sorted_labels;
  std::vector<int> sorted_anchors;
  sorted_probs.reserve(n);
  sorted_labels.reserve(n);
  sorted_anchors.reserve(n);
  for (int idx : order) {
    sorted_probs.push_back(probs[idx]);
    sorted_labels.push_back(label_ids[idx]);
    sorted_anchors.push_back(anchor_idxs[idx]);
  }
  probs.swap(sorted_probs);
  label_ids.swap(sorted_labels);
  anchor_idxs.swap(sorted_anchors);
}
} // namespace mmdeploy::mmdet
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_CODEBASE_MMDET_UTILS_H_
#define MMDEPLOY_CODEBASE_MMDET_UTILS_H_
#include <array>
#include <vector>
#include "mmdeploy/core/tensor.h"
namespace mmdeploy::mmdet {
// Maps a box from network-input coordinates back to the original image:
// removes padding, undoes per-axis resize scaling, applies crop offsets and
// clips to [0, ori_width-1] x [0, ori_height-1].
std::array<float, 4> MapToOriginImage(float left, float top, float right, float bottom,
                                      const float* scale_factor, float x_offset, float y_offset,
                                      int ori_width, int ori_height, int top_padding,
                                      int left_padding);

// @brief Filter results using score threshold and topk candidates.
// scores (Tensor): The scores, shape (num_bboxes, K).
// probs: The scores after being filtered
// label_ids: The class labels
// anchor_idxs: The anchor indexes
void FilterScoresAndTopk(const mmdeploy::framework::Tensor& scores, float score_thr, int topk,
                         std::vector<float>& probs, std::vector<int>& label_ids,
                         std::vector<int>& anchor_idxs);

// Intersection-over-union of two corner-format boxes; 0 when disjoint.
float IOU(float xmin0, float ymin0, float xmax0, float ymax0, float xmin1, float ymin1, float xmax1,
          float ymax1);

// Sorts the three parallel arrays in place by descending probability.
void Sort(std::vector<float>& probs, std::vector<int>& label_ids, std::vector<int>& anchor_idxs);

// Greedy NMS over `dets` (n, 4); suppressed entries in `keep_idxs` become -1.
void NMS(const mmdeploy::framework::Tensor& dets, float iou_threshold, std::vector<int>& keep_idxs);
} // namespace mmdeploy::mmdet
#endif // MMDEPLOY_CODEBASE_MMDET_UTILS_H_
// Copyright (c) OpenMMLab. All rights reserved.
#include "yolo_head.h"
#include <math.h>
#include <algorithm>
#include <numeric>
#include "mmdeploy/core/model.h"
#include "mmdeploy/core/utils/device_utils.h"
#include "mmdeploy/core/utils/formatter.h"
#include "utils.h"
namespace mmdeploy::mmdet {
// Reads post-processing hyper-parameters and the anchor configuration
// (base_sizes / strides) from the deploy config.
// Throws (via .value()) if the config cannot be read.
YOLOHead::YOLOHead(const Value& cfg) : MMDetection(cfg) {
  auto init = [&]() -> Result<void> {
    auto model = cfg["context"]["model"].get<Model>();
    if (cfg.contains("params")) {
      nms_pre_ = cfg["params"].value("nms_pre", -1);
      score_thr_ = cfg["params"].value("score_thr", 0.02f);
      min_bbox_size_ = cfg["params"].value("min_bbox_size", 0);
      // NMS IoU threshold; falls back to 0.45 when no "nms" block is present
      iou_threshold_ = cfg["params"].contains("nms")
                           ? cfg["params"]["nms"].value("iou_threshold", 0.45f)
                           : 0.45f;
      if (cfg["params"].contains("anchor_generator")) {
        from_value(cfg["params"]["anchor_generator"]["base_sizes"], anchors_);
        from_value(cfg["params"]["anchor_generator"]["strides"], strides_);
      }
    }
    return success();
  };
  init().value();
}
// Moves every prediction map to host, orders them to match `strides_` /
// `anchors_`, then decodes them into detections.
Result<Value> YOLOHead::operator()(const Value& prep_res, const Value& infer_res) {
  MMDEPLOY_DEBUG("prep_res: {}\ninfer_res: {}", prep_res, infer_res);
  try {
    const Device kHost{0, 0};
    std::vector<Tensor> pred_maps;
    for (auto iter = infer_res.begin(); iter != infer_res.end(); iter++) {
      auto pred_map = iter->get<Tensor>();
      OUTCOME_TRY(auto _pred_map, MakeAvailableOnDevice(pred_map, kHost, stream()));
      pred_maps.push_back(_pred_map);
    }
    OUTCOME_TRY(stream().Wait());
    // reorder pred_maps according to strides and anchors, mainly for rknpu yolov3:
    // a larger stride must correspond to a smaller feature map, so reverse when
    // the stride order and the width order agree.
    if ((pred_maps.size() > 1) &&
        !((strides_[0] < strides_[1]) ^ (pred_maps[0].shape(3) < pred_maps[1].shape(3)))) {
      std::reverse(pred_maps.begin(), pred_maps.end());
    }
    OUTCOME_TRY(auto result, GetBBoxes(prep_res["img_metas"], pred_maps));
    return to_value(result);
  } catch (...) {
    // NOTE(review): exceptions are swallowed without logging here.
    return Status(eFail);
  }
}
// Clamps `val` to [min, max]; an in-range value is truncated toward zero by
// the float -> int return conversion.
inline static int clamp(float val, int min, int max) {
  if (!(val > min)) {
    return min;
  }
  return val < max ? val : max;
}

// Logistic function; arithmetic kept in double precision as before.
static float sigmoid(float x) { return 1.0 / (1.0 + expf(-x)); }

// Inverse of sigmoid (logit); used to pre-threshold raw confidences so the
// hot loop can compare logits directly.
static float unsigmoid(float y) { return -1.0 * logf((1.0 / y) - 1.0); }
// Decodes one YOLO feature level.
// `feat_map` channel layout (inferred from the indexing below):
// anchor-major blocks of [x, y, w, h, objectness, cls_0..cls_{N-1}], each
// channel a (grid_h, grid_w) plane. For every anchor/cell whose objectness
// passes `threshold`, the box is decoded via the subclass's yolo_decode and
// sigmoid(best_class) * sigmoid(objectness) is recorded with the class id.
// Returns the number of boxes kept. `height`/`width` are currently unused.
int YOLOHead::YOLOFeatDecode(const Tensor& feat_map, const std::vector<std::vector<float>>& anchor,
                             int grid_h, int grid_w, int height, int width, int stride,
                             std::vector<float>& boxes, std::vector<float>& obj_probs,
                             std::vector<int>& class_id, float threshold) const {
  auto input = const_cast<float*>(feat_map.data<float>());
  auto prop_box_size = feat_map.shape(1) / anchor.size();
  const int kClasses = prop_box_size - 5;
  int valid_count = 0;
  int grid_len = grid_h * grid_w;
  // compare raw logits against unsigmoid(threshold) instead of running
  // sigmoid on every cell's objectness
  float thres = unsigmoid(threshold);
  for (int a = 0; a < anchor.size(); a++) {
    for (int i = 0; i < grid_h; i++) {
      for (int j = 0; j < grid_w; j++) {
        float box_confidence = input[(prop_box_size * a + 4) * grid_len + i * grid_w + j];
        if (box_confidence >= thres) {
          // start of this anchor's channel block at cell (i, j)
          int offset = (prop_box_size * a) * grid_len + i * grid_w + j;
          float* in_ptr = input + offset;
          float box_x = sigmoid(*in_ptr);
          float box_y = sigmoid(in_ptr[grid_len]);
          float box_w = in_ptr[2 * grid_len];
          float box_h = in_ptr[3 * grid_len];
          auto box = yolo_decode(box_x, box_y, box_w, box_h, stride, anchor, j, i, a);
          box_x = box[0];
          box_y = box[1];
          box_w = box[2];
          box_h = box[3];
          // centre/size -> corner form
          box_x -= (box_w / 2.0);
          box_y -= (box_h / 2.0);
          boxes.push_back(box_x);
          boxes.push_back(box_y);
          boxes.push_back(box_x + box_w);
          boxes.push_back(box_y + box_h);
          // argmax over raw class logits
          float max_class_probs = in_ptr[5 * grid_len];
          int max_class_id = 0;
          for (int k = 1; k < kClasses; ++k) {
            float prob = in_ptr[(5 + k) * grid_len];
            if (prob > max_class_probs) {
              max_class_id = k;
              max_class_probs = prob;
            }
          }
          obj_probs.push_back(sigmoid(max_class_probs) * sigmoid(box_confidence));
          class_id.push_back(max_class_id);
          valid_count++;
        }
      }
    }
  }
  return valid_count;
}
// Decodes all prediction maps, applies score-sorted NMS, and maps the
// surviving boxes back to original-image coordinates (undoing resize and
// padding recorded in img_metas).
Result<Detections> YOLOHead::GetBBoxes(const Value& prep_res,
                                       const std::vector<Tensor>& pred_maps) const {
  std::vector<float> filter_boxes;
  std::vector<float> obj_probs;
  std::vector<int> class_id;
  int model_in_h = prep_res["img_shape"][1].get<int>();
  int model_in_w = prep_res["img_shape"][2].get<int>();
  for (int i = 0; i < pred_maps.size(); i++) {
    int stride = strides_[i];
    int grid_h = model_in_h / stride;
    int grid_w = model_in_w / stride;
    YOLOFeatDecode(pred_maps[i], anchors_[i], grid_h, grid_w, model_in_h, model_in_w, stride,
                   filter_boxes, obj_probs, class_id, score_thr_);
  }
  // NMS expects candidates in descending score order; Sort reorders all three
  // parallel arrays while indexArray tracks each candidate's box-row index.
  std::vector<int> indexArray;
  for (int i = 0; i < obj_probs.size(); ++i) {
    indexArray.push_back(i);
  }
  Sort(obj_probs, class_id, indexArray);
  Tensor dets(TensorDesc{Device{0, 0}, DataType::kFLOAT,
                         TensorShape{int(filter_boxes.size() / 4), 4}, "dets"});
  std::copy(filter_boxes.begin(), filter_boxes.end(), dets.data<float>());
  NMS(dets, iou_threshold_, indexArray);
  Detections objs;
  std::vector<float> scale_factor;
  if (prep_res.contains("scale_factor")) {
    from_value(prep_res["scale_factor"], scale_factor);
  } else {
    scale_factor = {1.f, 1.f, 1.f, 1.f};
  }
  int top_padding = 0;
  int left_padding = 0;
  if (prep_res.contains("pad_param")) {
    top_padding = prep_res["pad_param"][0].get<int>();
    left_padding = prep_res["pad_param"][1].get<int>();
  }
  int ori_width = prep_res["ori_shape"][2].get<int>();
  int ori_height = prep_res["ori_shape"][1].get<int>();
  auto det_ptr = dets.data<float>();
  for (int i = 0; i < indexArray.size(); ++i) {
    if (indexArray[i] == -1) {
      continue;  // suppressed by NMS
    }
    int j = indexArray[i];
    // clamp to the network input frame (clamp also truncates to int)
    auto x1 = clamp(det_ptr[j * 4 + 0], 0, model_in_w);
    auto y1 = clamp(det_ptr[j * 4 + 1], 0, model_in_h);
    auto x2 = clamp(det_ptr[j * 4 + 2], 0, model_in_w);
    auto y2 = clamp(det_ptr[j * 4 + 3], 0, model_in_h);
    int label_id = class_id[i];
    float score = obj_probs[i];
    MMDEPLOY_DEBUG("{}-th box: ({}, {}, {}, {}), {}, {}", i, x1, y1, x2, y2, label_id, score);
    auto rect = MapToOriginImage(x1, y1, x2, y2, scale_factor.data(), 0, 0, ori_width, ori_height,
                                 top_padding, left_padding);
    if (rect[2] - rect[0] < min_bbox_size_ || rect[3] - rect[1] < min_bbox_size_) {
      MMDEPLOY_DEBUG("ignore small bbox with width '{}' and height '{}", rect[2] - rect[0],
                     rect[3] - rect[1]);
      continue;
    }
    Detection det{};
    det.index = i;
    det.label_id = label_id;
    det.score = score;
    det.bbox = rect;
    objs.push_back(std::move(det));
  }
  return objs;
}
// YOLOv3 box decoding: the sigmoid-activated center offsets are shifted by the
// grid cell (j, i) and scaled to input pixels by the stride; raw width/height
// are exponentiated and scaled by the matched anchor's dimensions.
// Returns (center_x, center_y, width, height) in input-image pixels.
std::array<float, 4> YOLOV3Head::yolo_decode(float box_x, float box_y, float box_w, float box_h,
                                             float stride,
                                             const std::vector<std::vector<float>>& anchor, int j,
                                             int i, int a) const {
  const float cx = (box_x + j) * stride;
  const float cy = (box_y + i) * stride;
  const float w = expf(box_w) * anchor[a][0];
  const float h = expf(box_h) * anchor[a][1];
  return {cx, cy, w, h};
}
// YOLOv5 box decoding: center = (2 * sigmoid(t) - 0.5 + grid) * stride and
// size = (2 * sigmoid(t))^2 * anchor. box_x/box_y arrive already
// sigmoid-activated; box_w/box_h are raw logits.
// Returns (center_x, center_y, width, height) in input-image pixels.
std::array<float, 4> YOLOv5Head::yolo_decode(float box_x, float box_y, float box_w, float box_h,
                                             float stride,
                                             const std::vector<std::vector<float>>& anchor, int j,
                                             int i, int a) const {
  const float cx = (box_x * 2 - 0.5 + j) * stride;
  const float cy = (box_y * 2 - 0.5 + i) * stride;
  const float sw = sigmoid(box_w) * 2;
  const float sh = sigmoid(box_h) * 2;
  return {cx, cy, sw * sw * anchor[a][0], sh * sh * anchor[a][1]};
}
// Register both YOLO head flavours so pipelines can instantiate them by name.
MMDEPLOY_REGISTER_CODEBASE_COMPONENT(MMDetection, YOLOV3Head);
MMDEPLOY_REGISTER_CODEBASE_COMPONENT(MMDetection, YOLOv5Head);
} // namespace mmdeploy::mmdet
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_CODEBASE_MMDET_YOLO_HEAD_H_
#define MMDEPLOY_CODEBASE_MMDET_YOLO_HEAD_H_
#include "mmdeploy/codebase/mmdet/mmdet.h"
#include "mmdeploy/core/tensor.h"
namespace mmdeploy::mmdet {
// Shared post-processing for YOLO-family dense heads: decodes raw feature
// maps into candidate boxes, filters by score, sorts and applies NMS.
class YOLOHead : public MMDetection {
 public:
  explicit YOLOHead(const Value& cfg);
  // Entry point: consumes preprocessing metadata and raw network outputs.
  Result<Value> operator()(const Value& prep_res, const Value& infer_res);
  // Decodes one feature-map level; appends candidates that pass `threshold`
  // to boxes/obj_probs/class_id and returns the number of boxes kept.
  int YOLOFeatDecode(const Tensor& feat_map, const std::vector<std::vector<float>>& anchor,
                     int grid_h, int grid_w, int height, int width, int stride,
                     std::vector<float>& boxes, std::vector<float>& obj_probs,
                     std::vector<int>& class_id, float threshold) const;
  // Turns all decoded levels into final detections in original-image space.
  Result<Detections> GetBBoxes(const Value& prep_res, const std::vector<Tensor>& pred_maps) const;
  // Version-specific box parameterization (v3 vs v5). Returns
  // (center_x, center_y, width, height) in input-image pixels for grid cell
  // (j, i) and anchor index a.
  virtual std::array<float, 4> yolo_decode(float box_x, float box_y, float box_w, float box_h,
                                           float stride,
                                           const std::vector<std::vector<float>>& anchor, int j,
                                           int i, int a) const = 0;
 private:
  float score_thr_{0.4f};       // minimum confidence to keep a candidate
  int nms_pre_{1000};           // candidate cap before NMS
  float iou_threshold_{0.45f};  // IoU threshold used by NMS
  int min_bbox_size_{0};        // boxes smaller than this are dropped
  std::vector<std::vector<std::vector<float>>> anchors_;  // per level, per anchor (w, h)
  std::vector<float> strides_;  // downsample stride of each level
};
// YOLOv3-style decoding: grid-offset centers plus exp() sizes scaled by anchors.
class YOLOV3Head : public YOLOHead {
 public:
  using YOLOHead::YOLOHead;
  std::array<float, 4> yolo_decode(float box_x, float box_y, float box_w, float box_h, float stride,
                                   const std::vector<std::vector<float>>& anchor, int j, int i,
                                   int a) const override;
};
// YOLOv5-style decoding: (2*sigmoid - 0.5) centers and squared-sigmoid sizes.
class YOLOv5Head : public YOLOHead {
 public:
  using YOLOHead::YOLOHead;
  std::array<float, 4> yolo_decode(float box_x, float box_y, float box_w, float box_h, float stride,
                                   const std::vector<std::vector<float>>& anchor, int j, int i,
                                   int a) const override;
};
} // namespace mmdeploy::mmdet
#endif // MMDEPLOY_CODEBASE_MMDET_YOLO_HEAD_H_
# Copyright (c) OpenMMLab. All rights reserved.
# Build the MMEditing (image restoration) codebase module.
project(mmdeploy_mmedit)
# NOTE(review): passing the source dir itself as a glob expression is unusual;
# the conventional form is "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp" -- confirm intent.
file(GLOB_RECURSE SRCS ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp")
mmdeploy_add_module(${PROJECT_NAME} "${SRCS}")
target_link_libraries(${PROJECT_NAME}
        PRIVATE mmdeploy_opencv_utils)
add_library(mmdeploy::mmedit ALIAS ${PROJECT_NAME})
# Expose the "restorer" task to the top-level build.
set(MMDEPLOY_TASKS ${MMDEPLOY_TASKS} restorer CACHE INTERNAL "")
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/codebase/mmedit/mmedit.h"
#include "mmdeploy/core/registry.h"
namespace mmdeploy::mmedit {
// Register the MMEdit codebase so its components can be created from configs.
MMDEPLOY_REGISTER_CODEBASE(MMEdit);
}  // namespace mmdeploy::mmedit
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_SRC_CODEBASE_MMEDIT_MMEDIT_H_
#define MMDEPLOY_SRC_CODEBASE_MMEDIT_MMEDIT_H_
#include "mmdeploy/codebase/common.h"
#include "mmdeploy/core/device.h"
#include "mmdeploy/core/mat.h"
#include "mmdeploy/core/module.h"
#include "mmdeploy/core/serialization.h"
namespace mmdeploy::mmedit {
// A restorer's output is simply the restored image.
using RestorerOutput = Mat;
MMDEPLOY_DECLARE_CODEBASE(MMEdit, mmedit);
}  // namespace mmdeploy::mmedit
#endif // MMDEPLOY_SRC_CODEBASE_MMEDIT_MMEDIT_H_
// Copyright (c) OpenMMLab. All rights reserved.
#include <opencv2/core.hpp>
#include "mmdeploy/codebase/mmedit/mmedit.h"
#include "mmdeploy/core/tensor.h"
#include "mmdeploy/core/utils/device_utils.h"
namespace mmdeploy::mmedit {
// Converts the restorer's float CHW output tensor into an 8-bit image Mat.
class TensorToImg : public MMEdit {
 public:
  explicit TensorToImg(const Value& cfg) : MMEdit(cfg) {}
  // Expects input["output"] to be a 4-D float tensor laid out (N, C, H, W);
  // presumably N == 1 and values are in [0, 1] (scaled by 255 below) --
  // TODO confirm against the producing model.
  Result<Value> operator()(const Value& input) {
    auto upscale = input["output"].get<Tensor>();
    // Copy to host memory (no-op if already there) and wait for the copy.
    OUTCOME_TRY(auto upscale_cpu, MakeAvailableOnDevice(upscale, kHOST, stream()));
    OUTCOME_TRY(stream().Wait());
    if (upscale.shape().size() == 4 && upscale.data_type() == DataType::kFLOAT) {
      auto channels = static_cast<int>(upscale.shape(1));
      auto height = static_cast<int>(upscale.shape(2));
      auto width = static_cast<int>(upscale.shape(3));
      // TODO: handle BGR <-> RGB conversion
      OUTCOME_TRY(auto format, ChannelsToFormat(channels));
      Mat mat(height, width, format, DataType::kINT8, kHOST);
      // View the planar CHW data as a (C x H*W) matrix and transpose it to
      // (H*W x C), i.e. produce interleaved HWC pixel order.
      cv::Mat_<float> mat_chw(channels, height * width, upscale_cpu.data<float>());
      cv::Mat mat_hwc(height * width, channels, CV_32F);
      cv::transpose(mat_chw, mat_hwc);
      // Wrap the output Mat's buffer so convertTo writes straight into it.
      cv::Mat rescale_uint8(height, width, CV_8UC(channels), mat.data<uint8_t>());
      mat_hwc = mat_hwc.reshape(channels, height);
      // convert has saturate_cast inside
      mat_hwc.convertTo(rescale_uint8, CV_8UC(channels), 255.f);
      return mat;
    } else {
      MMDEPLOY_ERROR("unsupported `output` tensor, shape: {}, dtype: {}", upscale.shape(),
                     (int)upscale.data_type());
      return Status(eNotSupported);
    }
  }
 protected:
  // Maps channel count to a pixel format: 1 -> grayscale, 3 -> RGB.
  static Result<PixelFormat> ChannelsToFormat(int channels) {
    switch (channels) {
      case 1:
        return PixelFormat::kGRAYSCALE;
      case 3:
        return PixelFormat::kRGB;
      default:
        return Status(eNotSupported);
    }
  }
  static constexpr const Device kHOST{0, 0};
};
MMDEPLOY_REGISTER_CODEBASE_COMPONENT(MMEdit, TensorToImg);
} // namespace mmdeploy::mmedit
# Copyright (c) OpenMMLab. All rights reserved.
# Build the MMOCR codebase module (text detection + recognition).
project(mmdeploy_mmocr)
# Gather all OCR sources plus the bundled Clipper polygon-clipping library.
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} OCR_SRCS)
aux_source_directory(${CMAKE_SOURCE_DIR}/third_party/clipper CLIPPER_SRCS)
set(SRCS ${OCR_SRCS} ${CLIPPER_SRCS})
mmdeploy_add_module(${PROJECT_NAME} "${SRCS}")
# Device-specific implementations.
add_subdirectory(cpu)
add_subdirectory(cuda)
target_include_directories(${PROJECT_NAME} PRIVATE
        ${CMAKE_SOURCE_DIR}/third_party/clipper)
target_link_libraries(${PROJECT_NAME} PRIVATE
        mmdeploy_opencv_utils
        mmdeploy_operation
        mmdeploy::transform
        mmdeploy::core)
add_library(mmdeploy::mmocr ALIAS ${PROJECT_NAME})
# Expose the OCR tasks to the top-level build.
set(MMDEPLOY_TASKS ${MMDEPLOY_TASKS} text_detector text_recognizer CACHE INTERNAL "")
// Copyright (c) OpenMMLab. All rights reserved.
#include <algorithm>
#include <sstream>
#include "base_convertor.h"
#include "mmdeploy/core/device.h"
#include "mmdeploy/core/model.h"
#include "mmdeploy/core/registry.h"
#include "mmdeploy/core/tensor.h"
#include "mmdeploy/core/utils/device_utils.h"
#include "mmdeploy/core/utils/formatter.h"
#include "mmdeploy/core/value.h"
#include "mmdeploy/experimental/module_adapter.h"
#include "mmocr.h"
namespace mmdeploy::mmocr {
using std::string;
using std::vector;
// Decodes attention-based text recognizer output into a string via greedy
// per-step arg-max over character classes.
class AttnConvertor : public BaseConvertor {
 public:
  explicit AttnConvertor(const Value& cfg) : BaseConvertor(cfg) {}
  // _prob["output"] must be a 3-D float tensor; dimensions 1 and 2 are used as
  // (decode steps, classes). Only the first sample's scores are read below,
  // so this presumably assumes batch size 1 -- TODO confirm.
  Result<Value> operator()(const Value& _data, const Value& _prob) {
    auto d_conf = _prob["output"].get<Tensor>();
    if (!(d_conf.shape().size() == 3 && d_conf.data_type() == DataType::kFLOAT)) {
      MMDEPLOY_ERROR("unsupported `output` tensor, shape: {}, dtype: {}", d_conf.shape(),
                     (int)d_conf.data_type());
      return Status(eNotSupported);
    }
    // Bring the scores to host memory and wait until the copy completes.
    OUTCOME_TRY(auto h_conf, MakeAvailableOnDevice(d_conf, Device{0}, stream()));
    OUTCOME_TRY(stream().Wait());
    auto data = h_conf.data<float>();
    auto shape = d_conf.shape();
    auto w = static_cast<int>(shape[1]);
    auto c = static_cast<int>(shape[2]);
    auto [indexes, scores] = Tensor2Idx(data, w, c);
    auto text = Idx2Str(indexes);
    MMDEPLOY_DEBUG("text: {}", text);
    TextRecognition output{text, scores};
    return make_pointer(to_value(output));
  }
  // Greedy decoding over w steps of c class scores: keeps the arg-max class
  // per step, stops at the end-of-sequence index and drops indexes listed in
  // ignore_indexes_. Returns (class indexes, their scores).
  std::pair<vector<int>, vector<float> > Tensor2Idx(const float* data, int w, int c) {
    auto decode_len = w;
    vector<int> indexes;
    indexes.reserve(decode_len);
    vector<float> scores;
    scores.reserve(decode_len);
    for (int t = 0; t < decode_len; ++t, data += c) {
      vector<float> prob(data, data + c);
      auto iter = max_element(begin(prob), end(prob));
      auto index = static_cast<int>(iter - begin(prob));
      if (index == end_idx_) break;
      if (std::find(ignore_indexes_.begin(), ignore_indexes_.end(), index) ==
          ignore_indexes_.end()) {
        indexes.push_back(index);
        scores.push_back(*iter);
      }
    }
    return {indexes, scores};
  }
};
MMDEPLOY_REGISTER_CODEBASE_COMPONENT(MMOCR, AttnConvertor);
} // namespace mmdeploy::mmocr
// Copyright (c) OpenMMLab. All rights reserved.
#include "base_convertor.h"
namespace mmdeploy::mmocr {
using std::string;
using std::unordered_map;
using std::unordered_set;
using std::vector;
// Builds the index <-> character dictionary from the model config.
//
// The dictionary comes from, in priority order: "dict_file" (one entry per
// line, read from the model archive), "dict_list" (explicit list), or
// "dict_type" (built-in DICT36/DICT90). Special tokens (<BOS>, <EOS>, <PAD>,
// <UKN>) are appended afterwards; finally the reverse map and the set of
// indexes to ignore during decoding are populated.
// Throws eInvalidArgument when no dictionary source is configured.
BaseConvertor::BaseConvertor(const Value& cfg) : MMOCR(cfg) {
  auto model = cfg["context"]["model"].get<Model>();
  if (!cfg.contains("params")) {
    MMDEPLOY_ERROR("'params' is required, but it's not in the config");
    throw_exception(eInvalidArgument);
  }
  // BaseConverter
  auto& _cfg = cfg["params"];
  if (_cfg.contains("dict_file")) {
    auto filename = _cfg["dict_file"].get<std::string>();
    auto content = model.ReadFile(filename).value();
    idx2char_ = SplitLines(content);
  } else if (_cfg.contains("dict_list")) {
    from_value(_cfg["dict_list"], idx2char_);
  } else if (_cfg.contains("dict_type")) {
    auto dict_type = _cfg["dict_type"].get<std::string>();
    if (dict_type == "DICT36") {
      idx2char_ = SplitChars(DICT36);
    } else if (dict_type == "DICT90") {
      idx2char_ = SplitChars(DICT90);
    } else {
      MMDEPLOY_ERROR("unknown dict_type: {}", dict_type);
      throw_exception(eInvalidArgument);
    }
  } else {
    MMDEPLOY_ERROR("either dict_file, dict_list or dict_type must be specified");
    throw_exception(eInvalidArgument);
  }
  model_ = model;
  // Update Dictionary
  bool with_start = _cfg.value("with_start", false);
  bool with_end = _cfg.value("with_end", false);
  bool same_start_end = _cfg.value("same_start_end", false);
  bool with_padding = _cfg.value("with_padding", false);
  bool with_unknown = _cfg.value("with_unknown", false);
  // Append start/end tokens; they may share one index when same_start_end.
  if (with_start && with_end && same_start_end) {
    start_idx_ = static_cast<int>(idx2char_.size());
    end_idx_ = start_idx_;
    string start_end_token = _cfg.value("start_end_token", string("<BOS/EOS>"));
    idx2char_.emplace_back(std::move(start_end_token));
  } else {
    if (with_start) {
      start_idx_ = static_cast<int>(idx2char_.size());
      string start_token = _cfg.value("start_token", string("<BOS>"));
      idx2char_.emplace_back(std::move(start_token));
    }
    if (with_end) {
      end_idx_ = static_cast<int>(idx2char_.size());
      string end_token = _cfg.value("end_token", string("<EOS>"));
      idx2char_.emplace_back(std::move(end_token));
    }
  }
  if (with_padding) {
    padding_idx_ = static_cast<int>(idx2char_.size());
    string padding_token = _cfg.value("padding_token", string("<PAD>"));
    idx2char_.emplace_back(std::move(padding_token));
  }
  // The unknown token is only added when explicitly configured and non-null.
  if (with_unknown && _cfg.contains("unknown_token") && !_cfg["unknown_token"].is_null()) {
    unknown_idx_ = static_cast<int>(idx2char_.size());
    string unknown_token = _cfg.value("unknown_token", string("<UKN>"));
    idx2char_.emplace_back(unknown_token);
  }
  // char2idx
  for (int i = 0; i < (int)idx2char_.size(); i++) {
    char2idx_[idx2char_[i]] = i;
  }
  // Characters to drop from decoded output; defaults to the padding token.
  vector<string> ignore_chars;
  if (cfg.contains("ignore_chars")) {
    for (int i = 0; i < cfg["ignore_chars"].size(); i++)
      ignore_chars.emplace_back(cfg["ignore_chars"][i].get<string>());
  } else {
    ignore_chars.emplace_back("padding");
  }
  // Resolve each ignore entry: special-token aliases first, then a literal
  // dictionary character, finally the unknown token as a fallback.
  std::map<string, int> mapping_table = {
      {"padding", padding_idx_}, {"end", end_idx_}, {"unknown", unknown_idx_}};
  for (int i = 0; i < ignore_chars.size(); i++) {
    const auto& ignore_char = ignore_chars[i];
    int ignore_idx = -1;
    if (auto it_default = mapping_table.find(ignore_char); it_default != mapping_table.end()) {
      ignore_idx = it_default->second;
    } else if (auto it_candidate = char2idx_.find(ignore_char); it_candidate != char2idx_.end()) {
      ignore_idx = it_candidate->second;
    } else if (with_unknown) {
      ignore_idx = unknown_idx_;
    }
    // -1 also covers special tokens that were never added (index stayed -1).
    if (ignore_idx == -1 || (ignore_idx == unknown_idx_ && ignore_char != "unknown")) {
      MMDEPLOY_WARN("{} does not exist in the dictionary", ignore_char);
      continue;
    }
    ignore_indexes_.insert(ignore_idx);
  }
}
// Maps each class index to its dictionary entry and concatenates the result.
//
// Fix: the original logged out-of-range indexes but then still executed
// idx2char_[idx], an out-of-bounds read (undefined behavior). Invalid
// indexes (negative or >= dictionary size) are now reported and skipped.
string BaseConvertor::Idx2Str(const vector<int>& indexes) {
  // First pass: total output length, so the string is allocated only once.
  size_t count = 0;
  for (const auto& idx : indexes) {
    if (idx < 0 || static_cast<size_t>(idx) >= idx2char_.size()) {
      MMDEPLOY_ERROR("idx exceeds array bounds {} {}", idx, idx2char_.size());
      continue;  // skip instead of reading out of bounds
    }
    count += idx2char_[idx].size();
  }
  std::string text;
  text.reserve(count);
  // Second pass: append the entry of every valid index.
  for (const auto& idx : indexes) {
    if (idx < 0 || static_cast<size_t>(idx) >= idx2char_.size()) {
      continue;
    }
    text += idx2char_[idx];
  }
  return text;
}
// Splits s at '\n' into one entry per line. Like std::getline, a trailing
// newline does not produce a final empty entry.
vector<string> BaseConvertor::SplitLines(const string& s) {
  vector<string> lines;
  size_t pos = 0;
  while (pos < s.size()) {
    auto nl = s.find('\n', pos);
    if (nl == string::npos) {
      lines.push_back(s.substr(pos));
      break;
    }
    lines.push_back(s.substr(pos, nl - pos));
    pos = nl + 1;
  }
  return lines;
}
// Splits s into one single-character string per byte.
vector<string> BaseConvertor::SplitChars(const string& s) {
  vector<string> tokens(s.size());
  for (size_t k = 0; k < s.size(); ++k) {
    tokens[k] = string(1, s[k]);
  }
  return tokens;
}
} // namespace mmdeploy::mmocr
// Copyright (c) OpenMMLab. All rights reserved.
#include <algorithm>
#include <sstream>
#include <unordered_map>
#include <unordered_set>
#include "mmdeploy/core/device.h"
#include "mmdeploy/core/model.h"
#include "mmdeploy/core/registry.h"
#include "mmdeploy/core/tensor.h"
#include "mmdeploy/core/utils/device_utils.h"
#include "mmdeploy/core/utils/formatter.h"
#include "mmdeploy/core/value.h"
#include "mmdeploy/experimental/module_adapter.h"
#include "mmocr.h"
namespace mmdeploy::mmocr {
using std::string;
using std::unordered_map;
using std::unordered_set;
using std::vector;
// Base class for OCR label convertors: owns the index <-> character
// dictionary and the decoding bookkeeping shared by concrete convertors.
class BaseConvertor : public MMOCR {
 public:
  explicit BaseConvertor(const Value& cfg);
  // Maps class indexes to their dictionary entries and concatenates them.
  string Idx2Str(const vector<int>& indexes);
  // Decodes the raw model output (_prob) for the given input (_data).
  virtual Result<Value> operator()(const Value& _data, const Value& _prob) = 0;
 protected:
  // Splits s at '\n' into one entry per line.
  static vector<string> SplitLines(const string& s);
  // Splits s into one single-character string per byte.
  static vector<string> SplitChars(const string& s);
  static constexpr const auto DICT36 = R"(0123456789abcdefghijklmnopqrstuvwxyz)";
  static constexpr const auto DICT90 = R"(0123456789abcdefghijklmnopqrstuvwxyz)"
                                       R"(ABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'())"
                                       R"(*+,-./:;<=>?@[\]_`~)";
  static constexpr const auto kHost = Device(0);
  Model model_;
  // Indexes of the special tokens; -1 means the token is not in use.
  int padding_idx_{-1};
  int end_idx_{-1};
  int start_idx_{-1};
  int unknown_idx_{-1};
  unordered_set<int> ignore_indexes_;    // class indexes dropped when decoding
  unordered_map<string, int> char2idx_;  // reverse lookup of idx2char_
  vector<string> idx2char_;              // class index -> character/token
};  // class BaseConvertor
} // namespace mmdeploy::mmocr
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment