Commit 546b4279 authored by limm's avatar limm
Browse files

add csrc and mmdeploy module

parent 502f4fb9
Pipeline #2810 canceled with stages
// Copyright (c) OpenMMLab. All rights reserved
// Modified from https://github.com/whai362/PSENet
// and
// https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/csrc/pytorch/contour_expand.cpp
#include <cmath>
#include <iostream>
#include <queue>
#include <vector>
#include "mmdeploy/core/tensor.h"
#include "opencv2/imgproc/imgproc.hpp"
namespace mmdeploy::mmocr {
using namespace std;
using cv::Mat_;
// Minimal integer 2-D coordinate used by the BFS expansion below.
class Point2d {
 public:
  int x{0};
  int y{0};
  Point2d() = default;
  Point2d(int px, int py) : x(px), y(py) {}
};
void kernel_dilate(const uint8_t* data, int kernel_num, int height, int width, const int* label_map,
int label_num, const float* score_map, int min_area, Mat_<int32_t>& text_labels,
vector<int>& text_areas, vector<float>& text_scores,
vector<vector<int>>& text_points) {
text_labels = Mat_<int32_t>::zeros(height, width);
text_areas = vector<int>(label_num);
text_scores = vector<float>(label_num);
text_points = vector<vector<int>>(label_num);
for (int x = 0; x < height; ++x) {
for (int y = 0; y < width; ++y) {
int label = label_map[x * width + y];
if (label == 0) continue;
text_areas[label] += 1;
text_scores[label] += score_map[x * width + y];
text_points[label].push_back(y);
text_points[label].push_back(x);
}
}
queue<Point2d> queue, next_queue;
for (int x = 0; x < height; ++x) {
auto row = text_labels[x];
for (int y = 0; y < width; ++y) {
int label = label_map[x * width + y];
if (label == 0) continue;
if (text_areas[label] < min_area) continue;
Point2d point(x, y);
queue.push(point);
row[y] = label;
}
}
const int dx[] = {-1, 1, 0, 0};
const int dy[] = {0, 0, -1, 1};
vector<int> kernel_step(kernel_num);
std::for_each(kernel_step.begin(), kernel_step.end(), [=](int& k) { return k * height * width; });
for (int kernel_id = kernel_num - 2; kernel_id >= 0; --kernel_id) {
while (!queue.empty()) {
Point2d point = queue.front();
queue.pop();
int x = point.x;
int y = point.y;
int label = text_labels[x][y];
bool is_edge = true;
for (int d = 0; d < 4; ++d) {
int tmp_x = x + dx[d];
int tmp_y = y + dy[d];
if (tmp_x < 0 || tmp_x >= height) continue;
if (tmp_y < 0 || tmp_y >= width) continue;
int kernel_value = data[kernel_step[kernel_id] + tmp_x * width + tmp_y];
if (kernel_value == 0) continue;
if (text_labels[tmp_x][tmp_y] > 0) continue;
Point2d point(tmp_x, tmp_y);
queue.push(point);
text_labels[tmp_x][tmp_y] = label;
text_areas[label] += 1;
text_scores[label] += score_map[tmp_x * width + tmp_y];
text_points[label].push_back(tmp_y);
text_points[label].push_back(tmp_x);
is_edge = false;
}
if (is_edge) {
next_queue.push(point);
}
}
swap(queue, next_queue);
}
for (int i = 1; i < label_num; ++i) {
if (text_areas[i]) {
text_scores[i] /= static_cast<float>(text_areas[i]);
}
}
}
void contour_expand(const Mat_<uint8_t>& kernel_masks, const Mat_<int32_t>& kernel_label,
const Mat_<float>& score, int min_kernel_area, int kernel_num,
vector<int>& text_areas, vector<float>& text_scores,
vector<vector<int>>& text_points) {
assert(kernel_masks.cols == kernel_label.total());
assert(score.size() == kernel_label.size());
auto ptr_data = kernel_masks.ptr<uint8_t>();
auto data_score_map = score.ptr<float>();
auto data_label_map = kernel_label.ptr<int32_t>();
vector<vector<int>> text_line;
Mat_<int32_t> text_labels;
kernel_dilate(ptr_data, kernel_masks.rows, kernel_label.rows, kernel_label.cols, data_label_map,
kernel_num, data_score_map, min_kernel_area, text_labels, text_areas, text_scores,
text_points);
}
} // namespace mmdeploy::mmocr
# Copyright (c) OpenMMLab. All rights reserved.
project(mmdeploy_mmocr_cpu_impl CXX)
# CPU post-processing implementations; only built when the "cpu" device
# target is enabled.
if ("cpu" IN_LIST MMDEPLOY_TARGET_DEVICES)
add_library(${PROJECT_NAME} OBJECT dbnet.cpp panet.cpp psenet.cpp)
# The object library is folded into shared targets, so PIC is required.
set_target_properties(${PROJECT_NAME} PROPERTIES POSITION_INDEPENDENT_CODE 1)
# Hide symbols in static non-MSVC builds to keep the exported ABI minimal.
if (NOT (MMDEPLOY_SHARED_LIBS OR MSVC))
target_compile_options(${PROJECT_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-fvisibility=hidden>)
endif ()
target_link_libraries(${PROJECT_NAME} PRIVATE
mmdeploy_opencv_utils
mmdeploy::core)
# Link the implementation objects into the main mmocr codebase target.
target_link_libraries(mmdeploy_mmocr PRIVATE ${PROJECT_NAME})
mmdeploy_export(${PROJECT_NAME})
endif ()
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/codebase/mmocr/dbnet.h"
#include "mmdeploy/core/utils/device_utils.h"
#include "opencv2/imgproc.hpp"
namespace mmdeploy::mmocr {
// DBNet post-processing on CPU: binarize the probability map, extract and
// simplify contours, and score each candidate polygon.
class DbHeadCpuImpl : public DbHeadImpl {
 public:
  void Init(const Stream& stream) override {
    DbHeadImpl::Init(stream);
    device_ = Device("cpu");
  }

  // prob:           probability map tensor, shape (1, H, W), float32
  // mask_thr:       binarization threshold for the text mask
  // max_candidates: keep at most this many contours (negative means no limit)
  // points/scores:  (out) simplified polygons and their mean-probability scores
  Result<void> Process(Tensor prob, float mask_thr, int max_candidates,
                       std::vector<std::vector<cv::Point>>& points,
                       std::vector<float>& scores) override {
    OUTCOME_TRY(auto conf, MakeAvailableOnDevice(prob, device_, stream_));
    OUTCOME_TRY(stream_.Wait());
    auto h = conf.shape(1);
    auto w = conf.shape(2);
    auto data = conf.data<float>();
    // Zero-copy view over the tensor buffer; `conf` must outlive `score_map`.
    cv::Mat score_map((int)h, (int)w, CV_32F, data);
    cv::Mat text_mask = score_map >= mask_thr;
    std::vector<std::vector<cv::Point>> contours;
    cv::findContours(text_mask, contours, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE);
    // FIX: explicit size_t comparison instead of a signed/unsigned mismatch.
    // A negative max_candidates keeps all contours, matching the previous
    // implicit behavior (negative converted to a huge unsigned value).
    if (max_candidates >= 0 && contours.size() > static_cast<size_t>(max_candidates)) {
      contours.resize(max_candidates);
    }
    for (auto& poly : contours) {
      // Simplify with a tolerance proportional to the contour perimeter.
      auto epsilon = 0.01 * cv::arcLength(poly, true);
      std::vector<cv::Point> approx;
      cv::approxPolyDP(poly, approx, epsilon, true);
      if (approx.size() < 4) {
        continue;  // too degenerate to be a text box
      }
      auto score = box_score_fast(score_map, approx);
      points.push_back(approx);
      scores.push_back(score);
    }
    return success();
  }

  // Mean probability inside `box`, computed over its bounding rect (clipped
  // to the image) with a filled-polygon mask.
  static float box_score_fast(const cv::Mat& bitmap, const std::vector<cv::Point>& box) noexcept {
    auto rect = cv::boundingRect(box) & cv::Rect({}, bitmap.size());
    cv::Mat mask(rect.size(), CV_8U, cv::Scalar(0));
    cv::fillPoly(mask, std::vector<std::vector<cv::Point>>{box}, 1, cv::LINE_8, 0, -rect.tl());
    auto mean = cv::mean(bitmap(rect), mask)[0];
    return static_cast<float>(mean);
  }

  Device device_;
};
MMDEPLOY_REGISTER_FACTORY_FUNC(DbHeadImpl, (cpu, 0),
                               [] { return std::make_unique<DbHeadCpuImpl>(); });
} // namespace mmdeploy::mmocr
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/codebase/mmocr/panet.h"
#include "opencv2/imgproc.hpp"
namespace mmdeploy::mmocr {
// PANet post-processing on CPU: sigmoid the text/kernel predictions,
// threshold them into masks, transpose the embedding map and label the
// kernel's connected components.
class PaHeadCpuImpl : public PaHeadImpl {
public:
// text_pred/kernel_pred: (1, H, W) logits; embed_pred: (C, H*W) embeddings.
// Outputs: thresholded masks, per-pixel component labels (label), pixel
// embeddings transposed to (H*W, C), and region_num (components incl. bg).
Result<void> Process(Tensor text_pred, //
Tensor kernel_pred, //
Tensor embed_pred, //
float min_text_confidence, //
float min_kernel_confidence, //
cv::Mat_<float>& text_score, //
cv::Mat_<uint8_t>& text, //
cv::Mat_<uint8_t>& kernel, //
cv::Mat_<int>& label, //
cv::Mat_<float>& embed, //
int& region_num) override {
OUTCOME_TRY(stream_.Wait());
auto height = static_cast<int>(text_pred.shape(1));
auto width = static_cast<int>(text_pred.shape(2));
// Zero-copy view over the tensor buffer -- sigmoid() below mutates the
// tensor data in place, and text_score stays valid only while text_pred
// is alive.  NOTE(review): confirm callers do not reuse text_pred after.
text_score = cv::Mat_<float>(height, width, text_pred.data<float>());
sigmoid(text_score);
text = text_score > min_text_confidence;
cv::Mat_<float> kernel_score(height, width, kernel_pred.data<float>());
sigmoid(kernel_score);
// Precedence: `>` binds tighter than `&`, so this is
// (kernel_score > min_kernel_confidence) & text.
kernel = kernel_score > min_kernel_confidence & text;
auto n_embed_channels = static_cast<int>(embed_pred.shape(0));
embed = cv::Mat_<float>(n_embed_channels, height * width, embed_pred.data<float>());
// In-place transpose: embed becomes (H*W, C), one embedding row per pixel.
cv::transpose(embed, embed);
// 4-connectivity labeling; region_num includes the background label 0.
region_num = cv::connectedComponents(kernel, label, 4, CV_32S);
return success();
}
// In-place logistic sigmoid over a float matrix.
static void sigmoid(cv::Mat_<float>& score) {
cv::exp(-score, score);
score = 1 / (1 + score);
}
};
MMDEPLOY_REGISTER_FACTORY_FUNC(PaHeadImpl, (cpu, 0),
[] { return std::make_unique<PaHeadCpuImpl>(); });
} // namespace mmdeploy::mmocr
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/codebase/mmocr/psenet.h"
#include "mmdeploy/core/utils/device_utils.h"
#include "opencv2/imgproc.hpp"
namespace mmdeploy::mmocr {
// PSENet post-processing on CPU: sigmoid + threshold every kernel channel,
// intersect with the largest (text) kernel and label the smallest one.
class PseHeadCpuImpl : public PseHeadImpl {
public:
PseHeadCpuImpl() : device_(0) {}
// preds: (C, H, W) logits; channel 0 is the largest kernel / text region,
// channel C-1 the smallest kernel used for seeding.
// Outputs: score (sigmoid of channel 0), masks (C x H*W binary), label
// (components of the smallest kernel) and region_num (incl. background).
Result<void> Process(Tensor preds, //
float min_kernel_confidence, //
cv::Mat_<float>& score, //
cv::Mat_<uint8_t>& masks, //
cv::Mat_<int>& label, //
int& region_num) override {
OUTCOME_TRY(preds, MakeAvailableOnDevice(preds, device_, stream_));
OUTCOME_TRY(stream_.Wait());
auto channels = static_cast<int>(preds.shape(0));
auto height = static_cast<int>(preds.shape(1));
auto width = static_cast<int>(preds.shape(2));
// Zero-copy view; sigmoid() mutates the tensor data in place.
cv::Mat_<float> probs(channels, height * width, preds.data<float>());
sigmoid(probs);
// Keep a deep copy of channel 0 as the score map.
probs.row(0).reshape(1, height).copyTo(score);
masks = probs > min_kernel_confidence;
// Every kernel mask must be contained in the text-region mask (row 0).
for (int i = 1; i < channels; ++i) {
masks.row(i) &= masks.row(0);
}
cv::Mat_<uint8_t> kernel = masks.row(channels - 1).reshape(1, height);
// 4-connectivity labeling; region_num includes the background label 0.
region_num = cv::connectedComponents(kernel, label, 4, CV_32S);
return success();
}
// In-place logistic sigmoid over a float matrix.
static void sigmoid(cv::Mat_<float>& score) {
cv::exp(-score, score);
score = 1 / (1 + score);
}
private:
Device device_;
};
MMDEPLOY_REGISTER_FACTORY_FUNC(PseHeadImpl, (cpu, 0),
[] { return std::make_unique<PseHeadCpuImpl>(); });
} // namespace mmdeploy::mmocr
// Copyright (c) OpenMMLab. All rights reserved.
#include <algorithm>
#include <sstream>
#include "base_convertor.h"
#include "mmdeploy/core/device.h"
#include "mmdeploy/core/model.h"
#include "mmdeploy/core/registry.h"
#include "mmdeploy/core/tensor.h"
#include "mmdeploy/core/utils/device_utils.h"
#include "mmdeploy/core/utils/formatter.h"
#include "mmdeploy/core/value.h"
#include "mmdeploy/experimental/module_adapter.h"
#include "mmocr.h"
namespace mmdeploy::mmocr {
using std::string;
using std::vector;
// Greedy CTC decoder: converts a recognition head's per-timestep class
// probabilities into a text string plus per-character scores.
class CTCConvertor : public BaseConvertor {
 public:
  // cfg: codebase config forwarded to BaseConvertor (dictionary, ignored ids).
  explicit CTCConvertor(const Value& cfg) : BaseConvertor(cfg) {}

  // _data:  input metadata (reads img_metas.valid_ratio)
  // _prob:  inference output; "output" must be a float32 tensor of rank 3
  //         interpreted as (N, W, C)
  Result<Value> operator()(const Value& _data, const Value& _prob) {
    auto d_conf = _prob["output"].get<Tensor>();
    if (!(d_conf.shape().size() == 3 && d_conf.data_type() == DataType::kFLOAT)) {
      MMDEPLOY_ERROR("unsupported `output` tensor, shape: {}, dtype: {}", d_conf.shape(),
                     (int)d_conf.data_type());
      return Status(eNotSupported);
    }
    OUTCOME_TRY(auto h_conf, MakeAvailableOnDevice(d_conf, Device{0}, stream()));
    OUTCOME_TRY(stream().Wait());
    auto data = h_conf.data<float>();
    auto shape = d_conf.shape();
    auto w = static_cast<int>(shape[1]);
    auto c = static_cast<int>(shape[2]);
    auto valid_ratio = _data["img_metas"]["valid_ratio"].get<float>();
    auto [indexes, scores] = Tensor2Idx(data, w, c, valid_ratio);
    auto text = Idx2Str(indexes);
    MMDEPLOY_DEBUG("text: {}", text);
    TextRecognition output{text, scores};
    return make_pointer(to_value(output));
  }

  // Greedy decoding: per time step take the arg-max class, collapse repeats
  // and drop ignored indices.  Only the first ceil(w * valid_ratio) steps
  // (clamped to w) are decoded.
  std::pair<vector<int>, vector<float> > Tensor2Idx(const float* data, int w, int c,
                                                    float valid_ratio) {
    auto decode_len = std::min(w, static_cast<int>(std::ceil(w * valid_ratio)));
    vector<int> indexes;
    indexes.reserve(decode_len);
    vector<float> scores;
    scores.reserve(decode_len);
    int prev = padding_idx_;
    for (int t = 0; t < decode_len; ++t, data += c) {
      // Arg-max directly over the tensor row; the original copied each time
      // step into a temporary vector<float> first (an O(c) allocation per
      // step) just to call max_element on it.
      auto iter = std::max_element(data, data + c);
      auto index = static_cast<int>(iter - data);
      if (index != prev && ignore_indexes_.find(index) == ignore_indexes_.end()) {
        indexes.push_back(index);
        scores.push_back(*iter);
      }
      prev = index;
    }
    return {indexes, scores};
  }
};
MMDEPLOY_REGISTER_CODEBASE_COMPONENT(MMOCR, CTCConvertor);
} // namespace mmdeploy::mmocr
# Copyright (c) OpenMMLab. All rights reserved.
project(mmdeploy_mmocr_cuda_impl)
# CUDA post-processing implementations; only built when the "cuda" device
# target is enabled.
if ("cuda" IN_LIST MMDEPLOY_TARGET_DEVICES)
add_library(${PROJECT_NAME} OBJECT
connected_component.cu
utils.cu
dbnet.cpp
panet.cpp
psenet.cpp)
# The object library is folded into shared targets, so PIC is required.
set_target_properties(${PROJECT_NAME} PROPERTIES POSITION_INDEPENDENT_CODE 1)
target_include_directories(${PROJECT_NAME} PRIVATE
${CUDA_INCLUDE_DIRS})
target_link_libraries(${PROJECT_NAME} PRIVATE
mmdeploy_opencv_utils
mmdeploy::core
${CUDA_LIBRARIES}
cuda)
# Link the implementation objects into the main mmocr codebase target.
target_link_libraries(mmdeploy_mmocr PRIVATE
${PROJECT_NAME})
mmdeploy_export(${PROJECT_NAME})
endif ()
// Copyright (c) OpenMMLab. All rights reserved
// implementation based on "A new Direct Connected Component Labeling and Analysis Algorithms for
// GPUs"
// https://ieeexplore.ieee.org/document/8596835
#include <vector>
#include "connected_component.h"
#include "thrust/for_each.h"
#include "thrust/iterator/counting_iterator.h"
namespace mmdeploy {
// Length of the run of set bits in `pixels` that ends just below lane `tx`,
// i.e. the distance from lane tx back to the start of its horizontal pixel
// run (0 when bit tx-1 is clear).
// NOTE(review): for tx == 0 this shifts a 32-bit word by 32, which is
// undefined in ISO C++; the callers overwrite the tx == 0 lane with a
// carried distance, so the value is unused there -- confirm on the target
// toolchain.
__device__ int start_distance(unsigned pixels, int tx) {
unsigned v = ~(pixels << (32 - tx));
return __clz(reinterpret_cast<int&>(v));
}
// Distance from lane `tx` to the end of its horizontal pixel run: counts
// the consecutive set bits strictly above tx, plus one (__ffs is 1-based),
// so a set pixel at tx followed by k set pixels yields k + 1.
// NOTE(review): for tx == 31 this shifts a 32-bit word by 32 (undefined in
// ISO C++) -- confirm intended semantics on the target toolchain.
__device__ int end_distance(unsigned pixels, int tx) {
unsigned v = ~(pixels >> (tx + 1));
return __ffs(reinterpret_cast<int&>(v));
}
// Device-side swap (std::swap is not callable from __device__ code).
template <typename T>
__device__ void swap(T& x, T& y) {
  T old_x(x);
  x = y;
  y = old_x;
}
// Lock-free union of the disjoint-set trees containing pixels u and v.
// Roots are located by chasing parent links, then the larger root is linked
// under the smaller one via atomicMin; if another thread raced us, the loop
// retries from the value atomicMin observed.
__device__ void merge(int* label, int u, int v) {
// find root of u
while (u != v && u != label[u]) {
u = label[u];
}
// find root of v
while (u != v && v != label[v]) {
v = label[v];
}
while (u != v) {
// post-condition: u > v
if (u < v) swap(u, v);
// try to set label[u] = v
auto w = atomicMin(label + u, v);
// if u is modified by other threads, try again
u = u == w ? v : w;
}
}
// Integer ceiling division and rounding up to the next multiple of y
// (both assume non-negative x and positive y).
__host__ __device__ int div_up(int x, int y) { return (x + y - 1) / y; }
__host__ __device__ int round_up(int x, int y) { return div_up(x, y) * y; }
// Phase 1 of connected-component labeling: provisionally label horizontal
// pixel runs.  Each lane loads one pixel; __ballot_sync packs 32 pixels of
// a row into a bitmask so runs can be analyzed with bit tricks.  The first
// pixel of a run takes a linear pixel index as its label, and runs touching
// the row above are merged immediately.  distance0/distance1 carry run
// lengths across consecutive 32-pixel chunks of the same row.
// NOTE(review): the __syncthreads() below sits inside `if (x < w)`; when w
// is not a multiple of 32 the last chunk diverges within the block, which
// is undefined behavior for __syncthreads -- verify on the target arch.
template <int block_w, int block_h>
__global__ void LabelStripsKernel(const uint8_t* mask, int h, int w, int* label) {
__shared__ unsigned shared_pixels[block_h];
auto tx = static_cast<int>(threadIdx.x);
auto ty = static_cast<int>(threadIdx.y);
auto x0 = tx + static_cast<int>(blockIdx.x * blockDim.x);
auto y0 = ty + static_cast<int>(blockIdx.y * blockDim.y);
auto w_32 = round_up(w, 32);
for (auto y = y0; y < h; y += blockDim.y * gridDim.y) {
//* 0 -> current line
//* 1 -> line above
int distance0 = 0;
int distance1 = 0;
for (auto x = x0; x < w_32; x += blockDim.x * gridDim.x) {
unsigned active = __ballot_sync(0xffffffff, x < w);
if (x < w) {
auto key = y * w + x;
auto p0 = mask[y * w + x];
auto pixels0 = __ballot_sync(active, p0);
auto s_dist0 = start_distance(pixels0, tx);
// First pixel of a run: label it with the run start's linear index
// (lane 0 may belong to a run carried over from the previous chunk).
if (p0 && s_dist0 == 0) {
auto l = tx ? key : key - distance0;
label[y * w + x] = static_cast<int>(l);
}
// Publish this row's bitmask so the row below (ty+1) can see it.
if (tx == 0) {
shared_pixels[ty] = pixels0;
}
__syncthreads();
auto pixels1 = ty ? shared_pixels[ty - 1] : 0;
int p1 = (pixels1 & (1 << tx));
int s_dist1 = start_distance(pixels1, tx);
// Lane 0 uses the distances carried from the previous chunk.
if (tx == 0) {
s_dist0 = distance0;
s_dist1 = distance1;
}
// Vertically adjacent runs: merge at the first overlap.
if (p0 && p1 && (s_dist0 == 0 || s_dist1 == 0)) {
int label0 = key - s_dist0;
int label1 = key - w - s_dist1;
merge(label, label0, label1);
}
// Carry run lengths into the next 32-pixel chunk: if the entire
// chunk is foreground (distance 32) the run continues, so extend it.
auto d1 = start_distance(pixels1, 32);
distance1 = d1 == 32 ? d1 + distance1 : d1;
auto d0 = start_distance(pixels0, 32);
distance0 = d0 == 32 ? d0 + distance0 : d0;
}
}
}
}
// Phase 2: merge labels across strip boundaries by comparing each row with
// the row above it.  Only the lane at the start of an overlapping run pair
// issues the union, so each adjacency is merged once.
__global__ void MergeStripsKernel(const uint8_t* mask, int h, int w, int* label) {
auto tx = threadIdx.x;
auto ty = threadIdx.y;
auto x0 = tx + blockIdx.x * blockDim.x;
auto y0 = ty + blockIdx.y * blockDim.y;
auto w_32 = round_up(w, 32);
for (auto y = y0; y < h; y += blockDim.y * gridDim.y) {
if (y > 0) {
for (auto x = x0; x < w_32; x += blockDim.x * gridDim.x) {
unsigned active = __ballot_sync(0xffffffff, x < w);
if (x < w) {
auto key0 = y * w + x;
auto key1 = key0 - w;
auto p0 = mask[key0];
auto p1 = mask[key1];
auto pixels0 = __ballot_sync(active, p0);
auto pixels1 = __ballot_sync(active, p1);
if (p0 && p1) {
auto s_dist0 = start_distance(pixels0, tx);
auto s_dist1 = start_distance(pixels1, tx);
// Merge only at a run start to avoid redundant unions.
if (s_dist0 == 0 || s_dist1 == 0) {
merge(label, key0 - s_dist0, key1 - s_dist1);
}
}
}
}
}
}
}
// Map a dense component id (>= 0) into the range <= -2 so encoded ids are
// distinguishable from raw pixel indices (>= 0) and the -1 background fill.
// The mapping is an involution: decode(encode(x)) == x.
__device__ int encode(int label) { return -2 - label; }
__device__ int decode(int label) { return -2 - label; }
// Assign a dense component id to every root pixel (label[index] == index)
// and store it encoded (<= -2) so later passes can tell component ids from
// parent pixel indices.  n_comp ends up holding the component count.
struct _discretize_label_op {
int* label;
int* n_comp;
__device__ void operator()(int index) const {
if (label[index] == index) {
auto comp = atomicAdd(n_comp, 1);
label[index] = encode(comp);
}
}
};
// Convert encoded labels to the public format: background pixels become 0,
// foreground pixels their 1-based component id.
struct _decode_label_op {
const int* label;
int* output;
__device__ void operator()(int index) const {
auto comp = label[index];
output[index] = comp < -1 ? decode(comp) + 1 : 0;
}
};
// Phase 3: replace every foreground pixel's provisional label (a pixel
// index) with the encoded component id of its root.  Only the first lane of
// each horizontal run chases the parent chain; the result is broadcast to
// the rest of the run with __shfl_sync.
__global__ void RelabelStripsKernel(const uint8_t* mask, int h, int w, int* label) {
auto tx = threadIdx.x;
auto ty = threadIdx.y;
auto x0 = tx + blockIdx.x * blockDim.x;
auto y0 = ty + blockIdx.y * blockDim.y;
const auto stride_x = static_cast<int>(blockDim.x * gridDim.x);
const auto stride_y = static_cast<int>(blockDim.y * gridDim.y);
const auto w_32 = round_up(w, 32);
for (auto y = y0; y < h; y += stride_y) {
for (auto x = x0; x < w_32; x += stride_x) {
unsigned active = __ballot_sync(0xffffffff, x < w);
if (x < w) {
auto k = y * w + x;
auto p = mask[k];
auto pixels = __ballot_sync(active, p);
auto s_dist = start_distance(pixels, tx);
auto idx = 0;
// Run-start lane resolves the root; encoded ids are negative, so the
// walk stops once a component id is reached.
if (p && s_dist == 0) {
idx = label[k];
while (idx > 0) {
idx = label[idx];
}
}
// Broadcast the resolved id from the lane that started this run.
idx = __shfl_sync(active, idx, tx - s_dist);
if (p) {
label[k] = idx;
}
}
}
}
}
// Accumulate per-component pixel counts and score sums.  Each lane reduces
// its run segment with __shfl_down_sync; the lane at the start of a run
// commits the partial sums with atomics, indexed by the decoded component
// id stored in `label`.
__global__ void ComputeStatsKernel_v2(const uint8_t* mask, const int* label, const float* score,
int h, int w, float* comp_score, int* comp_area) {
auto tx = threadIdx.x;
auto ty = threadIdx.y;
auto x0 = tx + blockIdx.x * blockDim.x;
auto y0 = ty + blockIdx.y * blockDim.y;
const auto stride_x = static_cast<int>(blockDim.x * gridDim.x);
const auto stride_y = static_cast<int>(blockDim.y * gridDim.y);
const auto w_32 = round_up(w, 32);
for (auto y = y0; y < h; y += stride_y) {
for (auto x = x0; x < w_32; x += stride_x) {
unsigned active = __ballot_sync(0xffffffff, x < w);
if (x < w) {
auto k = y * w + x;
auto p = mask[k];
auto pixels = __ballot_sync(active, p);
auto s_dist = start_distance(pixels, tx);
auto count = end_distance(pixels, tx);
float s = p ? score[k] : 0;
// Warp-level tree reduction limited to this run's pixels.
for (int offset = 16; offset > 0; offset /= 2) {
auto v = __shfl_down_sync(active, s, offset);
// mask out past-the-end items
s += offset < count ? v : 0.f;
}
// The run-start lane owns the run's totals.
if (p && s_dist == 0) {
auto idx = decode(label[k]);
atomicAdd(comp_area + idx, count);
atomicAdd(comp_score + idx, s);
}
}
}
}
}
// Emit the boundary pixels of every component as (pixel index, component
// id) pairs.  A pixel is interior -- and therefore skipped -- when it has
// same-component neighbours on both sides of a horizontal, vertical or
// diagonal axis; `size` is the running entry count.
__global__ void GetContoursKernel(const int* label, int h, int w, int2* contour, int* size) {
const auto x0 = static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
const auto y0 = static_cast<int>(threadIdx.y + blockIdx.y * blockDim.y);
const auto stride_x = static_cast<int>(blockDim.x * gridDim.x);
const auto stride_y = static_cast<int>(blockDim.y * gridDim.y);
for (auto y = y0; y < h; y += stride_y) {
for (auto x = x0; x < w; x += stride_x) {
const auto index = y * w + x;
// encoded label
const auto comp = label[index];
if (comp < -1) {
// non-linear filters
const auto l = x > 0 && label[index - 1] == comp;
const auto t = y > 0 && label[index - w] == comp;
const auto r = x < w - 1 && label[index + 1] == comp;
const auto b = y < h - 1 && label[index + w] == comp;
const auto tl = y > 0 && x > 0 && label[index - w - 1] == comp;
const auto tr = y > 0 && x < w - 1 && label[index - w + 1] == comp;
const auto bl = y < h - 1 && x > 0 && label[index + w - 1] == comp;
const auto br = y < h - 1 && x < w - 1 && label[index + w + 1] == comp;
if (!((l && r) || (t && b) || (tl && br) || (tr && bl))) {
const auto p = atomicAdd(size, 1);
contour[p] = {index, decode(comp)};
}
}
}
}
}
// Device-side state for connected-component labeling.  Buffers grow
// geometrically in Resize and are reused across calls.
struct ConnectedComponents::Impl {
public:
explicit Impl(cudaStream_t stream);
void Resize(int height, int width);
int GetComponents(const uint8_t* d_mask, int* h_label);
void GetContours(std::vector<std::vector<cv::Point>>& corners);
void GetStats(const uint8_t* d_mask, const float* d_score, std::vector<float>& scores,
std::vector<int>& areas);
~Impl();
int* d_label_{nullptr};  // per-pixel union-find parent / encoded label array
float* d_comp_score_{nullptr};  // per-component score sums
int* d_comp_area_{nullptr};  // per-component areas (also reused as scratch)
int* d_contour_{nullptr}; // int2
int* d_contour_size_{nullptr};  // single int: number of contour entries
int* d_n_comp_{nullptr};  // single int: number of components (device)
int n_comp_{0};  // host copy of the last component count
int height_{0};
int width_{0};
size_t size_{0};  // height_ * width_ of the current mask
size_t capacity_{0};  // allocated element capacity of the big buffers
double growth_factor_{1.1};
cudaStream_t stream_{nullptr};
bool owned_stream_{false};  // true when the stream was created here
};
// Run the full labeling pipeline on d_mask and return the component count.
// Pipeline: strip labeling -> strip merging -> root discretization ->
// relabeling.  If h_label is non-null, decoded per-pixel labels (0 =
// background, 1-based ids) are copied into it before the final sync.
// NOTE(review): CUDA API return codes are not checked anywhere in this
// file -- failures would only surface as wrong results.
int ConnectedComponents::Impl::GetComponents(const uint8_t* d_mask, int* h_label) {
{
dim3 threads(32, 4);
dim3 blocks(1, div_up(height_, (int)threads.y));
// -1 fill marks "no label yet" (background stays -1).
cudaMemsetAsync(d_label_, -1, sizeof(int) * size_, stream_);
LabelStripsKernel<32, 4><<<blocks, threads, 0, stream_>>>(d_mask, height_, width_, d_label_);
}
{
dim3 threads(32, 4);
dim3 blocks(div_up(width_, (int)threads.x), div_up(height_, (int)threads.y));
MergeStripsKernel<<<blocks, threads, 0, stream_>>>(d_mask, height_, width_, d_label_);
cudaMemsetAsync(d_n_comp_, 0, sizeof(int), stream_);
thrust::for_each_n(thrust::cuda::par.on(stream_), thrust::counting_iterator<int>(0),
height_ * width_, _discretize_label_op{d_label_, d_n_comp_});
RelabelStripsKernel<<<blocks, threads, 0, stream_>>>(d_mask, height_, width_, d_label_);
}
cudaMemcpyAsync(&n_comp_, d_n_comp_, sizeof(int), cudaMemcpyDefault, stream_);
if (h_label) {
dim3 threads(32, 4);
dim3 blocks(div_up(width_, (int)threads.x), div_up(height_, (int)threads.y));
// reuse d_comp_area_, which is also an int buffer
thrust::for_each_n(thrust::cuda::par.on(stream_), thrust::counting_iterator<int>(0),
height_ * width_, _decode_label_op{d_label_, d_comp_area_});
cudaMemcpyAsync(h_label, d_comp_area_, sizeof(int) * size_, cudaMemcpyDefault, stream_);
}
cudaStreamSynchronize(stream_);
return n_comp_;
}
// Compute per-component score sums and pixel areas for the labeling
// produced by the last GetComponents call; results are copied to the host
// vectors synchronously.
void ConnectedComponents::Impl::GetStats(const uint8_t* d_mask, const float* d_score,
std::vector<float>& scores, std::vector<int>& areas) {
cudaMemsetAsync(d_comp_score_, 0, sizeof(float) * size_, stream_);
cudaMemsetAsync(d_comp_area_, 0, sizeof(int) * size_, stream_);
dim3 threads(32, 4);
dim3 blocks(div_up(width_, (int)threads.x), div_up(height_, (int)threads.y));
ComputeStatsKernel_v2<<<blocks, threads, 0, stream_>>>(d_mask, d_label_, d_score, height_, width_,
d_comp_score_, d_comp_area_);
scores.resize(n_comp_);
areas.resize(n_comp_);
cudaMemcpyAsync(scores.data(), d_comp_score_, sizeof(float) * n_comp_, cudaMemcpyDefault,
stream_);
cudaMemcpyAsync(areas.data(), d_comp_area_, sizeof(int) * n_comp_, cudaMemcpyDefault, stream_);
cudaStreamSynchronize(stream_);
}
// Gather per-component boundary points.  The entry count is copied back
// first (with a sync) so the host knows how many (pixel index, component)
// pairs to fetch; pairs are then bucketed into `corners` by component id.
void ConnectedComponents::Impl::GetContours(std::vector<std::vector<cv::Point>>& corners) {
cudaMemsetAsync(d_contour_size_, 0, sizeof(int), stream_);
auto d_contour = reinterpret_cast<int2*>(d_contour_);
{
dim3 threads(32, 4);
dim3 blocks(div_up(width_, (int)threads.x), div_up(height_, (int)threads.y));
GetContoursKernel<<<blocks, threads, 0, stream_>>>(d_label_, height_, width_, d_contour,
d_contour_size_);
}
int contour_size{};
cudaMemcpyAsync(&contour_size, d_contour_size_, sizeof(int), cudaMemcpyDefault, stream_);
cudaStreamSynchronize(stream_);
std::vector<int2> index_comp(contour_size);
cudaMemcpyAsync(index_comp.data(), d_contour_, sizeof(int2) * contour_size, cudaMemcpyDefault,
stream_);
cudaStreamSynchronize(stream_);
corners.resize(n_comp_);
for (const auto& p : index_comp) {
auto comp = p.y;
assert(0 <= comp && comp < n_comp_);
// p.x is a linear pixel index; convert back to (x, y) coordinates.
corners[comp].emplace_back(p.x % width_, p.x / width_);
}
}
// Ensure device buffers can hold a height x width mask.  Capacity grows
// geometrically (factor 1.1) and buffers are reallocated only when the
// request exceeds the current capacity; existing contents are discarded.
// NOTE(review): `height * width` multiplies two ints before widening to
// size_t -- could overflow for very large images; cudaMalloc results are
// not checked.
void ConnectedComponents::Impl::Resize(int height, int width) {
size_t size = height * width;
if (size > capacity_) {
if (!capacity_) {
capacity_ = size;
} else {
while (capacity_ < size) {
capacity_ *= growth_factor_;
}
}
cudaFree(d_label_);
cudaFree(d_comp_score_);
cudaFree(d_comp_area_);
cudaFree(d_contour_);
cudaMalloc(&d_label_, sizeof(int) * capacity_);
cudaMalloc(&d_comp_score_, sizeof(float) * capacity_);
cudaMalloc(&d_comp_area_, sizeof(int) * capacity_);
cudaMalloc(&d_contour_, sizeof(int2) * capacity_);
}
// The two single-int buffers are allocated once, lazily.
if (!d_contour_size_) {
cudaMalloc(&d_contour_size_, sizeof(int));
}
if (!d_n_comp_) {
cudaMalloc(&d_n_comp_, sizeof(int));
}
height_ = height;
width_ = width;
size_ = size;
}
// Borrow the caller's stream, or create (and own) one when none is given.
ConnectedComponents::Impl::Impl(cudaStream_t stream) : stream_(stream) {
if (!stream_) {
cudaStreamCreate(&stream_);
owned_stream_ = true;
}
}
// Release every device buffer and, if it was created here, the stream.
ConnectedComponents::Impl::~Impl() {
cudaFree(d_label_);
cudaFree(d_comp_score_);
cudaFree(d_comp_area_);
cudaFree(d_contour_);
cudaFree(d_contour_size_);
cudaFree(d_n_comp_);
if (owned_stream_) {
cudaStreamDestroy(stream_);
}
}
// Public API: thin pimpl forwarders onto ConnectedComponents::Impl.
ConnectedComponents::ConnectedComponents(void* stream)
: impl_(std::make_unique<Impl>((cudaStream_t)stream)) {}
ConnectedComponents::~ConnectedComponents() = default;
void ConnectedComponents::Resize(int height, int width) { impl_->Resize(height, width); }
int ConnectedComponents::GetComponents(const uint8_t* d_mask, int* h_label) {
return impl_->GetComponents(d_mask, h_label);
}
void ConnectedComponents::GetContours(std::vector<std::vector<cv::Point>>& corners) {
return impl_->GetContours(corners);
}
void ConnectedComponents::GetStats(const uint8_t* d_mask, const float* d_score,
std::vector<float>& scores, std::vector<int>& areas) {
return impl_->GetStats(d_mask, d_score, scores, areas);
}
} // namespace mmdeploy
// Copyright (c) OpenMMLab. All rights reserved
#ifndef FAST_CC__CONNECTED_COMPONENT_H_
#define FAST_CC__CONNECTED_COMPONENT_H_
#include <cstdint>
#include <memory>
#include <utility>
#include <vector>
#include "opencv2/core.hpp"
namespace mmdeploy {
// GPU connected-component labeling of an 8-bit mask in device memory (see
// connected_component.cu; based on a direct CCL algorithm for GPUs).
// All work runs on the stream passed at construction.
class ConnectedComponents {
public:
// stream: a cudaStream_t passed as void* to keep CUDA types out of this
// header; pass nullptr to let the implementation create its own stream.
explicit ConnectedComponents(void* stream);
~ConnectedComponents();
// Ensure internal buffers fit a height x width mask.  Call before
// GetComponents whenever the mask size changes.
void Resize(int height, int width);
// Label components of d_mask (device memory); if h_label is non-null the
// decoded per-pixel labels are copied to it.  Returns the component count.
int GetComponents(const uint8_t* d_mask, int* h_label);
// Boundary pixels per component, for the last GetComponents call.
void GetContours(std::vector<std::vector<cv::Point>>& corners);
// Per-component score sums and areas, for the last GetComponents call.
void GetStats(const uint8_t* d_mask, const float* d_score, std::vector<float>& scores,
std::vector<int>& areas);
private:
struct Impl;
std::unique_ptr<Impl> impl_;
};
} // namespace mmdeploy
#endif // FAST_CC__CONNECTED_COMPONENT_H_
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/codebase/mmocr/dbnet.h"
#include "mmdeploy/codebase/mmocr/cuda/connected_component.h"
#include "mmdeploy/codebase/mmocr/cuda/utils.h"
#include "mmdeploy/core/utils/device_utils.h"
#include "mmdeploy/device/cuda/cuda_device.h"
#include "opencv2/imgproc.hpp"
namespace mmdeploy::mmocr {
// DBNet post-processing on GPU: threshold on device, run connected
// components, then build convex hulls and mean scores per component.
class DbHeadCudaImpl : public DbHeadImpl {
 public:
  void Init(const Stream& stream) override {
    DbHeadImpl::Init(stream);
    device_ = stream_.GetDevice();
    {
      CudaDeviceGuard device_guard(device_);
      cc_.emplace(GetNative<cudaStream_t>(stream_));
    }
  }

  ~DbHeadCudaImpl() override {
    // The labeler owns device buffers; destroy it with the device current.
    CudaDeviceGuard device_guard(device_);
    cc_.reset();
  }

  // score:          probability map tensor (1, H, W), float32, device memory
  // mask_thr:       binarization threshold
  // max_candidates: keep at most this many components (negative = no limit)
  // contours/scores:(out) convex hulls and their mean scores
  Result<void> Process(Tensor score, float mask_thr, int max_candidates,
                       std::vector<std::vector<cv::Point>>& contours,
                       std::vector<float>& scores) override {
    CudaDeviceGuard device_guard(device_);
    auto height = static_cast<int>(score.shape(1));
    auto width = static_cast<int>(score.shape(2));
    Buffer mask(device_, score.size() * sizeof(uint8_t));
    auto score_data = score.data<float>();
    auto mask_data = GetNative<uint8_t*>(mask);
    dbnet::Threshold(score_data, height * width, mask_thr, mask_data,
                     GetNative<cudaStream_t>(stream_));
    cc_->Resize(height, width);
    cc_->GetComponents(mask_data, nullptr);
    std::vector<std::vector<cv::Point>> points;
    cc_->GetContours(points);
    std::vector<float> _scores;
    std::vector<int> _areas;
    cc_->GetStats(mask_data, score_data, _scores, _areas);
    // FIX: size_t comparison and loop index replace the implicit
    // signed/unsigned mismatches; a negative max_candidates keeps all
    // components (same effective behavior as the old unsigned conversion).
    if (max_candidates >= 0 && points.size() > static_cast<size_t>(max_candidates)) {
      points.resize(max_candidates);
    }
    for (size_t i = 0; i < points.size(); ++i) {
      std::vector<cv::Point> hull;
      cv::convexHull(points[i], hull);
      if (hull.size() < 4) {
        continue;  // not enough vertices for a text box
      }
      contours.push_back(hull);
      // Components that produced contour points have a non-zero area.
      scores.push_back(_scores[i] / (float)_areas[i]);
    }
    return success();
  }

 private:
  Device device_;
  std::optional<ConnectedComponents> cc_;
};
MMDEPLOY_REGISTER_FACTORY_FUNC(DbHeadImpl, (cuda, 0),
                               [] { return std::make_unique<DbHeadCudaImpl>(); });
} // namespace mmdeploy::mmocr
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/codebase/mmocr/panet.h"
#include "mmdeploy/codebase/mmocr/cuda/connected_component.h"
#include "mmdeploy/codebase/mmocr/cuda/utils.h"
#include "mmdeploy/device/cuda/cuda_device.h"
namespace mmdeploy::mmocr {
// PANet post-processing on GPU: fused sigmoid+threshold kernels produce the
// text/kernel masks and text scores on device, the embedding map is
// transposed on device, and connected components are labeled with the
// custom GPU CCL; results are copied back into host cv::Mat buffers.
class PaHeadCudaImpl : public PaHeadImpl {
public:
void Init(const Stream& stream) override {
PaHeadImpl::Init(stream);
device_ = stream.GetDevice();
{
CudaDeviceGuard device_guard(device_);
cc_.emplace(GetNative<cudaStream_t>(stream_));
}
}
~PaHeadCudaImpl() override {
// Destroy the labeler's device buffers with the right device current.
CudaDeviceGuard device_guard(device_);
cc_.reset();
}
// text_pred/kernel_pred: (1, H, W) logits on device; embed_pred: (C, H*W).
// Outputs mirror the CPU implementation (see PaHeadCpuImpl).
Result<void> Process(Tensor text_pred, //
Tensor kernel_pred, //
Tensor embed_pred, //
float min_text_confidence, //
float min_kernel_confidence, //
cv::Mat_<float>& text_score, //
cv::Mat_<uint8_t>& text, //
cv::Mat_<uint8_t>& kernel, //
cv::Mat_<int>& label, //
cv::Mat_<float>& embed, //
int& region_num) override {
CudaDeviceGuard device_guard(device_);
auto height = static_cast<int>(text_pred.shape(1));
auto width = static_cast<int>(text_pred.shape(2));
// Scratch device buffers for the fused mask/score computation.
Buffer text_buf(device_, height * width);
Buffer text_score_buf(device_, height * width * sizeof(float));
Buffer kernel_buf(device_, height * width);
auto text_buf_data = GetNative<uint8_t*>(text_buf);
auto text_score_buf_data = GetNative<float*>(text_score_buf);
auto kernel_buf_data = GetNative<uint8_t*>(kernel_buf);
auto stream = GetNative<cudaStream_t>(stream_);
// One pass: sigmoid both heads, threshold, and AND kernel with text.
panet::ProcessMasks(text_pred.data<float>(), //
kernel_pred.data<float>(), //
min_text_confidence, //
min_kernel_confidence, //
height * width, //
text_buf_data, //
kernel_buf_data, //
text_score_buf_data, //
stream);
auto n_embed_channels = embed_pred.shape(0);
Buffer embed_buf(device_, embed_pred.byte_size());
// (C, H*W) -> (H*W, C): one embedding row per pixel.
panet::Transpose(embed_pred.data<float>(), //
n_embed_channels, //
height * width, //
GetNative<float*>(embed_buf), //
stream);
label = cv::Mat_<int>(height, width);
cc_->Resize(height, width);
// +1 so region ids match OpenCV's convention of counting background.
region_num = cc_->GetComponents(kernel_buf_data, label.ptr<int>()) + 1;
// Copy device results into freshly allocated host matrices.
text_score = cv::Mat_<float>(label.size());
OUTCOME_TRY(stream_.Copy(text_score_buf, text_score.data));
text = cv::Mat_<uint8_t>(label.size());
OUTCOME_TRY(stream_.Copy(text_buf, text.data));
kernel = cv::Mat_<uint8_t>(label.size());
OUTCOME_TRY(stream_.Copy(kernel_buf, kernel.data));
embed = cv::Mat_<float>(height * width, n_embed_channels);
OUTCOME_TRY(stream_.Copy(embed_buf, embed.data));
// Ensure all copies have landed before the host buffers are read.
OUTCOME_TRY(stream_.Wait());
return success();
}
private:
Device device_;
std::optional<ConnectedComponents> cc_;
};
MMDEPLOY_REGISTER_FACTORY_FUNC(PaHeadImpl, (cuda, 0),
[] { return std::make_unique<PaHeadCudaImpl>(); });
} // namespace mmdeploy::mmocr
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/codebase/mmocr/psenet.h"
#include "mmdeploy/codebase/mmocr/cuda/connected_component.h"
#include "mmdeploy/codebase/mmocr/cuda/utils.h"
#include "mmdeploy/core/utils/device_utils.h"
#include "mmdeploy/device/cuda/cuda_device.h"
#include "opencv2/imgproc.hpp"
namespace mmdeploy::mmocr {
// PSENet post-processing on GPU: fused sigmoid+threshold over all kernel
// channels on device, then GPU connected-component labeling of the
// smallest kernel; results are copied back into host cv::Mat buffers.
class PseHeadCudaImpl : public PseHeadImpl {
public:
void Init(const Stream& stream) override {
PseHeadImpl::Init(stream);
device_ = stream.GetDevice();
{
CudaDeviceGuard device_guard(device_);
cc_.emplace(GetNative<cudaStream_t>(stream_));
}
}
~PseHeadCudaImpl() override {
// Destroy the labeler's device buffers with the right device current.
CudaDeviceGuard device_guard(device_);
cc_.reset();
}
// preds: (C, H, W) logits; channel C-1 is the smallest kernel.
// Outputs mirror the CPU implementation (see PseHeadCpuImpl).
Result<void> Process(Tensor preds, //
float min_kernel_confidence, //
cv::Mat_<float>& score, //
cv::Mat_<uint8_t>& masks, //
cv::Mat_<int>& label, //
int& region_num) override {
CudaDeviceGuard device_guard(device_);
OUTCOME_TRY(preds, MakeAvailableOnDevice(preds, device_, stream_));
OUTCOME_TRY(stream_.Wait());
auto channels = static_cast<int>(preds.shape(0));
auto height = static_cast<int>(preds.shape(1));
auto width = static_cast<int>(preds.shape(2));
Buffer masks_buf(device_, preds.size());
Buffer score_buf(device_, height * width * sizeof(float));
auto masks_data = GetNative<uint8_t*>(masks_buf);
auto score_data = GetNative<float*>(score_buf);
// One pass: sigmoid every channel, threshold, AND with channel 0.
psenet::ProcessMasks(preds.data<float>(), channels, height * width, min_kernel_confidence,
masks_data, score_data, GetNative<cudaStream_t>(stream_));
cc_->Resize(height, width);
label = cv::Mat_<int>(height, width);
// Label the smallest kernel (last channel of the masks buffer).
auto kernel_mask_data = masks_data + height * width * (channels - 1);
// +1 so region ids match OpenCV's convention of counting background.
region_num = cc_->GetComponents(kernel_mask_data, label.ptr<int>()) + 1;
score = cv::Mat_<float>(label.size());
OUTCOME_TRY(stream_.Copy(score_buf, score.ptr<float>()));
masks = cv::Mat_<uint8_t>(channels, height * width);
OUTCOME_TRY(stream_.Copy(masks_buf, masks.ptr<uint8_t>()));
// NOTE(review): unlike PaHeadCudaImpl there is no stream_.Wait() before
// returning -- if Copy is asynchronous the host matrices may still be in
// flight when the caller reads them; confirm Copy's semantics.
return success();
}
private:
Device device_;
std::optional<ConnectedComponents> cc_;
};
MMDEPLOY_REGISTER_FACTORY_FUNC(PseHeadImpl, (cuda, 0),
[] { return std::make_unique<PseHeadCudaImpl>(); });
} // namespace mmdeploy::mmocr
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/codebase/mmocr/cuda/utils.h"
#include "thrust/iterator/counting_iterator.h"
#include "thrust/transform.h"
namespace mmdeploy {
namespace mmocr {
// Logistic sigmoid: maps a raw logit into (0, 1).
__device__ float sigmoid(float x) {
  const float neg_exp = expf(-x);
  return 1.f / (1.f + neg_exp);
}
namespace panet {

// Element-wise kernel: converts text/kernel logits to sigmoid probabilities,
// stores the text probability as the score, and writes 0/255 masks. The
// kernel mask is gated by the text mask.
struct _process_masks_op {
  const float* text_pred;
  const float* kernel_pred;
  float text_thr;
  float kernel_thr;
  uint8_t* text_mask;
  uint8_t* kernel_mask;
  float* text_score;
  __device__ void operator()(int index) const {
    const float text_prob = sigmoid(text_pred[index]);
    const float kernel_prob = sigmoid(kernel_pred[index]);
    text_score[index] = text_prob;
    const bool is_text = text_prob > text_thr;
    const bool is_kernel = is_text && kernel_prob > kernel_thr;
    text_mask[index] = is_text ? 255 : 0;
    kernel_mask[index] = is_kernel ? 255 : 0;
  }
};

// Enqueues _process_masks_op over n pixels on the given CUDA stream.
// All pointers refer to device memory.
void ProcessMasks(const float* d_text_pred, const float* d_kernel_pred, float text_thr,
                  float kernel_thr, int n, uint8_t* d_text_mask, uint8_t* d_kernel_mask,
                  float* d_text_score, cudaStream_t stream) {
  _process_masks_op op{d_text_pred, d_kernel_pred, text_thr,
                       kernel_thr,  d_text_mask,   d_kernel_mask,
                       d_text_score};
  thrust::for_each_n(thrust::cuda::par.on(stream), thrust::counting_iterator<int>(0), n, op);
}

// Element-wise kernel: scatters input[r][c] of a row-major h x w matrix
// into output[c][r].
struct _transpose_op {
  const float* input;
  float* output;
  int h;
  int w;
  __device__ void operator()(int index) const {
    const int r = index / w;
    const int c = index % w;
    output[c * h + r] = input[index];
  }
};

// Enqueues a dense h x w float-matrix transpose on the given CUDA stream.
void Transpose(const float* d_input, int h, int w, float* d_output, cudaStream_t stream) {
  _transpose_op op{d_input, d_output, h, w};
  thrust::for_each_n(thrust::cuda::par.on(stream), thrust::counting_iterator<int>(0), h * w, op);
}

}  // namespace panet
namespace dbnet {

// Element-wise predicate: true when the score reaches the binarization
// threshold.
struct _threshold_op {
  float thr;
  __device__ bool operator()(float score) const {
    return score >= thr;
  }
};

// Writes d_mask[i] = (d_score[i] >= thr) for n elements on the given CUDA
// stream. Note the resulting mask values are 0/1 (bool widened to uint8_t),
// unlike panet's 0/255 masks.
void Threshold(const float* d_score, int n, float thr, uint8_t* d_mask, cudaStream_t stream) {
  const float* first = d_score;
  const float* last = d_score + n;
  thrust::transform(thrust::cuda::par.on(stream), first, last, d_mask, _threshold_op{thr});
}

}  // namespace dbnet
namespace psenet {

// Per-pixel kernel over c stacked logit maps of n pixels each. Channel 0
// provides the score map and gates every channel: masks[i*n + index] is 255
// only when both channel 0 and channel i exceed thr.
struct _process_masks_op {
  const float* preds;
  int c;
  int n;
  float thr;
  uint8_t* masks;
  float* score;
  __device__ void operator()(int index) const {
    bool gate = false;
    for (int channel = 0; channel < c; ++channel) {
      const float prob = sigmoid(preds[channel * n + index]);
      if (channel == 0) {
        score[index] = prob;
        gate = prob > thr;
      }
      masks[channel * n + index] = (gate && prob > thr) ? 255 : 0;
    }
  }
};

// Enqueues _process_masks_op over n pixels on the given CUDA stream.
// All pointers refer to device memory.
void ProcessMasks(const float* d_preds, int c, int n, float thr, uint8_t* d_masks, float* d_score,
                  cudaStream_t stream) {
  _process_masks_op op{d_preds, c, n, thr, d_masks, d_score};
  thrust::for_each_n(thrust::cuda::par.on(stream), thrust::counting_iterator<int>(0), n, op);
}

}  // namespace psenet
} // namespace mmocr
} // namespace mmdeploy
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_CSRC_CODEBASE_MMOCR_CUDA_UTILS_H_
#define MMDEPLOY_CSRC_CODEBASE_MMOCR_CUDA_UTILS_H_
#include <cstdint>
#include "cuda_runtime.h"
namespace mmdeploy {
namespace mmocr {
namespace panet {
// Computes sigmoid text/kernel probabilities for n pixels, stores the text
// probability map, and writes 0/255 masks (kernel mask gated by text mask).
// All pointers are device memory; work is enqueued on `stream`.
void ProcessMasks(const float* d_text_pred, const float* d_kernel_pred, float text_thr,
                  float kernel_thr, int n, uint8_t* d_text_mask, uint8_t* d_kernel_mask,
                  float* d_text_score, cudaStream_t stream);
// Transposes a dense h x w float matrix (device memory) on `stream`.
void Transpose(const float* d_input, int h, int w, float* d_output, cudaStream_t stream);
}  // namespace panet
namespace dbnet {
// Writes d_mask[i] = (d_score[i] >= thr) for n elements (device memory) on `stream`.
void Threshold(const float* d_score, int n, float thr, uint8_t* d_mask, cudaStream_t stream);
}  // namespace dbnet
namespace psenet {
// For c stacked logit maps of n pixels each: channel 0 yields the score map
// and gates all channels; masks are written as 0/255. Device memory; enqueued
// on `stream`.
void ProcessMasks(const float* d_preds, int c, int n, float thr, uint8_t* d_masks, float* d_score,
                  cudaStream_t stream);
}  // namespace psenet
}  // namespace mmocr
}  // namespace mmdeploy
#endif // MMDEPLOY_CSRC_CODEBASE_MMOCR_CUDA_UTILS_H_
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/codebase/mmocr/dbnet.h"
#include <opencv2/imgproc.hpp>
#include "clipper.hpp"
#include "mmdeploy/core/device.h"
#include "mmdeploy/core/tensor.h"
#include "mmdeploy/core/utils/formatter.h"
#include "mmdeploy/experimental/module_adapter.h"
#include "mmocr.h"
namespace mmdeploy {
namespace mmocr {
using std::string;
using std::vector;
class DBHead : public MMOCR {
public:
explicit DBHead(const Value& config) : MMOCR(config) {
if (config.contains("params")) {
auto& params = config["params"];
text_repr_type_ = params.value("text_repr_type", text_repr_type_);
mask_thr_ = params.value("mask_thr", mask_thr_);
min_text_score_ = params.value("min_text_score", min_text_score_);
min_text_width_ = params.value("min_text_width", min_text_width_);
unclip_ratio_ = params.value("unclip_ratio", unclip_ratio_);
max_candidates_ = params.value("max_candidate", max_candidates_);
rescale_ = params.value("rescale", rescale_);
downsample_ratio_ = params.value("downsample_ratio", downsample_ratio_);
}
auto platform = Platform(device_.platform_id()).GetPlatformName();
auto creator = gRegistry<DbHeadImpl>().Get(platform);
if (!creator) {
MMDEPLOY_ERROR(
"DBHead: implementation for platform \"{}\" not found. Available platforms: {}", platform,
gRegistry<DbHeadImpl>().List());
throw_exception(eEntryNotFound);
}
impl_ = creator->Create();
impl_->Init(stream_);
}
Result<Value> operator()(const Value& _data, const Value& _prob) const {
auto conf = _prob["output"].get<Tensor>();
if (!(conf.shape().size() == 3 && conf.data_type() == DataType::kFLOAT)) {
MMDEPLOY_ERROR("unsupported `output` tensor, shape: {}, dtype: {}", conf.shape(),
(int)conf.data_type());
return Status(eNotSupported);
}
std::vector<std::vector<cv::Point>> contours;
std::vector<float> scores;
OUTCOME_TRY(impl_->Process(conf, mask_thr_, max_candidates_, contours, scores));
auto scale_w = 1.f;
auto scale_h = 1.f;
if (rescale_) {
scale_w /= downsample_ratio_ * _data["img_metas"]["scale_factor"][0].get<float>();
scale_h /= downsample_ratio_ * _data["img_metas"]["scale_factor"][1].get<float>();
}
TextDetections output;
for (int idx = 0; idx < contours.size(); ++idx) {
if (scores[idx] < min_text_score_) {
continue;
}
auto expanded = unclip(contours[idx], unclip_ratio_);
if (expanded.empty()) {
continue;
}
auto rect = cv::minAreaRect(expanded);
if ((int)rect.size.width <= min_text_width_) {
continue;
}
std::array<cv::Point2f, 4> box_points;
rect.points(box_points.data());
auto& det = output.emplace_back();
for (int i = 0; i < 4; ++i) {
// ! performance metrics drops without rounding here
det.bbox[i * 2] = cvRound(box_points[i].x * scale_w);
det.bbox[i * 2 + 1] = cvRound(box_points[i].y * scale_h);
}
det.score = scores[idx];
}
return to_value(output);
}
static std::vector<cv::Point> unclip(std::vector<cv::Point>& box, float unclip_ratio) {
namespace cl = ClipperLib;
auto area = cv::contourArea(box);
auto length = cv::arcLength(box, true);
auto distance = area * unclip_ratio / length;
cl::Path src;
transform(begin(box), end(box), back_inserter(src), [](auto p) {
return cl::IntPoint{p.x, p.y};
});
cl::ClipperOffset offset;
offset.AddPath(src, cl::jtRound, cl::etClosedPolygon);
std::vector<cl::Path> dst;
offset.Execute(dst, distance);
if (dst.size() != 1) {
return {};
}
std::vector<cv::Point> ret;
transform(begin(dst[0]), end(dst[0]), back_inserter(ret), [](auto p) {
return cv::Point{static_cast<int>(p.X), static_cast<int>(p.Y)};
});
return ret;
}
std::string text_repr_type_{"quad"};
float mask_thr_{.3};
float min_text_score_{.3};
int min_text_width_{5};
float unclip_ratio_{1.5};
int max_candidates_{3000};
bool rescale_{true};
float downsample_ratio_{1.};
std::unique_ptr<DbHeadImpl> impl_;
};
// Expose DBHead as an MMOCR component and define the registry that
// platform-specific DbHeadImpl backends register into.
MMDEPLOY_REGISTER_CODEBASE_COMPONENT(MMOCR, DBHead);
MMDEPLOY_DEFINE_REGISTRY(DbHeadImpl);
} // namespace mmocr
} // namespace mmdeploy
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_CSRC_CODEBASE_MMOCR_DBNET_H_
#define MMDEPLOY_CSRC_CODEBASE_MMOCR_DBNET_H_
#include "mmdeploy/codebase/mmocr/mmocr.h"
#include "mmdeploy/core/device.h"
#include "mmdeploy/core/registry.h"
#include "mmdeploy/core/tensor.h"
#include "opencv2/core.hpp"
namespace mmdeploy::mmocr {
// Platform-specific backend interface for DBNet post-processing.
class DbHeadImpl {
 public:
  virtual ~DbHeadImpl() = default;

  // Stores the stream used to schedule/synchronize backend work.
  virtual void Init(const Stream& stream) { stream_ = stream; }

  // Contract (from the DBHead caller): binarize `prob` at `mask_thr` and
  // extract up to `max_candidates` candidate contours with one score each.
  virtual Result<void> Process(Tensor prob, float mask_thr, int max_candidates,
                               std::vector<std::vector<cv::Point>>& points,
                               std::vector<float>& scores) = 0;

 protected:
  Stream stream_;  // set by Init(); available to subclasses
};

// Registry of platform-specific implementations (e.g. "cpu", "cuda").
MMDEPLOY_DECLARE_REGISTRY(DbHeadImpl, std::unique_ptr<DbHeadImpl>());
} // namespace mmdeploy::mmocr
#endif // MMDEPLOY_CSRC_CODEBASE_MMOCR_DBNET_H_
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/codebase/mmocr/mmocr.h"
#include "mmdeploy/core/registry.h"
#include "mmdeploy/core/utils/formatter.h"
namespace mmdeploy::mmocr {
MMDEPLOY_REGISTER_CODEBASE(MMOCR);
} // namespace mmdeploy::mmocr
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_MMOCR_H
#define MMDEPLOY_MMOCR_H
#include <array>
#include "mmdeploy/codebase/common.h"
#include "mmdeploy/core/device.h"
#include "mmdeploy/core/module.h"
namespace mmdeploy::mmocr {
// One detected text instance: four quadrilateral corners stored as
// (x0, y0, x1, y1, x2, y2, x3, y3) plus a confidence score.
struct TextDetection {
  std::array<float, 8> bbox;
  float score;
  MMDEPLOY_ARCHIVE_MEMBERS(bbox, score);
};
using TextDetections = std::vector<TextDetection>;

// One recognized text string with its score values (granularity — per step or
// per character — is defined by the recognizer implementation).
struct TextRecognition {
  std::string text;
  std::vector<float> score;
  MMDEPLOY_ARCHIVE_MEMBERS(text, score);
};

// Declares the MMOCR codebase ("mmocr") for component registration.
MMDEPLOY_DECLARE_CODEBASE(MMOCR, mmocr);
} // namespace mmdeploy::mmocr
#endif // MMDEPLOY_MMOCR_H
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/codebase/mmocr/panet.h"
#include <algorithm>
#include "mmdeploy/codebase/mmocr/mmocr.h"
#include "mmdeploy/core/device.h"
#include "mmdeploy/core/registry.h"
#include "mmdeploy/core/serialization.h"
#include "mmdeploy/core/utils/device_utils.h"
#include "opencv2/imgproc/imgproc.hpp"
namespace mmdeploy::mmocr {
// Defined in the CPU pixel-grouping translation unit. Groups text pixels
// around the labeled kernel components. One row is returned per region; as
// consumed by PANHead::operator(), element 0 is the region's confidence and
// the member-pixel coordinate pairs start at index 2.
std::vector<std::vector<float>> pixel_group_cpu(const cv::Mat_<float>& score,
                                                const cv::Mat_<uint8_t>& mask,
                                                const cv::Mat_<float>& embedding,
                                                const cv::Mat_<int32_t>& kernel_label,
                                                const cv::Mat_<uint8_t>& kernel_contour,
                                                int kernel_region_num, float dis_threshold);
// PANet text-detection post-processing head.
class PANHead : public MMOCR {
 public:
  /// Parses post-processing parameters from `config["params"]` and creates the
  /// platform-specific PaHeadImpl backend.
  /// @throws eEntryNotFound when no backend is registered for the platform.
  explicit PANHead(const Value& config) : MMOCR(config) {
    if (config.contains("params")) {
      auto& params = config["params"];
      min_text_avg_confidence_ = params.value("min_text_avg_confidence", min_text_avg_confidence_);
      min_kernel_confidence_ = params.value("min_kernel_confidence", min_kernel_confidence_);
      // Fixed: this line was a copy-paste duplicate of the
      // "min_text_avg_confidence" read above, which left min_text_confidence_
      // (passed to impl_->Process below) stuck at its default.
      min_text_confidence_ = params.value("min_text_confidence", min_text_confidence_);
      min_text_area_ = params.value("min_text_area", min_text_area_);
      rescale_ = params.value("rescale", rescale_);
      downsample_ratio_ = params.value("downsample_ratio", downsample_ratio_);
    }
    auto platform = Platform(device_.platform_id()).GetPlatformName();
    auto creator = gRegistry<PaHeadImpl>().Get(platform);
    if (!creator) {
      MMDEPLOY_ERROR(
          "PANHead: implementation for platform \"{}\" not found. Available platforms: {}",
          platform, gRegistry<PaHeadImpl>().List());
      throw_exception(eEntryNotFound);
    }
    impl_ = creator->Create();
    impl_->Init(stream_);
  }

  /// Converts the network prediction (`_pred["output"]`, float tensor of shape
  /// (1, C, H, W): text map, kernel map, then embedding channels) into rotated
  /// text boxes via kernel labeling + pixel grouping.
  Result<Value> operator()(const Value& _data, const Value& _pred) noexcept {
    OUTCOME_TRY(auto pred, MakeAvailableOnDevice(_pred["output"].get<Tensor>(), device_, stream_));
    OUTCOME_TRY(stream_.Wait());
    if (pred.shape().size() != 4 || pred.shape(0) != 1 || pred.data_type() != DataType::kFLOAT) {
      MMDEPLOY_ERROR("unsupported `output` tensor, shape: {}, dtype: {}", pred.shape(),
                     (int)pred.data_type());
      return Status(eNotSupported);
    }
    // drop batch dimension
    pred.Squeeze(0);

    auto text_pred = pred.Slice(0);                  // channel 0: text map
    auto kernel_pred = pred.Slice(1);                // channel 1: kernel map
    auto embed_pred = pred.Slice(2, pred.shape(0));  // remaining channels: embeddings

    cv::Mat_<float> text_score;
    cv::Mat_<uint8_t> text;
    cv::Mat_<uint8_t> kernel;
    cv::Mat_<int> labels;
    cv::Mat_<float> embed;
    int region_num = 0;
    OUTCOME_TRY(impl_->Process(text_pred, kernel_pred, embed_pred, min_text_confidence_,
                               min_kernel_confidence_, text_score, text, kernel, labels, embed,
                               region_num));

    auto text_points = pixel_group_cpu(text_score, text, embed, labels, kernel, region_num,
                                       min_text_avg_confidence_);

    auto scale_w = _data["img_metas"]["scale_factor"][0].get<float>();
    auto scale_h = _data["img_metas"]["scale_factor"][1].get<float>();

    TextDetections output;
    for (auto& text_point : text_points) {
      auto text_confidence = text_point[0];        // region layout: [conf, ..., x/y pairs]
      auto area = text_point.size() - 2;
      if (filter_instance(static_cast<float>(area), text_confidence, min_text_area_,
                          min_text_avg_confidence_)) {
        continue;
      }
      // Wrap the (x, y) pairs (starting at index 2) as an Nx2 matrix, no copy.
      cv::Mat_<float> points(text_point.size() / 2 - 1, 2, text_point.data() + 2);
      cv::RotatedRect rect = cv::minAreaRect(points);
      std::vector<cv::Point2f> vertices(4);
      rect.points(vertices.data());
      if (rescale_) {
        // Undo both the model rescale and the output downsampling.
        for (auto& p : vertices) {
          p.x /= scale_w * downsample_ratio_;
          p.y /= scale_h * downsample_ratio_;
        }
      }
      auto& det = output.emplace_back();
      for (int i = 0; i < 4; ++i) {
        det.bbox[i * 2] = vertices[i].x;
        det.bbox[i * 2 + 1] = vertices[i].y;
      }
      det.score = text_confidence;
    }

    return to_value(output);
  }

  /// True when a region should be discarded (too small or too low-confidence).
  static bool filter_instance(float area, float confidence, float min_area, float min_confidence) {
    return area < min_area || confidence < min_confidence;
  }

  float min_text_confidence_{.5f};       // text-map binarization threshold
  float min_kernel_confidence_{.5f};     // kernel-map binarization threshold
  float min_text_avg_confidence_{0.85};  // per-region mean-confidence cutoff
  float min_text_area_{16};              // per-region pixel-count cutoff
  bool rescale_{true};                   // map boxes back to original image coords
  float downsample_ratio_{.25f};         // network output / input size ratio
  std::unique_ptr<PaHeadImpl> impl_;     // platform-specific backend
};
// Expose PANHead as an MMOCR component and define the registry that
// platform-specific PaHeadImpl backends register into.
MMDEPLOY_REGISTER_CODEBASE_COMPONENT(MMOCR, PANHead);
MMDEPLOY_DEFINE_REGISTRY(PaHeadImpl);
} // namespace mmdeploy::mmocr
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_CSRC_CODEBASE_MMOCR_PANET_H_
#define MMDEPLOY_CSRC_CODEBASE_MMOCR_PANET_H_
#include "mmdeploy/codebase/mmocr/mmocr.h"
#include "mmdeploy/core/device.h"
#include "mmdeploy/core/registry.h"
#include "mmdeploy/core/tensor.h"
#include "opencv2/core.hpp"
namespace mmdeploy::mmocr {
// Platform-specific backend interface for PANet post-processing.
class PaHeadImpl {
 public:
  virtual ~PaHeadImpl() = default;

  // Stores the stream used to schedule/synchronize backend work.
  virtual void Init(const Stream& stream) { stream_ = stream; }

  // Contract (from the PANHead caller): threshold the text and kernel
  // predictions into a score map and 0/255 masks, label the kernel's
  // connected components (count returned in region_num), and lay out the
  // embedding vectors for pixel grouping.
  virtual Result<void> Process(Tensor text_pred,            //
                               Tensor kernel_pred,          //
                               Tensor embed_pred,           //
                               float min_text_confidence,   //
                               float min_kernel_confidence, //
                               cv::Mat_<float>& text_score, //
                               cv::Mat_<uint8_t>& text,     //
                               cv::Mat_<uint8_t>& kernel,   //
                               cv::Mat_<int>& label,        //
                               cv::Mat_<float>& embed,      //
                               int& region_num) = 0;

 protected:
  Stream stream_;  // set by Init(); available to subclasses
};

// Registry of platform-specific implementations (e.g. "cpu", "cuda").
MMDEPLOY_DECLARE_REGISTRY(PaHeadImpl, std::unique_ptr<PaHeadImpl>());
} // namespace mmdeploy::mmocr
#endif // MMDEPLOY_CSRC_CODEBASE_MMOCR_PANET_H_
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment