Commit 5713e0ca authored by yangql

Initial commit
#! /bin/sh
############### Ubuntu ###############
# Reference: https://docs.opencv.org/3.4.11/d7/d9f/tutorial_linux_install.html
# apt-get install build-essential -y
# apt-get install cmake git libgtk2.0-dev pkg-config libavcodec-dev libavformat-dev libswscale-dev -y
# apt-get install python-dev python-numpy libtbb2 libtbb-dev libjpeg-dev libpng-dev libtiff-dev libjasper-dev libdc1394-22-dev -y # packages needed for image processing, optional
############### CentOS ###############
yum install gcc gcc-c++ gtk2-devel gimp-devel gimp-devel-tools gimp-help-browser zlib-devel libtiff-devel libjpeg-devel libpng-devel gstreamer-devel libavc1394-devel libraw1394-devel libdc1394-devel jasper-devel jasper-utils swig python libtool nasm -y
############################ Install dependencies online ###############################
#cd ./3rdParty
#pip install rbuild-master.tar.gz
############################ Install dependencies offline ###############################
# Install dependencies
cd ./3rdParty/rbuild_depend
pip install click-6.6-py2.py3-none-any.whl
pip install six-1.15.0-py2.py3-none-any.whl
pip install subprocess32-3.5.4.tar.gz
pip install cget-0.1.9.tar.gz
# Install rbuild
cd ../
pip install rbuild-master.tar.gz
# Minimum CMake version
cmake_minimum_required(VERSION 3.5)
# Project name
project(RapidOcrOnnx)
# Compiler settings
set(CMAKE_CXX_COMPILER g++)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") # versions 2.2 and above require C++17
set(CMAKE_BUILD_TYPE Release)
# Header search paths
set(INCLUDE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/include/
$ENV{DTKROOT}/include/
${CMAKE_CURRENT_SOURCE_DIR}/depend/include/)
include_directories(${INCLUDE_PATH})
# Library search paths
set(LIBRARY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/depend/lib64/
$ENV{DTKROOT}/lib/)
link_directories(${LIBRARY_PATH})
# Libraries to link
set(LIBRARY opencv_core
opencv_imgproc
opencv_imgcodecs
opencv_dnn
onnxruntime
)
link_libraries(${LIBRARY})
# Source files
set(SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/Src/main.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/AngleNet.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/clipper.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/CrnnNet.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/DbNet.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/getopt.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/OcrLite.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/OcrLiteCApi.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/OcrLiteJni.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/OcrResultUtils.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/OcrUtils.cpp
)
# Executable target
add_executable(RapidOcr ${SOURCE_FILES})
# RapidOcr
This example uses the RapidOcr model to show how to run inference for an image text-recognition model with the ONNXRuntime C++ API: preprocessing, running inference, and retrieving the results.
## Model overview
This example uses three models: ch_PP-OCRv3_det + ch_ppocr_mobile_v2.0_cls + ch_PP-OCRv3_rec. The onnx files live in the Resource/Models/ folder. The model structure, including each model's inputs and outputs, can be inspected with netron (https://netron.app/).
## Preprocessing
Before the data is fed to the models, the image is preprocessed as follows: it is read from disk, resized, padded, and scaled. The preprocessed image and the remaining parameters are then passed to the overloaded detect function, which performs the recognition and returns the result in an OcrResult object.
The example implements this preprocessing as shown below:
```c++
OcrResult OcrLite::detect(const char *path, const char *imgName,
                          const int padding, const int maxSideLen,
                          float boxScoreThresh, float boxThresh, float unClipRatio,
                          bool doAngle, bool mostAngle) {
    std::string imgFile = getSrcImgFilePath(path, imgName);
    cv::Mat originSrc = imread(imgFile, cv::IMREAD_COLOR); // default: BGR
    int originMaxSide = (std::max)(originSrc.cols, originSrc.rows);
    int resize;
    if (maxSideLen <= 0 || maxSideLen > originMaxSide) {
        resize = originMaxSide;
    } else {
        resize = maxSideLen;
    }
    resize += 2 * padding;
    cv::Rect paddingRect(padding, padding, originSrc.cols, originSrc.rows);
    cv::Mat paddingSrc = makePadding(originSrc, padding);
    ScaleParam scale = getScaleParam(paddingSrc, resize);
    OcrResult result;
    result = detect(path, imgName, paddingSrc, paddingRect, scale,
                    boxScoreThresh, boxThresh, unClipRatio, doAngle, mostAngle);
    return result;
}

OcrResult OcrLite::detect(const cv::Mat &mat, int padding, int maxSideLen,
                          float boxScoreThresh, float boxThresh,
                          float unClipRatio, bool doAngle, bool mostAngle) {
    cv::Mat originSrc = mat;
    int originMaxSide = (std::max)(originSrc.cols, originSrc.rows);
    int resize;
    if (maxSideLen <= 0 || maxSideLen > originMaxSide) {
        resize = originMaxSide;
    } else {
        resize = maxSideLen;
    }
    resize += 2 * padding;
    cv::Rect paddingRect(padding, padding, originSrc.cols, originSrc.rows);
    cv::Mat paddingSrc = makePadding(originSrc, padding);
    ScaleParam scale = getScaleParam(paddingSrc, resize);
    OcrResult result;
    result = detect(NULL, NULL, paddingSrc, paddingRect, scale,
                    boxScoreThresh, boxThresh, unClipRatio, doAngle, mostAngle);
    return result;
}
```
These two snippets are the two overloads of OcrLite's detect function.
The first overload takes a file path and an image name and reads the image from disk. Its flow is:
1. Read the image with OpenCV's imread; it is stored in BGR order in originSrc. Compute the longer side of originSrc with std::max over its width and height and store it in originMaxSide; the target size resize is derived from it (or from maxSideLen) plus twice the padding.
2. Build a cv::Rect named paddingRect describing the position and size of the original image inside the padded one: padding gives the offset, and originSrc.cols and originSrc.rows give the original width and height. Call makePadding with originSrc and padding and store the padded image in paddingSrc.
3. Call getScaleParam with paddingSrc and resize; it returns a ScaleParam object scale describing how the image will be scaled.
4. Call the overloaded detect with the file path and image name, the padded image paddingSrc, the padding region paddingRect, the scale parameters scale, and the remaining arguments; store the returned value in result and return it.
The second overload takes a cv::Mat directly. Its flow matches the first, except that the file-reading step is skipped and the given mat is used as the source image. Both overloads run text recognition on the input image and return an OcrResult; the processing covers resizing, padding, scaling, and recognition.
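For illustration, the same resize-and-pad logic as a minimal Python/OpenCV sketch; `make_padding` and `get_scale_param` are stand-ins for the C++ helpers `makePadding`/`getScaleParam`, and the constant white border is an assumption:
```python
import cv2
import numpy as np

def make_padding(src: np.ndarray, padding: int) -> np.ndarray:
    # Pad all four sides with a constant border so text near the edge
    # is not clipped (white padding is an assumption of this sketch).
    if padding <= 0:
        return src
    return cv2.copyMakeBorder(src, padding, padding, padding, padding,
                              cv2.BORDER_CONSTANT, value=(255, 255, 255))

def get_scale_param(src: np.ndarray, target_size: int) -> dict:
    # Scale the longer side to target_size, keeping the aspect ratio.
    src_h, src_w = src.shape[:2]
    ratio = target_size / float(max(src_h, src_w))
    dst_w, dst_h = int(src_w * ratio), int(src_h * ratio)
    return {"srcWidth": src_w, "srcHeight": src_h,
            "dstWidth": dst_w, "dstHeight": dst_h,
            "ratioWidth": dst_w / src_w, "ratioHeight": dst_h / src_h}
```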
## Inference
### Inference has three stages:
#### Stage 1:
DbNet::getTextBoxes() uses the ch_ppocr_v3_det_infer.onnx model, a pretrained text-detection model. It detects the text regions in an image and returns the positions and bounding boxes of the detected text.
```c++
std::vector<TextBox> DbNet::getTextBoxes(cv::Mat &src, ScaleParam &s,
                                         float boxScoreThresh, float boxThresh, float unClipRatio) {
    // build the input
    cv::Mat srcResize;
    resize(src, srcResize, cv::Size(s.dstWidth, s.dstHeight));
    std::vector<float> inputTensorValues = substractMeanNormalize(srcResize, meanValues, normValues);
    std::array<int64_t, 4> inputShape{1, srcResize.channels(), srcResize.rows, srcResize.cols};
    auto memoryInfo = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
    Ort::Value inputTensor = Ort::Value::CreateTensor<float>(memoryInfo, inputTensorValues.data(),
                                                             inputTensorValues.size(), inputShape.data(),
                                                             inputShape.size());
    assert(inputTensor.IsTensor());
    std::vector<const char *> inputNames = {inputNamesPtr.data()->get()};
    std::vector<const char *> outputNames = {outputNamesPtr.data()->get()};
    // run inference
    auto outputTensor = session->Run(Ort::RunOptions{nullptr}, inputNames.data(), &inputTensor,
                                     inputNames.size(), outputNames.data(), outputNames.size());
    assert(outputTensor.size() == 1 && outputTensor.front().IsTensor());
    std::vector<int64_t> outputShape = outputTensor[0].GetTensorTypeAndShapeInfo().GetShape();
    int64_t outputCount = std::accumulate(outputShape.begin(), outputShape.end(), 1,
                                          std::multiplies<int64_t>());
    float *floatArray = outputTensor.front().GetTensorMutableData<float>();
    std::vector<float> outputData(floatArray, floatArray + outputCount);
    ...
}
```
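The C++ calls above map almost one-to-one onto the ONNXRuntime Python API. A minimal sketch (the model path, image path, and mean/std constants are assumptions for illustration):
```python
import cv2
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("Resource/Models/ch_PP-OCRv3_det_infer.onnx",
                               providers=["CPUExecutionProvider"])
img = cv2.imread("test.jpg")  # BGR, HWC, uint8
# subtract the mean and divide by the std, as substractMeanNormalize does in C++
mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
blob = ((img.astype(np.float32) / 255.0 - mean) / std).transpose(2, 0, 1)
blob = blob[np.newaxis]  # NCHW, batch of 1 (H and W should be multiples of 32)
input_name = session.get_inputs()[0].name
probs = session.run(None, {input_name: blob})[0]  # 1 x 1 x H x W probability map
```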
#### Stage 2:
Angle AngleNet::getAngle() uses the ch_ppocr_v2_cls_infer.onnx model, a pretrained classifier for text-orientation classification. It predicts which orientation class a text line belongs to, along with the class probability.
```c++
Angle AngleNet::getAngle(cv::Mat &src) {
    // build the input
    std::vector<float> inputTensorValues = substractMeanNormalize(src, meanValues, normValues);
    std::array<int64_t, 4> inputShape{1, src.channels(), src.rows, src.cols};
    auto memoryInfo = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
    Ort::Value inputTensor = Ort::Value::CreateTensor<float>(memoryInfo, inputTensorValues.data(),
                                                             inputTensorValues.size(), inputShape.data(),
                                                             inputShape.size());
    assert(inputTensor.IsTensor());
    std::vector<const char *> inputNames = {inputNamesPtr.data()->get()};
    std::vector<const char *> outputNames = {outputNamesPtr.data()->get()};
    // run inference
    auto outputTensor = session->Run(Ort::RunOptions{nullptr}, inputNames.data(), &inputTensor,
                                     inputNames.size(), outputNames.data(), outputNames.size());
    assert(outputTensor.size() == 1 && outputTensor.front().IsTensor());
    std::vector<int64_t> outputShape = outputTensor[0].GetTensorTypeAndShapeInfo().GetShape();
    int64_t outputCount = std::accumulate(outputShape.begin(), outputShape.end(), 1,
                                          std::multiplies<int64_t>());
    float *floatArray = outputTensor.front().GetTensorMutableData<float>();
    std::vector<float> outputData(floatArray, floatArray + outputCount);
    return scoreToAngle(outputData);
}
```
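scoreToAngle itself is not shown above; conceptually it just takes the argmax over the orientation scores. A minimal sketch (names are illustrative; the two classes follow the classifier's label_list ['0', '180']):
```python
def score_to_angle(output_data):
    # output_data holds one score per orientation class;
    # index 0 means upright, index 1 means rotated by 180 degrees.
    best = max(range(len(output_data)), key=lambda i: output_data[i])
    return best, output_data[best]  # class index and its confidence
```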
#### Stage 3:
TextLine CrnnNet::getTextLine() uses ch_ppocr_v3_rec_infer.onnx, a pretrained text-recognition model. It takes the image region of one text box as input and returns the recognized text for that region.
```c++
TextLine CrnnNet::getTextLine(const cv::Mat &src) {
    // build the input: scale the crop to the fixed input height
    float scale = (float) dstHeight / (float) src.rows;
    int dstWidth = int((float) src.cols * scale);
    cv::Mat srcResize;
    resize(src, srcResize, cv::Size(dstWidth, dstHeight));
    std::vector<float> inputTensorValues = substractMeanNormalize(srcResize, meanValues, normValues);
    std::array<int64_t, 4> inputShape{1, srcResize.channels(), srcResize.rows, srcResize.cols};
    auto memoryInfo = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
    Ort::Value inputTensor = Ort::Value::CreateTensor<float>(memoryInfo, inputTensorValues.data(),
                                                             inputTensorValues.size(), inputShape.data(),
                                                             inputShape.size());
    assert(inputTensor.IsTensor());
    std::vector<const char *> inputNames = {inputNamesPtr.data()->get()};
    std::vector<const char *> outputNames = {outputNamesPtr.data()->get()};
    // run inference
    auto outputTensor = session->Run(Ort::RunOptions{nullptr}, inputNames.data(), &inputTensor,
                                     inputNames.size(), outputNames.data(), outputNames.size());
    assert(outputTensor.size() == 1 && outputTensor.front().IsTensor());
    std::vector<int64_t> outputShape = outputTensor[0].GetTensorTypeAndShapeInfo().GetShape();
    int64_t outputCount = std::accumulate(outputShape.begin(), outputShape.end(), 1,
                                          std::multiplies<int64_t>());
    float *floatArray = outputTensor.front().GetTensorMutableData<float>();
    std::vector<float> outputData(floatArray, floatArray + outputCount);
    return scoreToTextLine(outputData, outputShape[1], outputShape[2]);
}
```
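scoreToTextLine performs greedy CTC decoding on the recognizer output (shape 1 x T x C, matching the outputShape[1] and outputShape[2] arguments above): take the argmax per time step, then drop repeats and the blank class. A Python sketch under those assumptions (blank index 0 and the `keys` character table are assumptions):
```python
import numpy as np

def ctc_greedy_decode(probs: np.ndarray, keys: list):
    # probs: T x C per-time-step class scores; keys: the character table
    indices = probs.argmax(axis=1)
    text, scores, last = [], [], 0
    for t, idx in enumerate(indices):
        if idx != 0 and idx != last:  # skip blank (index 0) and repeated labels
            text.append(keys[idx - 1])
            scores.append(probs[t, idx])
        last = idx
    return "".join(text), float(np.mean(scores)) if scores else 0.0
```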
# RapidOcr
This example uses the RapidOcr model to show how to run inference for an image text-recognition model with the ONNXRuntime Python API: preprocessing, running inference, and retrieving the results.
## Model overview
This example uses three models: ch_PP-OCRv3_det + ch_ppocr_mobile_v2.0_cls + ch_PP-OCRv3_rec. The onnx files live in the Resource/Models/ folder. The model structure, including each model's inputs and outputs, can be inspected with netron (https://netron.app/).
## Preprocessing
Before the data is fed to the models, the image is preprocessed: it is read, resized, padded, and scaled.
This example implements the preprocessing with OpenCV:
### TextDetector preprocessing
```python
pre_process_list = {
    "DetResizeForTest": {
        "limit_side_len": config.get("limit_side_len", 736),
        "limit_type": config.get("limit_type", "min"),
    },
    "NormalizeImage": {
        "std": [0.229, 0.224, 0.225],
        "mean": [0.485, 0.456, 0.406],
        "scale": "1./255.",
        "order": "hwc",
    },
    "ToCHWImage": None,
    "KeepKeys": {"keep_keys": ["image", "shape"]},
}
self.preprocess_op = create_operators(pre_process_list)

post_process = {
    "thresh": config.get("thresh", 0.3),
    "box_thresh": config.get("box_thresh", 0.5),
    "max_candidates": config.get("max_candidates", 1000),
    "unclip_ratio": config.get("unclip_ratio", 1.6),
    "use_dilation": config.get("use_dilation", True),
    "score_mode": config.get("score_mode", "fast"),
}
```
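The semantics of `DetResizeForTest` with `limit_type: "min"` follow the PaddleOCR operator of the same name: scale the image so its shorter side is at least `limit_side_len`, then round both sides to multiples of 32. A sketch of the shape computation:
```python
def det_resize_shape(h: int, w: int, limit_side_len: int = 736):
    # Upscale only when the shorter side is below the limit.
    ratio = limit_side_len / float(min(h, w)) if min(h, w) < limit_side_len else 1.0
    # DB-style detectors expect H and W to be multiples of 32.
    resize_h = max(32, int(round(h * ratio / 32) * 32))
    resize_w = max(32, int(round(w * ratio / 32) * 32))
    return resize_h, resize_w
```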
### TextClassifier preprocessing
```python
def resize_norm_img(self, img):
    img_c, img_h, img_w = self.cls_image_shape
    h, w = img.shape[:2]
    ratio = w / float(h)
    if math.ceil(img_h * ratio) > img_w:
        resized_w = img_w
    else:
        resized_w = int(math.ceil(img_h * ratio))
    resized_image = cv2.resize(img, (resized_w, img_h))
    resized_image = resized_image.astype("float32")
    if img_c == 1:
        resized_image = resized_image / 255
        resized_image = resized_image[np.newaxis, :]
    else:
        resized_image = resized_image.transpose((2, 0, 1)) / 255
    resized_image -= 0.5
    resized_image /= 0.5
    padding_im = np.zeros((img_c, img_h, img_w), dtype=np.float32)
    padding_im[:, :, :resized_w] = resized_image
    return padding_im
```
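A quick check of the normalization above: dividing by 255, subtracting 0.5, and dividing by 0.5 maps uint8 pixels onto [-1, 1]:
```python
import numpy as np

px = np.array([0, 128, 255], dtype=np.float32)
print((px / 255 - 0.5) / 0.5)  # [-1.  0.00392157  1.]
```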
### TextRecognizer preprocessing
```python
def resize_norm_img(self, img, max_wh_ratio):
    img_channel, img_height, img_width = self.rec_image_shape
    assert img_channel == img.shape[2]
    img_width = int(img_height * max_wh_ratio)
    h, w = img.shape[:2]
    ratio = w / float(h)
    if math.ceil(img_height * ratio) > img_width:
        resized_w = img_width
    else:
        resized_w = int(math.ceil(img_height * ratio))
    resized_image = cv2.resize(img, (resized_w, img_height))
    resized_image = resized_image.astype("float32")
    resized_image = resized_image.transpose((2, 0, 1)) / 255
    resized_image -= 0.5
    resized_image /= 0.5
    padding_im = np.zeros((img_channel, img_height, img_width), dtype=np.float32)
    padding_im[:, :, 0:resized_w] = resized_image
    return padding_im
```
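The `max_wh_ratio` argument is typically the widest aspect ratio in the current batch, floored at the model's default W/H, so one padded width fits every crop. A sketch following the PaddleOCR convention (the `rec_image_shape` default is an assumption):
```python
def batch_max_wh_ratio(crops, rec_image_shape=(3, 48, 320)):
    _, img_h, img_w = rec_image_shape
    max_wh_ratio = img_w / float(img_h)
    for crop in crops:
        h, w = crop.shape[:2]
        max_wh_ratio = max(max_wh_ratio, w / float(h))
    return max_wh_ratio
```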
## Inference
### Inference has three stages:
#### Stage 1:
TextDetector uses the ch_ppocr_v3_det_infer.onnx model, a pretrained text-detection model. It detects the text regions in an image and returns the positions and bounding boxes of the detected text.
```python
class TextDetector:
    ...
    post_process = {
        "thresh": config.get("thresh", 0.3),
        "box_thresh": config.get("box_thresh", 0.5),
        "max_candidates": config.get("max_candidates", 1000),
        "unclip_ratio": config.get("unclip_ratio", 1.6),
        "use_dilation": config.get("use_dilation", True),
        "score_mode": config.get("score_mode", "fast"),
    }
    self.postprocess_op = DBPostProcess(**post_process)
    self.infer = OrtInferSession(config)
    ...
```
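DBPostProcess turns the detector's probability map into boxes roughly as follows: binarize at `thresh`, extract contours, score each candidate region, and keep those above `box_thresh`. A simplified sketch (the real implementation also unclips boxes by `unclip_ratio` and can dilate the binary map):
```python
import cv2
import numpy as np

def simple_db_postprocess(prob_map, thresh=0.3, box_thresh=0.5):
    # prob_map: H x W probability map produced by the detection model
    binary = (prob_map > thresh).astype(np.uint8)
    contours, _ = cv2.findContours(binary, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    boxes = []
    for cnt in contours:
        mask = np.zeros_like(binary)
        cv2.drawContours(mask, [cnt], -1, 1, -1)       # fill the candidate region
        if prob_map[mask == 1].mean() < box_thresh:    # score the candidate
            continue
        boxes.append(cv2.boxPoints(cv2.minAreaRect(cnt)))  # 4-point box
    return boxes
```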
#### Stage 2:
TextClassifier uses the ch_ppocr_v2_cls_infer.onnx model, a pretrained classifier for text-orientation classification. It predicts which orientation class a text line belongs to, along with the class probability.
```python
class TextClassifier:
    ...
    def __init__(self, config):
        self.cls_image_shape = config["cls_image_shape"]
        self.cls_batch_num = config["cls_batch_num"]
        self.cls_thresh = config["cls_thresh"]
        self.postprocess_op = ClsPostProcess(config["label_list"])
        self.infer = OrtInferSession(config)
    ...
```
#### Stage 3:
TextRecognizer uses ch_ppocr_v3_rec_infer.onnx, a pretrained text-recognition model. It takes the image region of one text box as input and returns the recognized text for that region.
```python
class TextRecognizer:
    ...
    def __init__(self, config):
        # The recognizer's __init__ mirrors the classifier's: it reads its
        # shape/batch settings, builds a CTC decoding post-process op, and
        # opens an ONNXRuntime session. The config key names here are
        # assumptions for illustration.
        self.rec_image_shape = config["rec_img_shape"]
        self.rec_batch_num = config["rec_batch_num"]
        self.postprocess_op = CTCLabelDecode(config["keys_path"])
        self.infer = OrtInferSession(config)
    ...
```
# import rapidocr_onnxruntime
from rapidocr_onnxruntime import RapidOCR

if __name__ == "__main__":
    rapid_ocr = RapidOCR()
    # set the image path
    image_path = "../Resource/Images/1.jpg"
    with open(image_path, "rb") as f:
        img = f.read()
    # run text recognition on the image
    result, elapse_list = rapid_ocr(img)
    print(result)
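    # `result` is typically a list of [box, text, score] entries; the exact
    # shape here is an assumption based on rapidocr_onnxruntime's README.
    if result:
        for box, text, score in result:
            print(f"{text} ({score}) at {box}")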
from .rapid_ocr_api import RapidOCR
from .utils import LoadImageError
# Model parameters
cls_image_shape: [3, 48, 192]
cls_batch_num: 6
cls_thresh: 0.9
label_list: ['0', '180']
import argparse
import copy
import math
import time
from typing import List
import cv2
import numpy as np
from rapidocr_onnxruntime.utils import OrtInferSession, read_yaml
from .utils import ClsPostProcess
class TextClassifier:
    def __init__(self, config):
        self.cls_image_shape = config["cls_image_shape"]
        self.cls_batch_num = config["cls_batch_num"]
        self.cls_thresh = config["cls_thresh"]
        self.postprocess_op = ClsPostProcess(config["label_list"])
        self.infer = OrtInferSession(config)

    def __call__(self, img_list: List[np.ndarray]):
        if isinstance(img_list, np.ndarray):
            img_list = [img_list]
        img_list = copy.deepcopy(img_list)

        # Calculate the aspect ratio of all text bars
        width_list = [img.shape[1] / float(img.shape[0]) for img in img_list]

        # Sorting can speed up the cls process
        indices = np.argsort(np.array(width_list))

        img_num = len(img_list)
        cls_res = [["", 0.0]] * img_num
        batch_num = self.cls_batch_num
        elapse = 0
        for beg_img_no in range(0, img_num, batch_num):
            end_img_no = min(img_num, beg_img_no + batch_num)

            norm_img_batch = []
            for ino in range(beg_img_no, end_img_no):
                norm_img = self.resize_norm_img(img_list[indices[ino]])
                norm_img = norm_img[np.newaxis, :]
                norm_img_batch.append(norm_img)
            norm_img_batch = np.concatenate(norm_img_batch).astype(np.float32)

            starttime = time.time()
            prob_out = self.infer(norm_img_batch)[0]
            cls_result = self.postprocess_op(prob_out)
            elapse += time.time() - starttime

            for rno in range(len(cls_result)):
                label, score = cls_result[rno]
                cls_res[indices[beg_img_no + rno]] = [label, score]
                if "180" in label and score > self.cls_thresh:
                    # 1 == cv2.ROTATE_180: flip text lines detected upside down
                    img_list[indices[beg_img_no + rno]] = cv2.rotate(
                        img_list[indices[beg_img_no + rno]], 1
                    )
        return img_list, cls_res, elapse

    def resize_norm_img(self, img):
        img_c, img_h, img_w = self.cls_image_shape
        h, w = img.shape[:2]
        ratio = w / float(h)
        if math.ceil(img_h * ratio) > img_w:
            resized_w = img_w
        else:
            resized_w = int(math.ceil(img_h * ratio))
        resized_image = cv2.resize(img, (resized_w, img_h))
        resized_image = resized_image.astype("float32")
        if img_c == 1:
            resized_image = resized_image / 255
            resized_image = resized_image[np.newaxis, :]
        else:
            resized_image = resized_image.transpose((2, 0, 1)) / 255
        resized_image -= 0.5
        resized_image /= 0.5
        padding_im = np.zeros((img_c, img_h, img_w), dtype=np.float32)
        padding_im[:, :, :resized_w] = resized_image
        return padding_im
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--image_path", type=str, help="image_dir|image_path")
    parser.add_argument("--config_path", type=str, default="config.yaml")
    args = parser.parse_args()

    config = read_yaml(args.config_path)
    text_classifier = TextClassifier(config)

    img = cv2.imread(args.image_path)
    img_list, cls_res, predict_time = text_classifier(img)
    for ino in range(len(img_list)):
        print(f"cls result:{cls_res[ino]}")
class ClsPostProcess:
    """Convert between text-label and text-index"""

    def __init__(self, label_list):
        super(ClsPostProcess, self).__init__()
        self.label_list = label_list

    def __call__(self, preds, label=None):
        pred_idxs = preds.argmax(axis=1)
        decode_out = [
            (self.label_list[idx], preds[i, idx]) for i, idx in enumerate(pred_idxs)
        ]
        if label is None:
            return decode_out
        label = [(self.label_list[idx], 1.0) for idx in label]
        return decode_out, label
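# A quick usage sketch for ClsPostProcess with the classifier's actual
# label_list (the probability values are illustrative):
if __name__ == "__main__":
    import numpy as np
    post = ClsPostProcess(["0", "180"])
    preds = np.array([[0.95, 0.05], [0.2, 0.8]], dtype=np.float32)
    print(post(preds))  # [('0', 0.95), ('180', 0.8)]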
# Pre-processing parameters
pre_process:
  DetResizeForTest:
    limit_side_len: 736
    limit_type: min
  NormalizeImage:
    std: [0.229, 0.224, 0.225]
    mean: [0.485, 0.456, 0.406]
    scale: 1./255.
    order: hwc
  ToCHWImage:
  KeepKeys:
    keep_keys: ['image', 'shape']
# Post-processing parameters
post_process:
  thresh: 0.3
  box_thresh: 0.5
  max_candidates: 1000
  unclip_ratio: 1.6
  use_dilation: true
  score_mode: "fast"
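# A usage sketch for the config above, using the read_yaml helper that this
# repo's scripts already use (key nesting follows the YAML as written):
#   from rapidocr_onnxruntime.utils import read_yaml
#   config = read_yaml("config.yaml")
#   config["pre_process"]["DetResizeForTest"]["limit_side_len"]  # 736
#   config["post_process"]["box_thresh"]                         # 0.5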
import argparse
import time
import cv2
import numpy as np
from rapidocr_onnxruntime.utils import OrtInferSession, read_yaml
from .utils import DBPostProcess, create_operators, transform
class TextDetector:
    def __init__(self, config):
        pre_process_list = {
            "DetResizeForTest": {
                "limit_side_len": config.get("limit_side_len", 736),
                "limit_type": config.get("limit_type", "min"),
            },
            "NormalizeImage": {
                "std": [0.229, 0.224, 0.225],
                "mean": [0.485, 0.456, 0.406],
                "scale": "1./255.",
                "order": "hwc",
            },
            "ToCHWImage": None,
            "KeepKeys": {"keep_keys": ["image", "shape"]},
        }
        self.preprocess_op = create_operators(pre_process_list)

        post_process = {
            "thresh": config.get("thresh", 0.3),
            "box_thresh": config.get("box_thresh", 0.5),
            "max_candidates": config.get("max_candidates", 1000),
            "unclip_ratio": config.get("unclip_ratio", 1.6),
            "use_dilation": config.get("use_dilation", True),
            "score_mode": config.get("score_mode", "fast"),
        }
        self.postprocess_op = DBPostProcess(**post_process)
        self.infer = OrtInferSession(config)

    def __call__(self, img):
        if img is None:
            raise ValueError("img is None")

        ori_im_shape = img.shape[:2]

        data = {"image": img}
        data = transform(data, self.preprocess_op)
        img, shape_list = data
        if img is None:
            return None, 0

        img = np.expand_dims(img, axis=0).astype(np.float32)
        shape_list = np.expand_dims(shape_list, axis=0)

        starttime = time.time()
        preds = self.infer(img)[0]
        post_result = self.postprocess_op(preds, shape_list)

        dt_boxes = post_result[0]["points"]
        dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im_shape)
        elapse = time.time() - starttime
        return dt_boxes, elapse

    def order_points_clockwise(self, pts):
        """
        reference from:
        https://github.com/jrosebr1/imutils/blob/master/imutils/perspective.py
        sort the points based on their x-coordinates
        """
        xSorted = pts[np.argsort(pts[:, 0]), :]

        # grab the left-most and right-most points from the sorted
        # x-coordinate points
        leftMost = xSorted[:2, :]
        rightMost = xSorted[2:, :]

        # now, sort the left-most coordinates according to their
        # y-coordinates so we can grab the top-left and bottom-left
        # points, respectively
        leftMost = leftMost[np.argsort(leftMost[:, 1]), :]
        (tl, bl) = leftMost

        rightMost = rightMost[np.argsort(rightMost[:, 1]), :]
        (tr, br) = rightMost

        rect = np.array([tl, tr, br, bl], dtype="float32")
        return rect

    def clip_det_res(self, points, img_height, img_width):
        for pno in range(points.shape[0]):
            points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1))
            points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1))
        return points

    def filter_tag_det_res(self, dt_boxes, image_shape):
        img_height, img_width = image_shape[:2]
        dt_boxes_new = []
        for box in dt_boxes:
            box = self.order_points_clockwise(box)
            box = self.clip_det_res(box, img_height, img_width)
            rect_width = int(np.linalg.norm(box[0] - box[1]))
            rect_height = int(np.linalg.norm(box[0] - box[3]))
            if rect_width <= 3 or rect_height <= 3:
                continue
            dt_boxes_new.append(box)
        dt_boxes = np.array(dt_boxes_new)
        return dt_boxes
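# A standalone sanity check for order_points_clockwise: corners given in
# arbitrary order come back as top-left, top-right, bottom-right, bottom-left.
def _check_order_points():
    pts = np.array([[10, 80], [10, 10], [90, 80], [90, 10]], dtype="float32")
    rect = TextDetector.order_points_clockwise(None, pts)  # self is unused here
    assert (rect == np.array([[10, 10], [90, 10], [90, 80], [10, 80]])).all()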
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_path", type=str, default="config.yaml")
    parser.add_argument("--image_path", type=str, default=None)
    args = parser.parse_args()

    config = read_yaml(args.config_path)
    text_detector = TextDetector(config)

    img = cv2.imread(args.image_path)
    dt_boxes, elapse = text_detector(img)

    from utils import draw_text_det_res

    src_im = draw_text_det_res(dt_boxes, args.image_path)
    cv2.imwrite("det_results.jpg", src_im)
    print("The det_results.jpg has been saved in the current directory.")