Commit 417a4ca0 authored by liuhy

1. Add a warm-up step before inference. 2. Add overlaying of the recognized OCR text onto the result image.

parent 369751c2
@@ -10,6 +10,7 @@ set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} -std=c++17)
 set(CMAKE_BUILD_TYPE release)
 set(INCLUDE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/Src/
+                 /usr/include/freetype2
                  $ENV{DTKROOT}/include/
                  ${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility
                  ${CMAKE_CURRENT_SOURCE_DIR}/3rdParty/opencv/include)
@@ -17,6 +18,7 @@ include_directories(${INCLUDE_PATH})
 # Add dependency library paths
 set(LIBRARY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/3rdParty/opencv/lib
+                 /usr/lib/x86_64-linux-gnu
                  $ENV{DTKROOT}/lib/)
 link_directories(${LIBRARY_PATH})
@@ -24,6 +26,7 @@ link_directories(${LIBRARY_PATH})
 set(LIBRARY opencv_core
             opencv_imgproc
             opencv_imgcodecs
+            freetype
             opencv_dnn
             migraphx
             migraphx_gpu
@@ -36,6 +39,7 @@ link_libraries(${LIBRARY})
 set(SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/Src/main.cpp
                  ${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility/Filesystem.cpp
                  ${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility/clipper.cpp
+                 ${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility/cv_put_Text.cpp
                  ${CMAKE_CURRENT_SOURCE_DIR}/Src/ocr_engine.cpp)
 # Add executable target
...
Doc/Images/CRNN.png: image replaced (112 KB -> 96.4 KB)
Doc/Images/DBNet.png: image replaced (597 KB -> 311 KB)
@@ -4,11 +4,15 @@ PP-OCRv5 is the new generation of the PP-OCR text recognition solution, focusing on multi-scenario
 ## Model overview
 ### Text detection
-Text detection uses DBNet (paper: https://arxiv.org/pdf/1911.08947); network structure: ![alt text](Images/DBNet.png). The model outputs a probability map, and the Vatti clipping algorithm is used to simplify the text-region polygons (the sample relies on the Clipper library). The sample uses a dynamic shape (N,3,H,W) with a maximum input shape of [1,3,640,640]. Model path: Resource/Models/ppocrv5_server_det_infer.onnx
+Text detection uses DBNet (paper: https://arxiv.org/pdf/1911.08947); network structure:
+![alt text](Images/DBNet.png)
+The model outputs a probability map, and the Vatti clipping algorithm is used to simplify the text-region polygons (the sample relies on the Clipper library). The sample's model input shape is [1,3,640,640]. Model path: Resource/Models/ppocrv5_server_det_infer.onnx
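As background for the postprocessing just mentioned, the flow is: threshold the probability map, find contours, then grow each polygon outward with Vatti clipping before taking its bounding box. A minimal sketch, assuming pyclipper (the library the Python sample imports); `prob_map`, the threshold, and the unclip ratio are illustrative, not the repo's exact implementation:
```python
import cv2
import numpy as np
import pyclipper

def boxes_from_prob_map(prob_map, seg_thresh=0.3, unclip_ratio=1.5):
    # Binarize the DBNet probability map.
    bitmap = (prob_map > seg_thresh).astype(np.uint8)
    contours, _ = cv2.findContours(bitmap * 255, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    boxes = []
    for contour in contours:
        pts = contour.reshape(-1, 2)
        if len(pts) < 4:
            continue
        area = cv2.contourArea(pts)
        length = cv2.arcLength(pts, True)
        if length == 0:
            continue
        # Offset distance derived from the polygon's area and perimeter.
        distance = area * unclip_ratio / length
        # Vatti clipping: expand the polygon outward by `distance`.
        offset = pyclipper.PyclipperOffset()
        offset.AddPath(pts.tolist(), pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
        expanded = offset.Execute(distance)
        if expanded:
            rect = cv2.minAreaRect(np.array(expanded[0]))
            boxes.append(cv2.boxPoints(rect).astype(np.int32))
    return boxes
```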
 ### Text recognition
-Text recognition uses CRNN+CTCDecode (https://arxiv.org/pdf/2009.09941); network structure: ![alt text](Images/CRNN.png). The sample uses a dynamic shape (N,3,48,W) with a maximum input shape of [1,3,48,720]. Model path: Resource/Models/ppocrv5_server_rec_infer.onnx
+Text recognition uses CRNN+CTCDecode (https://arxiv.org/pdf/2009.09941); network structure:
+![alt text](Images/CRNN.png)
+The sample's model input shape is [1,3,48,720]. Model path: Resource/Models/ppocrv5_server_rec_infer.onnx
 ## Preprocessing
 ### Detection model preprocessing
 Detection model input preprocessing:
@@ -110,7 +114,7 @@ class ppOcrEngine {
                 const float segm_thres=0.3,
                 const float box_thresh=0.7,
                 bool offload_copy =true,
-                std::string precision_mode = "fp32") ;
+                std::string precision_mode = "fp16") ;
     /**
      * @brief Initialize the OCR engine
      * @param det_model_path path of the text detection model
@@ -119,7 +123,7 @@ class ppOcrEngine {
      * @param segm_thres pixel segmentation threshold
      * @param box_thresh text-region box threshold
      * @param offload_copy memory copy mode. Two modes are supported: offload_copy=true and offload_copy=false. With true, no explicit memory copies are needed; with false, device memory for inputs and outputs must be pre-allocated, the preprocessed data is copied to device memory before inference, and the model output is copied back from device memory afterwards
-     * @param precision_mode precision mode, supports fp32 and fp16
+     * @param precision_mode precision mode, supports fp32 and fp16; the default is fp16
      *
      * @return NONE
      */
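The two copy modes above map onto MIGraphX's Python API in a few lines; a condensed, non-authoritative sketch (`model`, `input_name`, `data`, and the pre-allocated `d_mem` are stand-in names, mirroring the Python tutorial later in this commit):
```python
import numpy as np
import migraphx

def run_once(model, input_name, data, offload_copy=True, d_mem=None):
    if offload_copy:
        # offload_copy=True: MIGraphX moves data between host and device internally.
        results = model.run({input_name: migraphx.argument(data)})
        return np.array(results[0])
    # offload_copy=False: d_mem holds pre-allocated device output buffers;
    # copy the preprocessed input to the GPU, run, then copy the result back.
    d_mem[input_name] = migraphx.to_gpu(migraphx.argument(data))
    results = model.run(d_mem)
    return np.array(migraphx.from_gpu(results[0]))
```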
@@ -130,36 +134,11 @@ class ppOcrEngine {
 class CTCDecode
 {
 private:
-    //inference image
-    float* data;
-    std::unordered_map<std::string, migraphx::argument> device_data;
-    migraphx::program net;
-    int batch_size;
-    int net_input_width;
-    int net_input_height;
-    int net_input_channel;
-    bool offload_copy;
-    std::string precision_mode;
-    std::unordered_map<std::string, migraphx::argument> dev_argument;
-    void* input_buffer_device;
-    void* output_buffer_device;
-    void* output_buffer_host;
-    migraphx::shape input_shape;
-    migraphx::shape output_shape;
-    std::string input_name;
-    std::string output_name;
-    //postprocess: n_channel -> model output channels, feature_size -> feature size of one channel
-    int n_channel;
-    int feature_size;
-    std::vector<std::string> k_words;
+    ...
 public:
     CTCDecode(std::string rec_model_path,
-              std::string precision_mode="fp32",
+              std::string precision_mode="fp16",
               int image_width=480,
               int image_height=48,
               int channel=3,
@@ -169,73 +148,21 @@ class ppOcrEngine {
     ~CTCDecode();
     /**
-     * @brief Text recognition and decoding; supports predicting at most 90 characters over a character set of 18385 characters
+     * @brief Text recognition and decoding API; supports predicting at most 90 characters over a character set of 18385 characters
+     * @param img input image
+     * @return the decoded string
      */
     std::string forward(cv::Mat& img);
 private:
-    /**
-     * @brief Preprocessing
-     *        pixel = (src_img*scale - 0.5)/0.5;
-     *        scale = 1.0/255
-     * @param img text image
-     * @param data preprocessing output
-     * @param img_w model input width
-     * @param img_h model input height
-     * @return true on success, false on failure
-     */
-    bool preproc(cv::Mat img,float* data,int img_w=480,int img_h=48);
-    /**
-     * @brief Model postprocessing: take the most probable character in each row and assemble a sentence of at most 90 characters; the model output shape is [1,90,18385]
-     * @param feature model output
-     * @return the text on success, "" on failure
-     */
-    std::string postprocess(float* feature);
-    /**
-     * @brief Decoding: map the model predictions onto the character set
-     * @param probs maximum probabilities predicted by the model
-     * @param indexs indices of the maximum probabilities
-     * @param mean_prob average probability of the predicted sentence
-     * @return the text on success, "" on failure
-     */
-    std::string decode(std::vector<float>& probs,std::vector<int>& indexs,float& mean_prob);
+    ...
 };
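The postprocess/decode pair described above is a standard greedy CTC decode. A minimal numpy sketch under the stated shapes, assuming a model output `feature` of [1, 90, 18385] and a character list `k_words` (the member named in the class); the blank index 0 follows the decoder code later in this commit:
```python
import numpy as np

def ctc_greedy_decode(feature, k_words, blank_id=0):
    """feature: [1, T, C] probabilities; k_words: index -> character."""
    probs = feature[0].max(axis=1)      # best probability per time step
    indexs = feature[0].argmax(axis=1)  # best character index per time step
    chars, confs = [], []
    prev = blank_id
    for idx, p in zip(indexs, probs):
        # CTC rule: drop blanks and collapse consecutive repeats.
        if idx != blank_id and idx != prev:
            chars.append(k_words[idx])
            confs.append(p)
        prev = idx
    mean_prob = float(np.mean(confs)) if confs else 0.0
    return "".join(chars), mean_prob
```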
 class OcrDet
 {
 private:
-    std::string precision_mode;
-    bool offload_copy;
-    migraphx::program net;
-    migraphx::shape input_shape;
-    migraphx::shape output_shape;
-    std::string input_name;
-    std::string output_name;
-    int det_batch_size;
-    int data_size ;
-    float segm_thres;
-    float box_thres;
-    int net_input_width;
-    int net_input_height;
-    int net_input_channel;
-    float* data;
-    //Allocate device buffer and host buffer, if offload_copy is false
-    std::unordered_map<std::string, migraphx::argument> dev_argument;
-    void* input_buffer_device;
-    void* output_buffer_device;
-    void* output_buffer_host;
-    //postprocess
-    int n_channel;
-    int feature_size; //single-channel feature map size
-    int output_width;
-    int output_height;
-    int max_candidates; //maximum number of candidate contours
+    ...
 public:
     OcrDet(std::string det_model_path,
@@ -244,113 +171,19 @@ class ppOcrEngine {
            float segm_thres = 0.3,
            float box_thresh = 0.7);
     ~OcrDet();
-    bool forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes);
-private:
-    /**
-     * @brief Preprocessing
-     *        pixel = (scale*src_img - mean)/std;
-     *        scale = 1.0/255
-     *        mean = [0.485, 0.456, 0.406]
-     *        std = [0.229, 0.224, 0.225]
-     * @param img text image
-     * @param data preprocessing output
-     * @return the scaling ratios in the w and h dimensions on success
-     */
-    cv::Size preproc(cv::Mat img,float* data);
-    /**
-     * @brief Postprocessing: extract text regions from the binary map predicted by the model
-     * @param feature predicted tensor (text detection uses DBNet here)
-     * @param boxes text-region coordinates
-     * @return 0 on success, -1 on failure
-     */
-    int postprocess(float* feature, std::vector<std::vector<std::vector<int>>> &boxes);
-    int boxes_from_bitmap(cv::Mat& bit_map,std::vector<T_BOX>& box);
-    std::vector<std::vector<std::vector<int>>>boxes_from_bitmap(
-        const cv::Mat pred, const cv::Mat bitmap, const float &box_thresh,
-        const float &det_db_unclip_ratio, const bool &use_polygon_score);
-    std::vector<std::vector<float>> Mat2Vector(cv::Mat mat);
-    /**
-     * @brief Compute the average score inside a polygon region
-     * @param contour contour points of the text region
-     * @param pred binary map predicted by the model
-     * @return score
-     */
-    float polygon_score_acc(std::vector<cv::Point> contour,cv::Mat pred);
-    /**
-     * @brief Shrink or expand the predicted region by unclip_ratio to find a better-fitting text region
-     * @param box text-region coordinates
-     * @param pred binary map predicted by the model
-     * @return the processed text region
-     */
-    cv::RotatedRect unClip(std::vector<std::vector<float>> box,
-                           const float &unclip_ratio);
-    /**
-     * @brief Compute the offset distance
-     *        distance = area * unclip_ratio / dist;
-     *        area = ∑(x_i*y_{i+1} - x_{i+1}*y_i)
-     *        dist = sqrtf(dx * dx + dy * dy)
-     * @param box text-region coordinates
-     * @param unclip_ratio scaling ratio
-     * @param distance offset distance
-     * @return NONE
-     */
-    void get_contour_area(const std::vector<std::vector<float>> &box,
-                          float unclip_ratio, float &distance) ;
-    /**
-     * @brief Filter out invalid text regions: first map the boxes back to the original image, then drop invalid ones
-     * @param boxes text-region coordinates
-     * @param ratio_h vertical scaling ratio
-     * @param ratio_w horizontal scaling ratio
-     * @param srcimg original image
-     * @return valid text-region coordinates
-     */
-    std::vector<std::vector<std::vector<int>>> filter_det_res(std::vector<std::vector<std::vector<int>>> boxes,
-                                                              float ratio_h, float ratio_w, cv::Mat srcimg);
-    /**
-     * @brief Sort the text regions top-to-bottom, left-to-right
-     * @param pts text-region coordinates
-     * @return sorted text-region coordinates
-     */
-    std::vector<std::vector<int>> order_points_clockwise(std::vector<std::vector<int>> pts);
-    /**
-     * @brief Get the minimum bounding rectangle
-     * @param box coordinates of the region's minimum bounding rectangle
-     * @param ssid the longest side of the box
-     * @return valid text-region coordinates
-     */
-    std::vector<std::vector<float>> get_mini_boxes(cv::RotatedRect box,float &ssid) ;
-    /**
-     * @brief Compute the average score of the t_rect region on the bitmap
-     * @param box_array text regions predicted by the model
-     * @param pred binary map predicted by the model
-     * @return score
-     */
-    float box_score_fast(std::vector<std::vector<float>> box_array,cv::Mat pred) ;
-    void visualize_boxes(const cv::Mat &srcimg,
-                         const std::vector<std::vector<std::vector<int>>> &boxes) ;
-    bool text_recognition(const cv::Mat &srcimg,
-                          const std::vector<std::vector<std::vector<int>>> &boxes);
+    /**
+     * @brief Text detection inference API
+     * @param img original image
+     * @param text_roi_boxes text-region coordinates, format: [[[tl.x, tl.y], [tr.x, tr.y], [br.x, br.y], [bl.x, bl.y]]]
+     *                       (top-left, top-right, bottom-right, bottom-left corners)
+     * @return true on success, false on failure
+     */
+    bool forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes);
+private:
+    ...
 };
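The get_contour_area formula documented above is the shoelace area scaled by unclip_ratio and divided by the polygon perimeter. A direct numpy transcription, offered as a sketch rather than the repo's implementation:
```python
import numpy as np

def get_contour_area(box, unclip_ratio):
    """box: list of [x, y] vertices. Returns the unclip offset distance."""
    pts = np.asarray(box, dtype=np.float32)
    nxt = np.roll(pts, -1, axis=0)  # vertex i+1, wrapping around
    # Shoelace formula: area = 0.5 * |sum(x_i*y_{i+1} - x_{i+1}*y_i)|
    area = 0.5 * abs(np.sum(pts[:, 0] * nxt[:, 1] - nxt[:, 0] * pts[:, 1]))
    # Perimeter: sum of edge lengths sqrt(dx*dx + dy*dy)
    dist = np.sum(np.hypot(*(nxt - pts).T))
    return area * unclip_ratio / dist
```
The resulting distance is what unClip expands the polygon by via the Vatti clipping offset.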
@@ -358,119 +191,84 @@ class ppOcrEngine {
 ## Inference
-### Text detection model inference
+- text detection
+- text recognition and decoding
+- text-box visualization
+- OCR result visualization
 ```c++
-bool OcrDet::forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes)
-{
-    std::vector<std::vector<std::vector<int>>> boxes;
-    //preprocess the input data
-    cv::Size ratio = preproc(img,data);
-    /*
-    Two copy modes are supported: offload_copy=true and offload_copy=false. With true, no explicit memory copies are needed; with false, device memory for inputs and outputs must be pre-allocated, the preprocessed data copied to device memory before inference, and the model output copied back from device memory afterwards for postprocessing.
-    */
-    if( this->offload_copy ==false )
-    {
-        hipMemcpy(input_buffer_device,
-                  (void*)data,
-                  this->input_shape.bytes(),
-                  hipMemcpyHostToDevice);
-        std::vector<migraphx::argument> results = net.eval(dev_argument);
-        hipMemcpy(output_buffer_host,
-                  (void*)output_buffer_device,
-                  output_shape.bytes(),
-                  hipMemcpyDeviceToHost);
-        postprocess((float *)output_buffer_host,boxes);
-        std::cout<<"copy mode ..."<<std::endl;
-    }else{
-        std::unordered_map<std::string, migraphx::argument> inputData;
-        inputData[input_name] = migraphx::argument{input_shape, (float *)data};
-        std::vector<migraphx::argument> results = net.eval(inputData);
-        migraphx::argument result = results[0] ; //get the output data
-        postprocess((float *)result.data(),boxes);
-        std::cout<<"offload copy mode ..."<<std::endl;
-    }
-    //compute the uniform scaling ratios
-    float ratio_w = float(net_input_width) / float(img.cols);
-    float ratio_h = float(net_input_height) / float(img.rows);
-    //filter out invalid boxes
-    text_roi_boxes = filter_det_res(boxes, ratio_h, ratio_w, img);
-    //visualize the detection results
-    visualize_boxes(img,text_roi_boxes);
-    // TextRecognition(img,boxes);
-    return true;
-}
-```
-### Text recognition inference
-```c++
-std::string CTCDecode::forward(cv::Mat& img)
-{
-    //preprocess
-    preproc(img,data,net_input_width,net_input_height);
-    /*
-    Two copy modes are supported: offload_copy=true and offload_copy=false (see the detection inference above).
-    */
-    if( this->offload_copy ==false )
-    {
-        hipMemcpy(input_buffer_device,
-                  (void*)data,
-                  this->input_shape.bytes(),
-                  hipMemcpyHostToDevice);
-        std::vector<migraphx::argument> results = net.eval(dev_argument);
-        hipMemcpy(output_buffer_host,
-                  (void*)output_buffer_device,
-                  output_shape.bytes(),
-                  hipMemcpyDeviceToHost);
-        //postprocess: take each character's maximum probability and index, look up the character in the dictionary by index, then assemble the sentence
-        std::string text = postprocess((float *)output_buffer_device);
-        return text;
-    }else{
-        std::unordered_map<std::string, migraphx::argument> inputData;
-        inputData[input_name] = migraphx::argument{input_shape, (float *)data};
-        std::vector<migraphx::argument> results = net.eval(inputData);
-        migraphx::argument result = results[0] ;
-        std::string text = postprocess((float *)result.data());
-        // std::cout<<"ctc: offload copy mode ..."<<std::endl;
-        return text;
-    }
-}
+std::vector<std::string> ppOcrEngine::forward(cv::Mat &srcimg){
+    std::vector<std::vector<std::vector<int>>> text_roi_boxes;
+    std::vector<std::string> text_vec;
+    auto start = std::chrono::high_resolution_clock::now();
+    //text region detection
+    text_detector->forward(srcimg,text_roi_boxes);
+    if(text_roi_boxes.size() == 0)
+    {
+        std::cout<<"Not found text roi !\n";
+        return std::vector<std::string>();
+    }
+    std::vector<cv::Point> points;
+    //text recognition + decoding
+    for (int n = 0; n < text_roi_boxes.size(); n++) {
+        cv::Rect rect;
+        cv::Mat text_roi_mat;
+        rect.x = text_roi_boxes[n][0][0];
+        rect.y = text_roi_boxes[n][0][1];
+        rect.width = text_roi_boxes[n][2][0] - text_roi_boxes[n][0][0];
+        rect.height = text_roi_boxes[n][2][1] - text_roi_boxes[n][0][1];
+        if(rect.width <3 || rect.height<3)
+        {
+            continue;
+        }
+        text_roi_mat = srcimg(rect).clone();
+        std::string text = text_recognizer->forward(text_roi_mat);
+        text_vec.push_back(text);
+        points.push_back(cv::Point(rect.x,rect.y));
+    }
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+    std::cout<<"[Time info] elapsed: "<< duration_ms.count() <<" ms\n";
+    //text box visualization
+    visualize_boxes(srcimg,text_roi_boxes);
+    //OCR result visualization
+    cv::Mat res_img = visualize_text(text_vec,points, srcimg);
+    ...
+}
 ```
 # Ocrv5 API usage
 The API is called in the following steps:
 - instantiate the class
+- read the test image
 - call the recognition interface
 Example:
 ```c++
-int main(int argc, char** argv)
-{
+int main(int argc, char** argv){
     std::string det_model_onnx = "../Resource/Models/ppocrv5_server_det_infer.onnx";
     std::string rec_model_onnx = "../Resource/Models/ppocrv5_server_rec_infer.onnx";
-    std::string img_path = "../Resource/Images/20250703205038.png";
+    std::string img_path = "../Resource/Images/demo.png";
     std::string character_dict_path = "../Resource/ppocr_keys_v5.txt";
+    std::string front = "../Resource/fonts/SimHei.ttf";
     float segm_thres=0.3;
     float box_thresh=0.3;
     ppOcrEngine ocr_engine(det_model_onnx,
                            rec_model_onnx,
                            character_dict_path,
+                           front,
                            segm_thres,
                            box_thresh,
                            true,
-                           "fp32");
+                           "fp16");
     cv::Mat img=cv::imread(img_path);
     ocr_engine.forward(img);
     return 0;
 }
 ```
-The sample supports inference in two precisions (fp32 and fp16, default fp32); the precision and the memory copy mode are set via the ocr_engine constructor arguments.
+The sample supports inference in two precisions (fp32 and fp16, default fp16); the precision and the memory copy mode are set via the ocr_engine constructor arguments.
\ No newline at end of file
@@ -4,10 +4,14 @@ PP-OCRv5 is the new generation of the PP-OCR text recognition solution, focusing on multi-scenario
 ## Model overview
 ### Text detection
-Text detection uses DBNet (paper: https://arxiv.org/pdf/1911.08947); network structure: ![alt text](Images/DBNet.png). The model outputs a probability map, and the Vatti clipping algorithm is used to simplify the text-region polygons. The sample uses a dynamic shape (N,3,H,W) with a maximum input shape of [1,3,640,640]. Model path: Resource/Models/ppocrv5_server_det_infer.onnx
+Text detection uses DBNet (paper: https://arxiv.org/pdf/1911.08947); network structure:
+![alt text](Images/DBNet.png)
+The model outputs a probability map, and the Vatti clipping algorithm is used to simplify the text-region polygons (the sample relies on the Clipping library). The sample's model input shape is [1,3,640,640]. Model path: Resource/Models/ppocrv5_server_det_infer.onnx
 ### Text recognition
-Text recognition uses CRNN+CTCDecode (https://arxiv.org/pdf/2009.09941); network structure: ![alt text](Images/CRNN.png). The sample uses a dynamic shape (N,3,48,W) with a maximum input shape of [1,3,48,720]. Model path: Resource/Models/ppocrv5_server_rec_infer.onnx
+Text recognition uses CRNN+CTCDecode (https://arxiv.org/pdf/2009.09941); network structure:
+![alt text](Images/CRNN.png)
+The sample's model input shape is [1,3,48,720]. Model path: Resource/Models/ppocrv5_server_rec_infer.onnx
 ## Preprocessing
 ### Detection model preprocessing
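Detection preprocessing follows the usual PP-OCR recipe: scale pixels to [0,1], normalize with the mean/std used in TextDetector.preprocess, pad to the fixed model input, and reorder to NCHW. A hedged numpy sketch; the aspect-ratio handling here is simplified relative to the sample:
```python
import cv2
import numpy as np

def det_preprocess(img, input_size=(640, 640),
                   mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
    h, w = img.shape[:2]
    ratio = min(input_size[0] / h, input_size[1] / w)  # uniform scale, keep aspect ratio
    resize_h, resize_w = int(h * ratio), int(w * ratio)
    resized = cv2.resize(img, (resize_w, resize_h)).astype(np.float32)
    normalized = (resized / 255.0 - mean) / std        # scale to [0,1], then normalize
    padded = np.zeros((input_size[0], input_size[1], 3), np.float32)
    padded[:resize_h, :resize_w, :] = normalized       # pad along the bottom/right
    return padded.transpose(2, 0, 1)[np.newaxis], ratio  # NCHW tensor + scale ratio
```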
@@ -80,28 +84,20 @@ def preprocess(self, img, max_wh_ratio):
     imgH, imgW = self.rec_input_size
     max_h,max_w = self.rec_input_size
     h, w = img.shape[:2]
-    # re_size = (max_w,max_h)
     #keep the original H dimension
     if h <= max_h:
         ratio = max_h / h
         w = int(w*ratio)
         if w <= max_w:
             re_size =(w,max_h)
         else:
             re_size = (max_w,max_h)
     else:
         ratio = max_h/h
         w,h = int(w*ratio),max_h
         if w <= max_w:
             re_size = (w,h)
         else:
             re_size = (max_w,h)
@@ -112,12 +108,9 @@ def preprocess(self, img, max_wh_ratio):
     resized_image = resized_image.transpose((2, 0, 1)) / 255
     resized_image -= 0.5
     resized_image /= 0.5
     #pad along the right and bottom
     padding_im = np.zeros((3, imgH, imgW), dtype=np.float32)
     padding_im[:, :, 0:re_size[0]] = resized_image
     return padding_im
 ```
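To make the branching above concrete, a 32x200 crop under rec_input_size=(48, 720) takes the first branch; the sizes here are hypothetical:
```python
h, w = 32, 200             # crop height/width (hypothetical)
max_h, max_w = 48, 720     # rec_input_size
ratio = max_h / h          # 1.5
w = int(w * ratio)         # 300, which is <= max_w
re_size = (w, max_h)       # (300, 48); the normalized image is then padded to width 720
print(re_size)
```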
 ## Classes
@@ -154,7 +147,7 @@ class PPOcrV5():
         **kwargs: parameters for detection-model postprocessing
     Returns:
-        return_type: NONE.
+        return_type: no return value
     Examples:
         det_onnx_path = "PATH/TO/det_onnx_model.onnx"
@@ -198,7 +191,7 @@ class TextDetector(object):
         **kwargs: parameters for detection-model postprocessing
     Returns:
-        return_type: NONE
+        return_type: no return value
     Examples:
         self.db_detector = TextDetector(
@@ -216,7 +209,6 @@ class TextDetector(object):
     """
 class TextRecgnizer(object):
-    """Support SVTR_LCNet
+    """
     def __init__(
         self,
         rec_model_path,
@@ -240,7 +232,7 @@ class TextRecgnizer(object):
         **kwargs: parameters for recognition-model postprocessing
     Returns:
-        return_type: NONE
+        return_type: no return value
     Examples:
         self.text_extractor = TextRecgnizer(rec_model_path=rec_model_path,
@@ -252,18 +244,15 @@ class TextRecgnizer(object):
 class BaseRecLabelDecode(object):
     def __init__(self, character_dict_path=None,
                  use_space_char=False)
-    """Convert between text-label and text-index
+    """
     Text recognition (CRNN+CTC).
     Args:
         character_dict_path: path of the character dictionary file.
         use_space_char: whether the character set contains the space character.
     Returns:
-        return_type: NONE.
+        return_type: no return value.
     Examples:
     """
 class CTCLabelDecode(BaseRecLabelDecode):
@@ -277,140 +266,28 @@ class TextRecgnizer(object):
         character_dict_path: path of the character dictionary file.
         use_space_char: whether the character set contains the space character.
     Returns:
-        return_type: NONE
+        return_type: no return value
     Examples:
     """
 ```
 ## Inference
-### Text detection model inference
-```python
-def __call__(self, src_img):
-    data = self.preprocess(src_img)
-    """Two copy modes are supported: offload_copy=true and offload_copy=false. With true, no explicit memory copies are needed; with false, device memory for inputs and outputs must be pre-allocated, the preprocessed data copied to device memory before inference, and the model output copied back from device memory afterwards for postprocessing."""
-    if self.offload_copy==False:
-        self.d_mem[self.det_input_name] =migraphx.to_gpu(migraphx.argument(data["image"]))
-        results = self.db_model.run(self.d_mem)
-    else:
-        results = self.db_model.run({self.det_input_name:data["image"]})
-    if self.offload_copy==False :
-        #copy the inference result from the GPU to the CPU
-        result=migraphx.from_gpu(results[0])
-        print("offload copy model")
-        result = np.array(result)
-    else:
-        result = results[0]
-    shape_list = np.expand_dims(data["shape"], axis=0)
-    pred = np.array(result)
-    pred = pred[:, 0, :, :]
-    #keep the probabilities above the threshold
-    segmentation = pred > self.thresh
-    boxes_batch = []
-    for batch_index in range(pred.shape[0]):
-        src_h, src_w, ratio_h, ratio_w = shape_list[batch_index]
-        if self.dilation_kernel is not None:
-            mask = cv2.dilate(
-                np.array(segmentation[batch_index]).astype(np.uint8),
-                self.dilation_kernel,
-            )
-        else:
-            mask = segmentation[batch_index]
-        #extract text regions from the predicted bitmap
-        if self.box_type == "poly":
-            boxes, scores = self.polygons_from_bitmap(
-                pred[batch_index], mask, ratio_w,ratio_h, src_w, src_h
-            )
-        elif self.box_type == "quad":
-            boxes, scores = self.boxes_from_bitmap(
-                pred[batch_index], mask, ratio_w,ratio_h, src_w, src_h
-            )
-        else:
-            raise ValueError("box_type can only be one of ['quad', 'poly']")
-        boxes_batch.append(boxes)
-    #sort the text regions top-to-bottom, left-to-right
-    det_box_batch = self.sorted_boxes(boxes_batch)
-    #map the text-region coordinates back to the original image
-    dt_boxes,det_rects = self.box_standardization(det_box_batch,shape_list)
-    return dt_boxes,det_rects
-```
-### Text recognition inference
 ```python
-def __call__(self, batch_img_list):
-    if len(batch_img_list) == 0:
-        return []
-    width_list = []
-    #iterate over the image list (the text ROIs); to support multi-batch inference, batch_size images are concatenated with np.concatenate(batch_norm_imgs)
-    for b in range(len(batch_img_list)):
-        for img in batch_img_list[b]:
-            width_list.append(img.shape[1] / float(img.shape[0]))
-    indices = np.argsort(np.array(width_list))
-    input_batch = self.rec_batch_num
-    batch_outputs_pre = []
-    batch_max_wh_ratio_pre = []
-    for b in range(len(batch_img_list)):
-        im_count = len(batch_img_list[b])
-        batch_outputs = []
-        batch_max_wh_ratio = []
-        for beg_img_no in range(0, im_count, input_batch):
-            end_img_no = min(im_count, beg_img_no + input_batch)
-            # for ino in range(beg_img_no, end_img_no):
-            #     h, w = batch_img_list[b][indices[ino]].shape[0:2]
-            #     wh_ratio = w * 1.0 / h
-            #     max_wh_ratio = max(max_wh_ratio, wh_ratio)
-            batch_norm_imgs = []
-            max_wh_ratio = list()
-            # N batch
-            for ino in range(beg_img_no, end_img_no):
-                #preprocess a single image
-                norm_img = self.preprocess(batch_img_list[b][indices[ino]], max_wh_ratio)
-                norm_img = norm_img[np.newaxis, :].astype(np.float32)
-                batch_norm_imgs.append(norm_img)
-            batch_max_wh_ratio.append(max_wh_ratio)
-            #concatenate batch_size images
-            if self.rec_batch_num >1:
-                norm_img_batch = np.concatenate(batch_norm_imgs)
-                norm_img_batch = norm_img_batch.copy()
-            else:
-                norm_img_batch = np.array([batch_norm_imgs.copy()])
-            if self.offload_copy==False:
-                print("offload copy model")
-                self.d_mem[self.rec_input_name] =migraphx.to_gpu(migraphx.argument(norm_img_batch))
-                results = self.rec_model.run(self.d_mem)
-                output = np.array(results[0])
-            else:
-                results = self.rec_model.run({self.rec_input_name:norm_img_batch})
-                output = results[0]
-            # batch_outputs.append(np.array(output))
-            #append every batch output to batch_outputs for postprocessing
-            [batch_outputs.append(out) for out in np.array(output)]
-        batch_outputs_pre.append(np.array(batch_outputs))
-        batch_max_wh_ratio_pre.append(batch_max_wh_ratio)
-    return batch_outputs_pre ,batch_max_wh_ratio_pre
+def __call__(self, src_img):
+    import time
+    start = time.time()
+    #text detection
+    dt_boxs,dt_rects = self.db_detector(src_img)
+    res_img = self.vis_boxes(dt_boxs,src_img)
+    #crop the text-region images
+    batch_img_list = self.detection_roi_crop(src_img,dt_rects)
+    #text feature extraction
+    batch_outputs_pre ,batch_max_wh_ratio_pre = self.text_extractor(batch_img_list)
+    #text decoding
+    batch_text_list, batch_label_list = self.ctc_decoder(batch_outputs_pre,return_word_box=False,wh_ratio_list = batch_max_wh_ratio_pre)
 ```
 # Ocrv5 API usage
 The API is called in the following steps:
@@ -425,8 +302,8 @@ if __name__ == '__main__':
     rec_onnx_path = "../Resource/Models/ppocrv5_server_rec_infer.onnx"
     image_path = "../Resource/Images/lite_demo.png"
     img = cv2.imread(image_path)
-    ppocrv5 = PPOcrV5(det_onnx_path,rec_onnx_path,offload_copy=True,precision_mode="fp32")
+    ppocrv5 = PPOcrV5(det_onnx_path,rec_onnx_path,offload_copy=True,precision_mode="fp16")
     res_img = ppocrv5(img)
     cv2.imwrite("res.jpg",res_img)
 ```
-The sample supports inference in two precisions (fp32 and fp16, default fp32); the precision and the memory copy mode are controlled via the precision_mode and offload_copy parameters.
+The sample supports inference in two precisions (fp32 and fp16, default fp16); the precision and the memory copy mode are controlled via the precision_mode and offload_copy parameters.
\ No newline at end of file
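Under the hood, precision selection in the Python sample is just an optional quantize pass before compilation; a condensed sketch using the same MIGraphX calls that appear in TextDetector.__init__ below (the helper name is hypothetical):
```python
import migraphx

def load_model(onnx_path, input_name, max_shape, precision_mode="fp16", offload_copy=True):
    model = migraphx.parse_onnx(onnx_path, map_input_dims={input_name: max_shape})
    if precision_mode == "fp16":
        # quantize to fp16 before compiling for the GPU
        migraphx.quantize_fp16(model)
    model.compile(t=migraphx.get_target("gpu"), offload_copy=offload_copy, device_id=0)
    return model
```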
@@ -6,8 +6,6 @@ import pyclipper
 import migraphx
 import os
 from PIL import Image
 def AllocateOutputMemory(model):
     outputData={}
     for key in model.get_outputs().keys():
@@ -15,7 +13,9 @@ def AllocateOutputMemory(model):
     return outputData
 class BaseRecLabelDecode(object):
-    """Convert between text-label and text-index"""
+    """
+    Map from feature space to text space
+    """
     def __init__(self, character_dict_path=None, use_space_char=False):
         self.beg_str = "sos"
         self.end_str = "eos"
@@ -42,7 +42,6 @@ class BaseRecLabelDecode(object):
         for i, char in enumerate(dict_character):
             self.dict[char] = i
         self.character = dict_character
     def pred_reverse(self, pred):
         pred_re = []
         c_current = ""
@@ -84,11 +83,11 @@ class BaseRecLabelDecode(object):
                 and state == "en&num"
                 and c_i + 1 < len(text)
                 and bool(re.search("[0-9]", text[c_i + 1]))
-            ):  # grouping floating number
+            ):
                 c_state = "en&num"
             if (
                 char == "-" and state == "en&num"
-            ):  # grouping word with '-', such as 'state-of-the-art'
+            ):
                 c_state = "en&num"
             if state == None:
@@ -121,20 +120,16 @@ class BaseRecLabelDecode(object):
         is_remove_duplicate=False,
         return_word_box=False,
     ):
-        """convert text-index into text-label."""
         result_list = []
         ignored_tokens = self.get_ignored_tokens()
         batch_size = len(text_index)
-        print(f"Info:{text_index.shape},{text_prob.shape}")
         for batch_idx in range(batch_size):
             selection = np.ones(len(text_index[batch_idx]), dtype=bool)
             if is_remove_duplicate:
                 selection[1:] = text_index[batch_idx][1:] != text_index[batch_idx][:-1]
             for ignored_token in ignored_tokens:
                 selection &= text_index[batch_idx] != ignored_token
-            # print(f"[debug] {len(text_index)},{batch_idx},{selection},{text_index[batch_idx][selection]},{len(self.character)}")
             char_list = [
                 self.character[text_id] for text_id in text_index[batch_idx][selection]
@@ -147,8 +142,8 @@ class BaseRecLabelDecode(object):
             conf_list = [0]
         text = "".join(char_list)
-        if self.reverse:  # for arabic rec
+        if self.reverse:
             text = self.pred_reverse(text)
         if return_word_box:
@@ -173,22 +168,24 @@ class BaseRecLabelDecode(object):
         return result_list
     def get_ignored_tokens(self):
-        return [0]  # for ctc blank
+        return [0]
 class CTCLabelDecode(BaseRecLabelDecode):
-    """Convert between text-label and text-index"""
     def __init__(self, character_dict_path=None, use_space_char=False, **kwargs):
         super(CTCLabelDecode, self).__init__(character_dict_path, use_space_char)
     def __call__(self, preds, label=None, return_word_box=False, *args, **kwargs):
+        """
+        1. Take the maximum probability per channel (PP-OCRv5 predicts over a set of 18385 characters)
+        2. Decode: map the model output from feature space to character space
+        3. Output the strings/characters
+        """
         batch_text_list = []
         batch_label_list = []
         for b in range(len(preds)):
-            print(preds[b].shape)
+            #get the maximum probability and its index
             preds_idx = preds[b].argmax(axis=2)
             preds_prob = preds[b].max(axis=2)
             text = self.decode(
                 preds_idx,
                 preds_prob,
@@ -204,8 +201,8 @@ class CTCLabelDecode(BaseRecLabelDecode):
                 continue
             label = self.decode(label)
             batch_text_list.append(text)
             batch_label_list.append(label)
         return batch_text_list, batch_label_list
@@ -215,14 +212,13 @@ class CTCLabelDecode(BaseRecLabelDecode):
         return dict_character
 class TextRecgnizer(object):
-    """Support SVTR_LCNet """
     def __init__(
         self,
         rec_model_path,
         rec_batch_num=1,
-        rec_input_size=(48, 480),#hw
+        rec_input_size=(48, 480),#(h,w)
         rec_algorithm="SVTR_LCNet",
-        precision_mode = "fp32",
+        precision_mode = "fp16",
         **kwargs
     ):
@@ -244,13 +240,27 @@ class TextRecgnizer(object):
         outputs = self.rec_model.get_outputs()
         if self.offload_copy==False:
             self.d_mem = AllocateOutputMemory(self.rec_model)
+            in_data = np.ones((rec_batch_num,3,self.rec_input_size[0],self.rec_input_size[1]),dtype=np.float32)
+            #warm up once before inference
+            self.d_mem[self.rec_input_name] =migraphx.to_gpu(migraphx.argument(in_data))
+            self.rec_model.run(self.d_mem)
+        else:
+            #warm up once before inference
+            in_data = np.ones((rec_batch_num,3,self.rec_input_size[0],self.rec_input_size[1]),dtype=np.float32)
+            self.rec_model.run({self.rec_input_name:in_data})
         print("Text recognizition model info:")
         print(f"    inputs info:{inputs}")
         print(f"    outputs info:{outputs}")
     def __call__(self, batch_img_list):
+        """
+        1. Preprocess the inputs
+        2. Assemble the batch
+        3. Run inference
+        4. Output the character-feature map
+        """
         if len(batch_img_list) == 0:
             return []
         width_list = []
@@ -258,12 +268,11 @@ class TextRecgnizer(object):
             for img in batch_img_list[b]:
                 width_list.append(img.shape[1] / float(img.shape[0]))
-        indices = np.argsort(np.array(width_list))
+        # indices = np.argsort(np.array(width_list))
         input_batch = self.rec_batch_num
         batch_outputs_pre = []
         batch_max_wh_ratio_pre = []
-        # print(f"Batch size :{input_batch}")
         for b in range(len(batch_img_list)):
             im_count = len(batch_img_list[b])
             batch_outputs = []
@@ -271,17 +280,11 @@ class TextRecgnizer(object):
             for beg_img_no in range(0, im_count, input_batch):
                 end_img_no = min(im_count, beg_img_no + input_batch)
-                # for ino in range(beg_img_no, end_img_no):
-                #     h, w = batch_img_list[b][indices[ino]].shape[0:2]
-                #     wh_ratio = w * 1.0 / h
-                #     max_wh_ratio = max(max_wh_ratio, wh_ratio)
                 batch_norm_imgs = []
                 max_wh_ratio = list()
                 # N batch
                 for ino in range(beg_img_no, end_img_no):
-                    norm_img = self.preprocess(batch_img_list[b][indices[ino]], max_wh_ratio)
+                    norm_img = self.preprocess(batch_img_list[b][ino], max_wh_ratio)
                     norm_img = norm_img[np.newaxis, :].astype(np.float32)
                     batch_norm_imgs.append(norm_img)
@@ -289,17 +292,10 @@ class TextRecgnizer(object):
                 if len(batch_norm_imgs)==0:
                     continue
                 batch_max_wh_ratio.append(max_wh_ratio)
-                # if self.rec_batch_num >1:
-                #     norm_img_batch = np.concatenate(batch_norm_imgs)
-                #     norm_img_batch = norm_img_batch.copy()
-                # else:
-                #     norm_img_batch = np.concatenate(batch_norm_imgs)
-                #     norm_img_batch = norm_img_batch.copy()
                 norm_img_batch = np.concatenate(batch_norm_imgs)
                 norm_img_batch = norm_img_batch.copy()
-                # print(f"batch shape:{norm_img_batch.shape}")
                 if self.offload_copy==False:
                     print("offload copy model")
                     self.d_mem[self.rec_input_name] =migraphx.to_gpu(migraphx.argument(norm_img_batch))
@@ -309,9 +305,6 @@ class TextRecgnizer(object):
                     results = self.rec_model.run({self.rec_input_name:norm_img_batch})
                     output = results[0]
-                # batch_outputs.append(np.array(output))
                 [batch_outputs.append(out) for out in np.array(output)]
             batch_outputs_pre.append(np.array(batch_outputs))
@@ -326,26 +319,19 @@ class TextRecgnizer(object):
         imgH, imgW = self.rec_input_size
         max_h,max_w = self.rec_input_size
         h, w = img.shape[:2]
-        # re_size = (max_w,max_h)
         #resize along the h axis
         if h <= max_h:
             ratio = max_h / h
             w = int(w*ratio)
             if w <= max_w:
                 re_size =(w,max_h)
             else:
                 re_size = (max_w,max_h)
         else:
             ratio = max_h/h
             w,h = int(w*ratio),max_h
             if w <= max_w:
                 re_size = (w,h)
             else:
                 re_size = (max_w,h)
@@ -356,9 +342,7 @@ class TextRecgnizer(object):
         resized_image -= 0.5
         resized_image /= 0.5
         padding_im = np.zeros((3, imgH, imgW), dtype=np.float32)
         padding_im[:, :, 0:re_size[0]] = resized_image
         return padding_im
 class TextDetector(object):
@@ -389,61 +373,46 @@ class TextDetector(object):
         assert score_mode in [
             "slow",
             "fast",
-        ], "Score mode must be in [slow, fast] but got: {}".format(score_mode)
+        ], "Score mode not support: {}".format(score_mode)
         self.dilation_kernel = None if not use_dilation else np.array([[1, 1], [1, 1]])
         self.offload_copy = kwargs.get("offload_copy", True)
         if os.path.exists(det_model_path) and det_model_path.endswith(".onnx"):
             self.det_input_name = "x"
             maxInput={self.det_input_name:[1,3,db_input_size[0],db_input_size[1]]}
             self.db_model = migraphx.parse_onnx(det_model_path,map_input_dims=maxInput)
             inputs = self.db_model.get_inputs()
             outputs = self.db_model.get_outputs()
-            # if self.precision_mode == "int8":
-            #     print("int8 quantization")
-            #     dic = dict()
-            #     image_path = "../Resource/Images/lite_demo.png"
-            #     img = cv2.imread(image_path)
-            #     data = self.preprocess(img)
-            #     print(data["image"].shape)
-            #     print(data["image"].dtype)
-            #     dic[self.det_input_name] = migraphx.argument(data["image"].copy())
-            #     calibration = [dic]
-            #     migraphx.quantize_int8(self.db_model, migraphx.get_target("gpu"), calibration)
             if self.precision_mode == "fp16":
                 migraphx.quantize_fp16(self.db_model)
             self.db_model.compile(t=migraphx.get_target("gpu"),offload_copy=self.offload_copy,device_id=0)
             if self.offload_copy==False:
                 self.d_mem = AllocateOutputMemory(self.db_model)
+                in_data = np.ones((1,3,db_input_size[0],db_input_size[1]),dtype=np.float32)
+                #warm up once before inference
+                self.d_mem[self.det_input_name] =migraphx.to_gpu(migraphx.argument(in_data))
+                self.db_model.run(self.d_mem)
+            else:
+                #warm up once before inference
+                in_data = np.ones((1,3,db_input_size[0],db_input_size[1]),dtype=np.float32)
+                self.db_model.run({self.det_input_name:in_data})
             print("Detection model info:")
             print(f"    inputs info:{inputs}")
             print(f"    outputs info:{outputs}")
     def polygons_from_bitmap(self, pred, _bitmap, ratio_w,ratio_h,dest_width, dest_height):
-        """
-        _bitmap: single map with shape (1, H, W),
-        whose values are binarized as {0, 1}
-        """
         bitmap = _bitmap
         height, width = bitmap.shape
         boxes = []
         scores = []
+        #extract text regions
         contours, _ = cv2.findContours(
             (bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE
         )
@@ -483,11 +452,6 @@ class TextDetector(object):
         return boxes, scores
     def boxes_from_bitmap(self, pred, _bitmap, ratio_w,ratio_h, dest_width, dest_height):
-        """
-        _bitmap: single map with shape (1, H, W),
-        whose values are binarized as {0, 1}
-        """
         bitmap = _bitmap
         height, width = bitmap.shape
@@ -563,9 +527,6 @@ class TextDetector(object):
         return box, min(bounding_box[1])
     def box_score_fast(self, bitmap, _box):
-        """
-        box_score_fast: use bbox mean score as the mean score
-        """
         h, w = bitmap.shape[:2]
         box = _box.copy()
         xmin = np.clip(np.floor(box[:, 0].min()).astype("int32"), 0, w - 1)
@@ -578,11 +539,7 @@ class TextDetector(object):
         box[:, 1] = box[:, 1] - ymin
         cv2.fillPoly(mask, box.reshape(1, -1, 2).astype("int32"), 1)
         return cv2.mean(bitmap[ymin : ymax + 1, xmin : xmax + 1], mask)[0]
     def box_score_slow(self, bitmap, contour):
-        """
-        box_score_slow: use polygon mean score as the mean score
-        """
         h, w = bitmap.shape[:2]
         contour = contour.copy()
         contour = np.reshape(contour, (-1, 2))
@@ -591,9 +548,7 @@ class TextDetector(object):
         xmax = np.clip(np.max(contour[:, 0]), 0, w - 1)
         ymin = np.clip(np.min(contour[:, 1]), 0, h - 1)
         ymax = np.clip(np.max(contour[:, 1]), 0, h - 1)
         mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
         contour[:, 0] = contour[:, 0] - xmin
         contour[:, 1] = contour[:, 1] - ymin
@@ -606,7 +561,6 @@ class TextDetector(object):
         for b in range(len(boxes_batch)):
             src_h, src_w, _, _ = shape_list[b]
             det_boxs = []
-            det_rects = []
             for box in boxes_batch[b]:
                 if isinstance(box,list):
                     box = np.array(box)
@@ -628,25 +582,28 @@ class TextDetector(object):
                 b_h = int(np.linalg.norm(box[0] - box[3]))
                 if b_w <= 3 or b_h <= 3:
                     continue
-                _rect = [int(rect[0][0]),int(rect[0][1]),int(rect[2][0]),int(rect[2][1])]
                 det_boxs.append(rect)
-                det_rects.append(_rect)
             dt_batch_boxs.append(det_boxs)
-            dt_batch_rects.append(det_rects)
-        return dt_batch_boxs,dt_batch_rects
+        return dt_batch_boxs
     def __call__(self, src_img):
+        """
+        1. Preprocess
+        2. Run inference
+        3. Postprocess: output the text-region bounding boxes
+        4. Sort the boxes top-to-bottom, left-to-right
+        5. Map the box coordinates back to the original image
+        """
         data = self.preprocess(src_img)
         if self.offload_copy==False:
-            self.d_mem[self.det_input_name] =migraphx.to_gpu(migraphx.argument())
+            self.d_mem[self.det_input_name] = migraphx.to_gpu(migraphx.argument(data["image"]))
             results = self.db_model.run(self.d_mem)
         else:
             results = self.db_model.run({self.det_input_name:data["image"]})
         if self.offload_copy==False :
             result=migraphx.from_gpu(results[0])
             print("offload copy model")
@@ -682,8 +639,8 @@ class TextDetector(object):
             boxes_batch.append(boxes)
         det_box_batch = self.sorted_boxes(boxes_batch)
-        dt_boxes,det_rects = self.box_standardization(det_box_batch,shape_list)
-        return dt_boxes,det_rects
+        dt_boxes = self.box_standardization(det_box_batch,shape_list)
+        return dt_boxes
     def preprocess(self, src_img,
                    mean: list = [0.485, 0.456, 0.406],
@@ -729,36 +686,39 @@ class TextDetector(object):
         im_pad = np.zeros((self.db_input_size[1], self.db_input_size[0], 3), np.float32)
         im_pad[:resize_h, :resize_w, :] = img
         return im_pad, [ratio_h, ratio_w]
     def sorted_boxes(self,dt_boxes):
-        """
-        Sort text boxes in order from top to bottom, left to right
-        args:
-            dt_boxes(array): detected text boxes with shape [4, 2]
-        return:
-            sorted boxes(array) with shape [4, 2]
-        """
-        batch_boxes = list()
-        # print(dt_boxes)
-        for b in range(len(dt_boxes)):
-            num_boxes = dt_boxes[b].shape[0]
-            batch_sorted_boxes = sorted(dt_boxes[b], key=lambda x: (x[0][1], x[0][0]))
-            _boxes = list(batch_sorted_boxes)
-            for i in range(num_boxes - 1):
-                for j in range(i, -1, -1):
-                    if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and (
-                        _boxes[j + 1][0][0] < _boxes[j][0][0]
-                    ):
-                        tmp = _boxes[j]
-                        _boxes[j] = _boxes[j + 1]
-                        _boxes[j + 1] = tmp
-                    else:
-                        break
-            batch_boxes.append(_boxes)
-        return batch_boxes
+        dt_boxes = dt_boxes[0]
+        boxes_np = np.array(dt_boxes, dtype=np.int32)
+        batch_boxes = list()
+        # compute each box's reference point (top-left corner) and geometric features
+        top_left = boxes_np[:, 0, :]
+        widths = boxes_np[:, 1, 0] - boxes_np[:, 0, 0]
+        heights = boxes_np[:, 2, 1] - boxes_np[:, 0, 1]
+        avg_height = np.median(heights)
+        # sort primarily by the y coordinate, secondarily by x
+        sorted_indices = np.lexsort((top_left[:, 0], top_left[:, 1]))
+        # row grouping: boxes with similar y coordinates are treated as one row
+        final_order = []
+        original_indices = []
+        current_row = [(0, sorted_indices[0])]  # (x_coord, original_idx)
+        for idx in sorted_indices[1:]:
+            # if the y gap to the previous box is less than 0.6x the row height, treat it as the same row
+            if abs(top_left[idx,1] - top_left[current_row[-1][1],1]) < avg_height * 0.6:
+                current_row.append((top_left[idx,0], idx))
+            else:
+                # sort the current row by x
+                current_row_sorted = sorted(current_row, key=lambda x: x[0])
+                final_order.extend([x[1] for x in current_row_sorted])
+                current_row = [(top_left[idx,0], idx)]
+        # append the last row
+        current_row_sorted = sorted(current_row, key=lambda x: x[0])
+        final_order.extend([x[1] for x in current_row_sorted])
+        batch_boxes.append(boxes_np[final_order])
+        # return the sorted boxes
+        return batch_boxes
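The rewritten sorted_boxes groups boxes into rows using 0.6x the median box height instead of the old fixed 10-pixel tolerance, which is more robust across font sizes. A quick check on synthetic boxes (coordinates hypothetical, `detector` an instantiated TextDetector):
```python
import numpy as np

# Three boxes: two on one visual row (y near 10), one below (y near 60),
# each as [top-left, top-right, bottom-right, bottom-left].
boxes = [[
    [[200, 12], [300, 12], [300, 42], [200, 42]],
    [[10, 10], [110, 10], [110, 40], [10, 40]],
    [[10, 60], [110, 60], [110, 90], [10, 90]],
]]
sorted_batch = detector.sorted_boxes(boxes)
print([box[0].tolist() for box in sorted_batch[0]])
# expected top-left corners, reading order: [10, 10], [200, 12], [10, 60]
```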
@@ -771,16 +731,16 @@ class PPOcrV5():
                  rec_input_size :list = (48,720),
                  seg_thresh:float=0.3,
                  box_thresh:float=0.7,
-                 precision_mode:str='fp32',
+                 precision_mode:str='fp16',
                  offload_copy:bool=True,
                  **kwargs
                  ):
         """
-        det_model_path: detection model path
-        rec_model_path: recognition model path
-        seg_thresh: dbnet segmentation threshold
-        box_thresh: box threshold
-        db_input_size: dbnet input size
+        det_model_path: path of the text detection model
+        rec_model_path: path of the text recognition model
+        seg_thresh: dbnet pixel segmentation threshold
+        box_thresh: text bounding-box threshold
+        db_input_size: model input size
         """
         self.seg_thres = seg_thresh
         self.box_thresh = box_thresh
@@ -837,19 +797,18 @@ class PPOcrV5():
     def __call__(self, src_img):
         import time
         start = time.time()
-        dt_boxs,dt_rects = self.db_detector(src_img)
-        res_img = self.vis_boxes(dt_boxs,src_img)
-        batch_img_list = self.detection_roi_crop(src_img,dt_rects)
+        dt_boxs = self.db_detector(src_img)
+        batch_img_list = self.detection_roi_crop(src_img,dt_boxs)
         batch_outputs_pre ,batch_max_wh_ratio_pre = self.text_extractor(batch_img_list)
         batch_text_list, batch_label_list = self.ctc_decoder(batch_outputs_pre,return_word_box=False,wh_ratio_list = batch_max_wh_ratio_pre)
         end = time.time()
         batch_text_out = []
         batch_boxes_out = []
         for b in range(len(dt_boxs)):
             text_out = []
             boxex_out = []
-            print("text box num:",len(dt_boxs[b]))
             for box, rec_result in zip(dt_boxs[b], batch_text_list[b]):
                 text, score = rec_result[0], rec_result[1]
                 if score >= 0.5:
@@ -859,37 +818,31 @@ class PPOcrV5():
             batch_text_out.append(text_out)
             batch_boxes_out.append(boxex_out)
         for b in range(len(batch_text_out)):
             for text, score in batch_text_out[b]:
                 print("{}, {:.3f}".format(text, score))
-        # res_img = self.vis_oct_text(batch_text_out,dt_rects,res_img)
-        print(f"[Time info] elapsed:{end-start:.4f}")
+        res_img = self.vis_boxes(batch_boxes_out,src_img)
+        res_img = self.vis_oct_text(batch_text_out,batch_boxes_out,res_img)
+        print(f"[Time info] elapsed:{(end-start)*1000:.4f} ms")
         return res_img
-    def detection_roi_crop(self,src_img,rects):
+    def detection_roi_crop(self,src_img,boxes):
         batch_cut_imgs = list()
-        for b in range(len(rects)):
+        for b in range(len(boxes)):
             crop_imgs = list()
-            for rect in rects[b]:
-                x_min,y_min,x_max,y_max = rect
-                rect_w ,rect_h = x_max-x_min,y_max-y_min
-                # if rect_w<3 or rect_h<3:
-                #     continue
-                # print(x_min,y_min,x_max,y_max)
-                crop_img = src_img[y_min:y_max, x_min:x_max,:]
+            for tl,tr,br,bl in boxes[b]:
+                box = [int(tl[0]),int(tl[1]),int(br[0]),int(br[1])]
+                crop_img = src_img[box[1]:box[3], box[0]:box[2],:]
                 crop_imgs.append(crop_img)
             batch_cut_imgs.append(crop_imgs)
         return batch_cut_imgs
-    def vis_oct_text(self,batch_text,batch_rect,src_img,fornt_path="../Resource/fonts/simfang.ttf"):
+    def vis_oct_text(self,batch_text,batch_boxes,src_img,fornt_path="../Resource/fonts/simfang.ttf"):
         from PIL import Image, ImageDraw, ImageFont
         img = np.zeros(src_img.shape, dtype=np.uint8)
-        img.fill(114)
+        img.fill(255)
         pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
         draw = ImageDraw.Draw(pil_img)
@@ -897,33 +850,31 @@ class PPOcrV5():
         for b in range(len(batch_text)):
             for id,text in enumerate(batch_text[b]):
                 text,conf = text
-                f_start = batch_rect[b][id][0:2]
-                w,h = np.array(batch_rect[b][id][2:]) - np.array(batch_rect[b][id][0:2])
+                f_start = (batch_boxes[b][id][0][0],batch_boxes[b][id][0][1])
+                f_end = (batch_boxes[b][id][2][0],batch_boxes[b][id][2][1])
+                w,h = np.array(f_end) - np.array(f_start)
                 font_size = int(h*0.9)
                 font = ImageFont.truetype(fornt_path, font_size,encoding="utf-8")
                 draw.text(f_start, text, font=font, fill=(0, 255, 0))
         res_img = np.concatenate([src_img, np.array(pil_img)], axis=1)
         return res_img
-    def vis_boxes(self,boxes, img, colors=(255,0,0), thickness=2):
+    def vis_boxes(self,boxes, img, colors=(0,255,0), thickness=2):
         for b in range(len(boxes)):
             for tl,tr,br,bl in boxes[b]:
                 box = [int(tl[0]),int(tl[1]),int(br[0]),int(br[1])]
                 cv2.rectangle(img, (box[0],box[1]), (box[2],box[3]), colors, thickness)
         return img
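vis_oct_text renders the recognized strings onto a white canvas with Pillow (OpenCV alone cannot draw CJK glyphs) and concatenates it beside the source image. The same idea as a standalone sketch; the helper name, font path, and box format are illustrative:
```python
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont

def overlay_text(src_img, texts_with_boxes, font_path="../Resource/fonts/simfang.ttf"):
    """texts_with_boxes: list of (text, (x0, y0, x1, y1)) in source coordinates."""
    canvas = np.full(src_img.shape, 255, dtype=np.uint8)  # white canvas
    pil_img = Image.fromarray(cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(pil_img)
    for text, (x0, y0, x1, y1) in texts_with_boxes:
        font_size = max(int((y1 - y0) * 0.9), 1)  # scale font to the box height
        font = ImageFont.truetype(font_path, font_size, encoding="utf-8")
        draw.text((x0, y0), text, font=font, fill=(0, 255, 0))
    # show the source image and the rendered text side by side
    return np.concatenate([src_img, cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)], axis=1)
```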
 if __name__ == '__main__':
     det_onnx_path = "../Resource/Models/ppocrv5_server_det_infer.onnx"
     rec_onnx_path = "../Resource/Models/ppocrv5_server_rec_infer.onnx"
-    image_path = "../Resource/Images/lite_demo.png"
+    image_path = "../Resource/Images/demo.png"
     img = cv2.imread(image_path)
-    ppocrv5 = PPOcrV5(det_onnx_path,rec_onnx_path,offload_copy=True,precision_mode="fp32")
+    ppocrv5 = PPOcrV5(det_onnx_path,rec_onnx_path,offload_copy=True,precision_mode="fp16")
     res_img = ppocrv5(img)
     cv2.imwrite("res.jpg",res_img)
\ No newline at end of file
@@ -81,6 +81,7 @@ pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
 ```
 python inference.py
 ```
+The result image is saved in the current directory as res.jpg
 For the offload_copy and precision_mode settings, see [Tutorial_Python.md](Doc/Tutorial_Python.md); an example is given in main.
 ### C++ inference
@@ -104,7 +105,6 @@ cd <path_to_ppocrv5_migraphx>
 sh ./3rdParty/InstallOpenCVDependences.sh
 ```
 #### Install OpenCV and build the project
 ```
@@ -116,30 +116,9 @@ rbuild build -d depend
 - Enter the opencv-3.4.11_mini directory, create a build directory, and cd build
 - Run the following command:
 ```
 cmake -D CMAKE_BUILD_TYPE=RELEASE -D CMAKE_INSTALL_PREFIX=./opencv_dep -D INSTALL_C_EXAMPLES=ON -D INSTALL_PYTHON_EXAMPLES=ON -D OPENCV_GENERATE_PKGCONFIG=ON -D BUILD_EXAMPLES=ON -D OPENCV_EXTRA_MODULES_PATH=../modules/ ..
 ```
 - Run make -j8 && make install; the built headers and libraries are placed in opencv_dep. Copy the opencv_dep directory into 3rdParty and rename it opencv
-#### Set environment variables
-Add the dependency libraries to LD_LIBRARY_PATH by appending the following to ~/.bashrc:
-On Ubuntu:
-```
-export LD_LIBRARY_PATH=<path_to_ppocrv5_migraphx>/depend/lib/:$LD_LIBRARY_PATH
-```
-On CentOS:
-```
-export LD_LIBRARY_PATH=<path_to_ppocrv5_migraphx>/depend/lib64/:$LD_LIBRARY_PATH
-```
-Then run:
-```
-source ~/.bashrc
-```
 #### Run the example
@@ -155,58 +134,112 @@ cmake .. && make
 #run
 ./ppOcrV5
 ```
+The result image is saved in the current directory as res.jpg
 ## result
 ### Python version
-In the output, each value is the actual probability of the corresponding label
+The output lists the recognized text; each entry is followed by a confidence score, and a higher score means a more reliable result
 ```
-产品信息/参数, 0.954
-发足够的滋养, 1.000
-纯臻宫乔护发素, 0.883
-花费了'0'.'4''5''7''3''3''5'秒, 0.993
-【净含量】:'2''2''0'ml, 0.993
-每瓶'2''2'元,'1''0''0''0'瓶起订), 0.998
-【品名】:纯臻营养护发素, 0.998
-【品牌】:代加工方式/'0'EMODM, 0.968
-糖、椰油酰胺丙基甜菜碱、泛醒, 0.997
-【适用人群】:适合所有肤质, 0.998
-【产品编号】:YM'-'X'-''3''0''1''1''0'.'9''6''8''9''9', 0.993
-('4''5'元/每公斤,'1''0''0'公斤起订), 0.972
-【主要成分】:鲸蜡硬脂醇、燕麦B'-'葡聚, 0.966
-【主要功能】:可紧致头发磷层,从而达到, 0.994
-即时持久改善头发光泽的效果,给干燥的头, 0.997
-The detectionvisualizedimagsavedin./vis.jpg, 0.940
-[Time info] elapsed:3.5736
+'0', 0.991
+纯臻营养护发素, 1.000
+'0'.'9''9''3''6''0''4', 0.999
+'1', 0.998
+产品信息/参数, 0.934
+'0'.'9''9''2''7''2''8', 0.999
+'2', 0.999
+('4''5'元/每公斤,'1''0''0'公斤起订), 0.970
+'0'.'9''7''4''1''7', 0.999
+'3', 0.999
+每瓶'2''2'元,'1''0''0''0'瓶起订), 0.998
+'0'.'9''9''3''9''7''6', 0.999
+'4', 0.998
+【品牌】:代加工方式/'0'EMODM, 0.959
+'0'.'9''8''5''1''3''3', 0.998
+'5', 0.998
+【品名】:纯臻营养护发素, 0.997
+'0'.'9''9''5''0''0''7', 0.999
+'6', 0.995
+【产品编号】:YM'-'X'-''3''0''1''1''0'.'9''6''8''9''9', 0.973
+'7', 0.999
+【净含量】:'2''2''0'ml, 0.994
+'0'.'9''9''6''5''7''7', 0.999
+'8', 0.998
+【适用人群】:适合所有肤质, 0.997
+'0'.'9''9''5''8''4''2', 0.999
+'9', 0.997
+【主要成分】:鲸蜡硬脂醇、燕麦B'-'葡聚, 0.976
+'0'.'9''6''1''9''2''8', 0.999
+'1''0', 1.000
+糖、椰油酰胺丙基甜菜碱、泛醒, 0.996
+'0'.'9''2''5''8''9''8', 0.999
+'1''1', 0.999
+(成品包材), 0.998
+'0'.'9''7''2''5''7''3', 0.999
+'1''2', 1.000
+【主要功能】:可紧致头发磷层,从而达到, 0.992
+'0'.'9''9''4''4''4''8', 0.999
+'1''3', 0.999
+即时持久改善头发光泽的效果,给干燥的头, 0.989
+'0'.'9''9''0''1''9''8', 0.999
+'1''4', 0.999
+发足够的滋养, 0.999
+'0'.'9''9''7''6''6''8', 0.999
+花费了'0'.'4''5''7''3''3''5'秒, 0.993
+[Time info] elapsed:578.6152 ms
 ```
### C++ version
```
ocr res :花费了'0'.'4''5''7''3''3''5'秒 0.984009
ocr res :'0'.'9''9''7' 0.773633
ocr res :发足够的滋养 0.96818
ocr res :'1' 0.697754
ocr res :'0''0'.'9''9''0''1''9' 0.656647
ocr res :即时持久改善头发光泽的效果,给干燥的头 0.996608
ocr res : 0
ocr res :【主要功能】:可紧致头发磷层,从而达到 0.993421
ocr res :'0'.'9''9''4''4' 0.677327
ocr res : 0
ocr res :'0'.'9''7''2' 0.637158
ocr res :(成品包材) 0.901937
ocr res :'1' 0.32251
ocr res :糖、椰油酰胺丙基甜菜碱、泛醒 0.993478
ocr res :'0'.'9''2''5' 0.586279
ocr res :'1''0' 0.547241
ocr res :【主要成分】:鲸蜡硬脂醇、燕麦B'-'葡聚 0.975303
ocr res :'0'.'9''1''9' 0.568408
ocr res : 0
ocr res :'0'.'9''9''5''2' 0.613647
ocr res :【适用人群】:适合所有肤质 0.996882
ocr res :'8' 0.378906
ocr res :'0'.'9''9' 0.595581
ocr res :【净含量】:'2''2''0'ml 0.835671
ocr res :'7' 0.356689
ocr res :【产品编号】:YM'-'X'-''3''0''1''1''0'.'9''6''8''9''9' 0.993695
ocr res :'6' 0.214355
ocr res :'0'.'9''9''5' 0.478052
ocr res :【品名】:纯臻营养护发素 0.996175
ocr res :'5' 0.594727
ocr res : 0
ocr res :'0'.'9''8''5' 0.55166
ocr res :【品牌】:代加工方式/'0'EMODM 0.917768
ocr res :每瓶'2''2'元,'1''0''0''0'瓶起订) 0.974644
ocr res :'0'.'9''9''3''9''7''6' 0.736755
ocr res :'3' 0.486572
ocr res :('4''5'元/每公斤,'1''0''0'公斤起订) 0.940028
ocr res :'0'.'9'm'7' 0.534668
ocr res :'2' 0.961426
ocr res : 0
ocr res :'0'.'9''9''2' 0.524121
ocr res :产品信息/参数 0.913853
ocr res :纯臻营养护发素'0'.'9''9''3''6''0''4' 0.964128
ocr res :'0' 0.380127
ocr res :The detection visualized imagesavedin./vis.jpg 0.94302
[Time info] elapsed: 389 ms
```
### Accuracy
......
#include "cv_put_Text.hpp"
PutText::PutText(const char* font_path) {
// 初始化 FreeType
if (FT_Init_FreeType(&ft)) {
std::cerr << "Error: Could not init FreeType !" << std::endl;
return;
}
// 加载字体文件( 这里使用 SimHei.ttf 字体文件)
if (FT_New_Face(ft, font_path, 0, &face)) {
std::cerr << "Error: Load front failed!" << std::endl;
exit(-1);
}
}
PutText::~PutText() {
    // Release FreeType resources
    FT_Done_Face(face);
    FT_Done_FreeType(ft);
}
void PutText::putText(cv::Mat& img, const std::string& text, int x, int y, int fontSize, cv::Scalar color) {
    if(img.empty())
    {
        std::cerr << "Empty image!";
        return ;
    }
    // Set the glyph size in pixels
    FT_Set_Pixel_Sizes(face, 0, fontSize);
    int start_point_x = x;
    int start_point_y = y + fontSize; // shift down to the text baseline
    // Process the string one UTF-8 character at a time
    for (size_t i = 0; i < text.size(); ) {
        // Decode the next UTF-8 sequence into a Unicode code point
        unsigned long unicode = 0;
        if ((text[i] & 0x80) == 0) {            // 1-byte sequence (ASCII)
            unicode = text[i];
            i += 1;
        } else if ((text[i] & 0xE0) == 0xC0) {  // 2-byte sequence
            unicode = ((text[i] & 0x1F) << 6) | (text[i + 1] & 0x3F);
            i += 2;
        } else if ((text[i] & 0xF0) == 0xE0) {  // 3-byte sequence (covers CJK)
            unicode = ((text[i] & 0x0F) << 12) | ((text[i + 1] & 0x3F) << 6) | (text[i + 2] & 0x3F);
            i += 3;
        } else {
            i++; // invalid or unsupported (e.g. 4-byte) UTF-8; skip this byte
            continue;
        }
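        // Worked example: "中" is the bytes 0xE4 0xB8 0xAD, so the 3-byte branch
        // above yields unicode = (0x04 << 12) | (0x38 << 6) | 0x2D = 0x4E2D (U+4E2D).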
        // Render the glyph for this code point
        if (FT_Load_Char(face, unicode, FT_LOAD_RENDER)) {
            std::cerr << "Error: Could not load glyph" << std::endl;
            continue;
        }
        // Alpha-blend the glyph bitmap onto the OpenCV image
        FT_Bitmap& bitmap = face->glyph->bitmap;
        for (int row = 0; row < bitmap.rows; ++row) {
            for (int col = 0; col < bitmap.width; ++col) {
                // index with pitch (bytes per bitmap row), which may differ from width
                unsigned char intensity = bitmap.buffer[row * bitmap.pitch + col];
                if (intensity > 0) {
                    int px = start_point_x + face->glyph->bitmap_left + col;
                    int py = start_point_y - face->glyph->bitmap_top + row;
                    if (px < 0 || py < 0 || px >= img.cols || py >= img.rows)
                        continue; // clip glyph pixels that fall outside the image
                    cv::Vec3b& pixel = img.at<cv::Vec3b>(py, px);
                    pixel[0] = color[0] * (intensity / 255.0) + pixel[0] * (1 - intensity / 255.0);
                    pixel[1] = color[1] * (intensity / 255.0) + pixel[1] * (1 - intensity / 255.0);
                    pixel[2] = color[2] * (intensity / 255.0) + pixel[2] * (1 - intensity / 255.0);
                }
            }
        }
        start_point_x += face->glyph->advance.x >> 6; // advance is in 1/64-pixel units
    }
}
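A minimal usage sketch for the class above; the font and image paths here are placeholders for illustration:
```
#include <opencv2/opencv.hpp>
#include "cv_put_Text.hpp"

int main() {
    cv::Mat img = cv::imread("demo.png");        // any 8-bit BGR image
    PutText ft2("../Resource/fonts/SimHei.ttf"); // a TTF containing CJK glyphs
    // Draw mixed Chinese/ASCII text at (20, 40) with 24 px glyphs, in red
    ft2.putText(img, "纯臻营养护发素 OCR demo", 20, 40, 24, cv::Scalar(0, 0, 255));
    cv::imwrite("text_overlay.jpg", img);
    return 0;
}
```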
#pragma once
#include <ft2build.h>
#include FT_FREETYPE_H
#include <opencv2/opencv.hpp>
class PutText {
private:
FT_Library ft;
FT_Face face;
public:
PutText(const char* font_path);
~PutText();
    /**
     * @brief Draw text onto an image (supports Chinese)
     * @param img image to draw on
     * @param text text to overlay
     * @param x x coordinate (pixels) of the text origin
     * @param y y coordinate (pixels) of the text origin
     * @param fontSize glyph size in pixels
     * @param color text color (BGR), green by default
     *
     * @return none
     */
void putText(cv::Mat& img, const std::string& text, int x, int y, int fontSize=2, cv::Scalar color=cv::Scalar(0, 255, 0));
};
#include "ocr_engine.hpp" #include "ocr_engine.hpp"
using namespace ppocr; using namespace ppocr;
int main(int argc, char** argv) int main(int argc, char** argv){
{
std::string det_model_onnx = "../Resource/Models/ppocrv5_server_det_infer.onnx"; std::string det_model_onnx = "../Resource/Models/ppocrv5_server_det_infer.onnx";
std::string rec_model_onnx = "../Resource/Models/ppocrv5_server_rec_infer.onnx"; std::string rec_model_onnx = "../Resource/Models/ppocrv5_server_rec_infer.onnx";
std::string img_path = "../Resource/Images/20250703205038.png"; std::string img_path = "../Resource/Images/demo.png";
std::string character_dict_path = "../Resource/ppocr_keys_v5.txt"; std::string character_dict_path = "../Resource/ppocr_keys_v5.txt";
std::string front = "../Resource/fonts/SimHei.ttf";
float segm_thres=0.3; float segm_thres=0.3;
float box_thresh=0.3; float box_thresh=0.3;
ppOcrEngine ocr_engine(det_model_onnx, ppOcrEngine ocr_engine(det_model_onnx,
rec_model_onnx, rec_model_onnx,
character_dict_path, character_dict_path,
front,
segm_thres, segm_thres,
box_thresh, box_thresh,
true, true,
"fp32"); "fp16");
cv::Mat img=cv::imread(img_path); cv::Mat img=cv::imread(img_path);
ocr_engine.forward(img); ocr_engine.forward(img);
return 0; return 0;
} }
\ No newline at end of file
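The last two constructor arguments trade convenience for control: with offload_copy=true, MIGraphX copies inputs and outputs between host and device on every eval, while false uses the engine's pre-allocated HIP buffers with explicit hipMemcpy calls. A sketch of the alternative configuration, reusing the variables defined in main above:
```
    // Explicit-copy, float32 variant of the construction above (illustrative).
    ppOcrEngine ocr_engine_fp32(det_model_onnx,
                                rec_model_onnx,
                                character_dict_path,
                                font_path,
                                segm_thres,
                                box_thresh,
                                false,   // offload_copy: engine manages device buffers itself
                                "fp32"); // precision_mode: no fp16 quantization
```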
@@ -53,15 +53,12 @@ bool XsortFp32(std::vector<float> a, std::vector<float> b) {
        return a[0] < b[0];
    return false;
}

namespace ppocr{
    OcrDet::OcrDet(const std::string det_model_path,
                   std::string precision_mode,
                   bool offload_copy,
                   float segm_thres,
                   float box_thresh ){
        if(!Exists(det_model_path))
        {
            LOG_ERROR(stdout, "onnx file not exists!\n");
@@ -119,6 +116,9 @@ namespace ppocr
        options.offload_copy = offload_copy;
        migraphx::target gpuTarget = migraphx::gpu::target{};
        net.compile(gpuTarget, options);
        // Warm-up input filled with 1.0f (memset writes bytes, not float values)
        float *warm_data = (float*)malloc(this->input_shape.bytes());
        for (size_t k = 0; k < this->input_shape.elements(); ++k) warm_data[k] = 1.0f;
        if( this->offload_copy == false )
        {
            hipMalloc(&input_buffer_device, this->input_shape.bytes());
@@ -127,14 +127,23 @@ namespace ppocr
            dev_argument[input_name] = migraphx::argument{input_shape, input_buffer_device};
            dev_argument[output_name] = migraphx::argument{output_shape, output_buffer_device};
            hipMemcpy(input_buffer_device,
                      (void*)warm_data,
                      this->input_shape.bytes(),
                      hipMemcpyHostToDevice);
            // Warm up: one dummy inference so later runs exclude one-time initialization cost
            std::vector<migraphx::argument> results = net.eval(dev_argument);
        }else{
            std::unordered_map<std::string, migraphx::argument> inputData;
            inputData[input_name] = migraphx::argument{input_shape, (float *)warm_data};
            // Warm up
            std::vector<migraphx::argument> results = net.eval(inputData);
        }
        free(warm_data);
    }
    OcrDet::~OcrDet(){
        if(data)
        {
            free(data);
@@ -142,6 +151,7 @@ namespace ppocr
        }
        if( offload_copy == false )
        {
            // Release device buffers
            if(input_buffer_device)
            {
                hipFree(input_buffer_device);
@@ -158,8 +168,7 @@ namespace ppocr
        }
    }
    cv::Size OcrDet::preproc(cv::Mat img,float* data){
        float scale = 1.0/255.0;
        std::vector<float> s_mean={0.485, 0.456, 0.406};
        std::vector<float> s_stdv={0.229, 0.224, 0.225};
@@ -189,8 +198,7 @@ namespace ppocr
        return scale_r;
    }
    std::vector<std::vector<float>> OcrDet::get_mini_boxes(cv::RotatedRect box,float &ssid) {
        ssid = max(box.size.width, box.size.height);
        cv::Mat points;
        cv::boxPoints(box, points);
@@ -252,7 +260,6 @@ namespace ppocr
            auto array = get_mini_boxes(box, ssid);
            auto box_for_unclip = array;

            if (ssid < min_size) {
                continue;
@@ -260,20 +267,19 @@ namespace ppocr
            float score;
            if (use_polygon_score)
                // use the mean score over the polygon region as the box score
                score = polygon_score_acc(contours[_i], pred);
            else
                score = box_score_fast(array, pred);

            if (score < box_thresh)
                continue;

            // unclip the shrunk polygon to recover the full text boundary
            cv::RotatedRect points = unClip(box_for_unclip, det_db_unclip_ratio);
            if (points.size.height < 1.001 && points.size.width < 1.001) {
                continue;
            }

            cv::RotatedRect clipbox = points;
            auto cliparray = get_mini_boxes(clipbox, ssid);
@@ -286,22 +292,21 @@ namespace ppocr
            std::vector<std::vector<int>> intcliparray;
            for (int num_pt = 0; num_pt < 4; num_pt++) {
                std::vector<int> a{int(clampf(roundf(cliparray[num_pt][0] / float(width) *
                                                     float(dest_width)),
                                              0, float(dest_width))),
                                   int(clampf(roundf(cliparray[num_pt][1] /
                                                     float(height) * float(dest_height)),
                                              0, float(dest_height)))};
                intcliparray.push_back(a);
            }
            boxes.push_back(intcliparray);
        }
        return boxes;
    }
    std::vector<std::vector<float>> OcrDet::Mat2Vector(cv::Mat mat){
        std::vector<std::vector<float>> img_vec;
        std::vector<float> tmp;
@@ -316,8 +321,7 @@ namespace ppocr
    }

    float OcrDet::polygon_score_acc(std::vector<cv::Point> contour,
                                    cv::Mat pred){
        int width = pred.cols;
        int height = pred.rows;
        std::vector<float> box_x;
@@ -364,8 +368,7 @@ namespace ppocr
    }

    float OcrDet::box_score_fast(std::vector<std::vector<float>> box_array,
                                 cv::Mat pred) {
        auto array = box_array;
        int width = pred.cols;
        int height = pred.rows;
@@ -402,8 +405,7 @@ namespace ppocr
        return score;
    }
    cv::RotatedRect OcrDet::unClip(std::vector<std::vector<float>> box,
                                   const float &unclip_ratio){
        float distance = 1.0;
        get_contour_area(box, unclip_ratio, distance);
        ClipperLib::ClipperOffset offset;
@@ -433,8 +435,7 @@ namespace ppocr
    }

    void OcrDet::get_contour_area(const std::vector<std::vector<float>> &box,
                                  float unclip_ratio, float &distance) {
        int pts_num = 4;
        float area = 0.0f;
        float dist = 0.0f;
@@ -452,8 +453,7 @@ namespace ppocr
    std::vector<std::vector<std::vector<int>>>
    OcrDet::filter_det_res(std::vector<std::vector<std::vector<int>>> boxes,
                           float ratio_h, float ratio_w, cv::Mat srcimg){
        int oriimg_h = srcimg.rows;
        int oriimg_w = srcimg.cols;
@@ -482,8 +482,7 @@ namespace ppocr
        return root_points;
    }

    std::vector<std::vector<int>> OcrDet::order_points_clockwise(std::vector<std::vector<int>> pts){
        std::vector<std::vector<int>> box = pts;
        std::sort(box.begin(), box.end(), XsortInt);
        std::vector<std::vector<int>> leftmost = {box[0], box[1]};
@@ -500,31 +499,8 @@ namespace ppocr
        return rect;
    }
    bool OcrDet::text_recognition(const cv::Mat &srcimg,
                                  const std::vector<std::vector<std::vector<int>>> &boxes){
        if(boxes.size() == 0)
        {
            std::cout<<"Not found text roi !\n";
@@ -540,15 +516,11 @@ namespace ppocr
            rect.width = boxes[n][2][0] - boxes[n][0][0];
            rect.height = boxes[n][2][1] - boxes[n][0][1];
            text_mat = srcimg(rect).clone();
        }
        return true;
    }
    int OcrDet::postprocess(float* feature, std::vector<std::vector<std::vector<int>>> &boxes){
        int batch_s = 1;
        float conf_thres = 0.6;
        cv::Mat thres_mat = cv::Mat(cv::Size(output_height,output_width), CV_8UC1);
@@ -574,8 +546,7 @@ namespace ppocr
        return 0;
    }
    bool OcrDet::forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes){
        std::vector<std::vector<std::vector<int>>> boxes;
        cv::Size ratio = preproc(img,data);
@@ -608,8 +579,7 @@ namespace ppocr
        float ratio_h = float(net_input_height) / float(img.rows);
        text_roi_boxes = filter_det_res(boxes, ratio_h, ratio_w, img);
        // visualize_boxes(img,text_roi_boxes);
        return true;
    }
@@ -620,9 +590,7 @@ namespace ppocr
                         int channel,
                         int batch_size,
                         bool offload_copy,
                         std::string character_dict_path){
        if(!Exists(rec_model_path))
        {
            LOG_ERROR(stdout, "onnx file not exists!\n");
@@ -633,7 +601,6 @@ namespace ppocr
        this->net_input_height=image_height;
        this->net_input_channel=channel;
        this->precision_mode = precision_mode;
        migraphx::onnx_options onnx_options;
        onnx_options.map_input_dims["x"] = {1, 3, 48, 720};
@@ -663,8 +630,6 @@ namespace ppocr
        this->feature_size = output_shape.lens()[2];
        n_channel = this->output_shape.lens()[1];
        this->offload_copy = offload_copy;
        migraphx::compile_options options;
@@ -673,23 +638,37 @@ namespace ppocr
        migraphx::target gpuTarget = migraphx::gpu::target{};
        net.compile(gpuTarget, options);
        // Warm-up input filled with 1.0f (memset writes bytes, not float values)
        float *warm_data = (float*)malloc(this->input_shape.bytes());
        for (size_t k = 0; k < this->input_shape.elements(); ++k) warm_data[k] = 1.0f;
        if( this->offload_copy == false )
        {
            hipMalloc(&input_buffer_device, this->input_shape.bytes());
            hipMalloc(&output_buffer_device, this->output_shape.bytes());
            output_buffer_host = (void*)malloc(this->output_shape.bytes());
            dev_argument[input_name] = migraphx::argument{input_shape, input_buffer_device};
            dev_argument[output_name] = migraphx::argument{output_shape, output_buffer_device};
            hipMemcpy(input_buffer_device,
                      (void*)warm_data,
                      this->input_shape.bytes(),
                      hipMemcpyHostToDevice);
            // Warm up
            std::vector<migraphx::argument> results = net.eval(dev_argument);
        }else{
            std::unordered_map<std::string, migraphx::argument> inputData;
            inputData[input_name] = migraphx::argument{input_shape, (float *)warm_data};
            // Warm up
            std::vector<migraphx::argument> results = net.eval(inputData);
        }
        free(warm_data);
        std::ifstream infile;
        infile.open(character_dict_path,std::ios::in);
        assert(infile.is_open());
        std::string k_work="";
        k_words.clear();
        // Read the character dictionary, one entry per line
        while (std::getline(infile,k_work))
        {
            k_words.push_back(k_work);
@@ -697,8 +676,7 @@ namespace ppocr
        system("chcp 65001");
    }
    CTCDecode::~CTCDecode(){
        if(data)
        {
            free(data);
@@ -723,8 +701,7 @@ namespace ppocr
        }
    }

    bool CTCDecode::preproc(cv::Mat img,float* data,int img_w,int img_h){
        if (img.empty())
        {
            std::cout<<"WARNING image is empty!\n";
@@ -754,25 +731,21 @@ namespace ppocr
                    data[i*img_w+j] = (template_mat.at<cv::Vec3b>(i, j)[2]*scale-0.5)/0.5;
                    data[i*img_w+j+img_h*img_w] = (template_mat.at<cv::Vec3b>(i, j)[1]*scale-0.5)/0.5;
                    data[i*img_w+j+2*img_h*img_w] = (template_mat.at<cv::Vec3b>(i, j)[0]*scale-0.5)/0.5;
                }
            }
        }
        return true;
    }
    std::string CTCDecode::decode(std::vector<float>& probs,std::vector<int>& indexs,float& mean_prob){
        int ignored_tokens=0;
        std::string text="";
        std::vector<float> n_probs;
        std::vector<int> n_indexs;
        int eff_text_num=0;
        for (int i=0;i<n_channel;i++)
        {
            if(indexs[i]==ignored_tokens)
            {
                continue;
@@ -784,7 +757,6 @@ namespace ppocr
            mean_prob+=probs[i];
            text+=k_words[indexs[i]-1];
            eff_text_num++;
        }
@@ -801,38 +773,26 @@ namespace ppocr
    }

    std::string CTCDecode::postprocess(float* feature)
    {
        std::vector<float> probs;
        std::vector<int> indexs;
        float prob=0.;
        for (int i=0;i<n_channel;i++)
        {
            float* c_feat = feature+i*feature_size;
            int max_index = argmax<float*>(c_feat,c_feat+feature_size);
            float max_pro = c_feat[max_index];
            probs.push_back(max_pro);
            indexs.push_back(max_index);
        }
        std::string text = decode(probs,indexs,prob);
        std::cout<<"ocr res :"<<text<<" "<<prob<<"\n";
        return text;
    }
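The postprocess/decode pair above is a greedy CTC decoder: take the argmax index at each time step, drop the blank token (index 0), and merge what remains into a string while averaging the per-step probabilities into a line confidence. A minimal standalone sketch of the same idea (function and variable names here are illustrative, not the project's API):
```
#include <string>
#include <vector>

// Greedy CTC decode over per-step argmax indices. Index 0 is the blank token;
// consecutive repeats of the same index are collapsed, as in standard CTC.
std::string ctc_greedy_decode(const std::vector<int>& indices,
                              const std::vector<std::string>& dict) {
    const int blank = 0;
    std::string text;
    int prev = blank;
    for (int idx : indices) {
        if (idx != blank && idx != prev)
            text += dict[idx - 1]; // dict holds the non-blank tokens, hence the -1
        prev = idx;
    }
    return text;
}
```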
    std::string CTCDecode::forward(cv::Mat& img){
        preproc(img,data,net_input_width,net_input_height);
        if( this->offload_copy ==false )
        {
            hipMemcpy(input_buffer_device,
@@ -846,8 +806,6 @@ namespace ppocr
                      (void*)output_buffer_device,
                      output_shape.bytes(),
                      hipMemcpyDeviceToHost);
            std::string text = postprocess((float *)output_buffer_host);
            return text;
        }else{
@@ -856,46 +814,65 @@ namespace ppocr
            std::vector<migraphx::argument> results = net.eval(inputData);
            migraphx::argument result = results[0];
            std::string text = postprocess((float *)result.data());
            return text;
        }
    }
    ppOcrEngine::ppOcrEngine(const std::string &det_model_path,
                             const std::string &rec_model_path,
                             const std::string &character_dict_path,
                             const std::string font_path,
                             float segm_thres,
                             float box_thresh,
                             bool offload_copy,
                             std::string precision_mode
                             ){
        text_detector = std::make_shared<OcrDet>(det_model_path,precision_mode,offload_copy,segm_thres,box_thresh);
        text_recognizer = std::make_shared<CTCDecode>(rec_model_path,precision_mode,720,48,3,1,offload_copy,character_dict_path);
        ft2 = std::make_shared<PutText>(font_path.c_str());
    }

    ppOcrEngine::~ppOcrEngine(){
        ;
    }
    void ppOcrEngine::visualize_boxes(cv::Mat &srcimg,
                                      const std::vector<std::vector<std::vector<int>>> &boxes) {
        std::vector<std::vector<cv::Point>> contours;
        for (const auto& box : boxes) {
            std::vector<cv::Point> pts;
            for (const auto& point : box) {
                pts.emplace_back(point[0], point[1]);
            }
            contours.push_back(pts);
        }
        cv::polylines(
            srcimg,
            contours,
            true,                   // closed polylines
            cv::Scalar(0, 255, 0),  // green
            2,                      // line width
            cv::LINE_8              // 8-connected line type
        );
    }
    cv::Mat ppOcrEngine::visualize_text(std::vector<std::string> texts,std::vector<cv::Point> points, cv::Mat &img)
    {
        assert(texts.size() == points.size() && "texts size != points size");
        // White canvas, same size as the source; each text is drawn at its box origin
        cv::Mat draw_img = cv::Mat(img.size(), CV_8UC3, cv::Scalar(255,255,255));
        for(size_t i = 0; i < texts.size(); i++)
        {
            ft2->putText(draw_img, texts[i], points[i].x, points[i].y, 15);
        }
        // Source image on the left, recognized texts on the right
        cv::Mat concat_img;
        cv::hconcat(img, draw_img, concat_img);
        return concat_img;
    }
    std::vector<std::string> ppOcrEngine::forward(cv::Mat &srcimg){
        std::vector<std::vector<std::vector<int>>> text_roi_boxes;
        std::vector<std::string> text_vec;
        auto start = std::chrono::high_resolution_clock::now();
        text_detector->forward(srcimg,text_roi_boxes);
@@ -904,7 +881,8 @@ namespace ppocr
            std::cout<<"Not found text roi !\n";
            return std::vector<std::string>();
        }
        std::cout<<"text_roi_boxes.size(): "<<text_roi_boxes.size()<<"\n";
        std::vector<cv::Point> points;
        for (int n = 0; n < text_roi_boxes.size(); n++) {
            cv::Rect rect;
@@ -920,10 +898,14 @@ namespace ppocr
            text_roi_mat = srcimg(rect).clone();
            std::string text = text_recognizer->forward(text_roi_mat);
            text_vec.push_back(text);
            points.push_back(cv::Point(rect.x,rect.y));
        }
        auto end = std::chrono::high_resolution_clock::now();
        auto duration_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
        std::cout<<"[Time info] elapsed: "<< duration_ms.count() <<" ms\n";
        visualize_boxes(srcimg,text_roi_boxes);
        cv::Mat res_img = visualize_text(text_vec,points, srcimg);
        cv::imwrite("res.jpg",res_img);
        return text_vec;
    }
......
@@ -10,15 +10,9 @@
#include "Filesystem.h"
#include "SimpleLog.h"
#include "clipper.h"
#include "cv_put_Text.hpp"

namespace ppocr{
    class CTCDecode
    {
    private:
@@ -38,7 +32,6 @@ namespace ppocr{
        void* output_buffer_device;
        void* output_buffer_host;
        migraphx::shape input_shape;
        migraphx::shape output_shape;
        std::string input_name;
@@ -51,7 +44,7 @@ namespace ppocr{
    public:
        CTCDecode(std::string rec_model_path,
                  std::string precision_mode="fp16",
                  int image_width=480,
                  int image_height=48,
                  int channel=3,
@@ -61,7 +54,9 @@ namespace ppocr{
        ~CTCDecode();

        /**
         * @brief Text recognition and decoding API; predicts up to 90 characters per line over an 18385-character dictionary
         * @param img input image
         * @return the decoded string
         */
        std::string forward(cv::Mat& img);
@@ -93,7 +88,6 @@ namespace ppocr{
         * @return the decoded text on success, "" on failure
         */
        std::string decode(std::vector<float>& probs,std::vector<int>& indexs,float& mean_prob);
    };

    class OcrDet
@@ -117,25 +111,33 @@ namespace ppocr{
        float* data;
        // Device/host buffers, allocated only when offload_copy is false
        std::unordered_map<std::string, migraphx::argument> dev_argument;
        void* input_buffer_device;
        void* output_buffer_device;
        void* output_buffer_host;
        // Postprocess
        int n_channel;
        int feature_size;   // single-channel feature-map size, e.g. for output [1,3,32,32] feature_size = 32*32
        int output_width;
        int output_height;
        int max_candidates; // maximum number of candidate contours
    public:
        OcrDet(std::string det_model_path,
               std::string precision_mode="fp16",
               bool offload_copy = true,
               float segm_thres = 0.3,
               float box_thresh = 0.7);
        ~OcrDet();
        /**
         * @brief Text-detection inference API
         * @param img source image
         * @param text_roi_boxes detected text regions; each box is
         *        [[tl.x, tl.y], [tr.x, tr.y], [br.x, br.y], [bl.x, bl.y]]
         *        (top-left, top-right, bottom-right, bottom-left corners)
         * @return true on success, false on failure
         */
        bool forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes);
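        // Illustrative use (variable names hypothetical): crop the first detected box
        //   std::vector<std::vector<std::vector<int>>> boxes;
        //   det.forward(img, boxes);
        //   cv::Rect r(boxes[0][0][0], boxes[0][0][1],
        //              boxes[0][2][0] - boxes[0][0][0],
        //              boxes[0][2][1] - boxes[0][0][1]);
        //   cv::Mat roi = img(r).clone();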
    private:
@@ -159,17 +161,18 @@ namespace ppocr{
         */
        int postprocess(float* feature, std::vector<std::vector<std::vector<int>>> &boxes);

        /**
         * @brief Postprocess step: extract text regions
         * @param pred probability map (DBNet segments the text regions; high responses mark text)
         * @param bitmap binary map derived from pred by morphology; combined with pred to compute the mean box score
         * @return the extracted text boxes
         */
        std::vector<std::vector<std::vector<int>>> boxes_from_bitmap(
            const cv::Mat pred, const cv::Mat bitmap, const float &box_thresh,
            const float &det_db_unclip_ratio, const bool &use_polygon_score);

        std::vector<std::vector<float>> Mat2Vector(cv::Mat mat);

        /**
         * @brief Mean score over a polygonal region
         * @param contour contour points of the text region
@@ -238,9 +241,6 @@ namespace ppocr{
         */
        float box_score_fast(std::vector<std::vector<float>> box_array,cv::Mat pred);

        bool text_recognition(const cv::Mat &srcimg,
                              const std::vector<std::vector<std::vector<int>>> &boxes);
@@ -250,16 +250,21 @@ namespace ppocr{
    private:
        std::shared_ptr<OcrDet> text_detector;
        std::shared_ptr<CTCDecode> text_recognizer;
        std::shared_ptr<PutText> ft2;
    public:
        ppOcrEngine(const std::string &det_model_path,
                    const std::string &rec_model_path,
                    const std::string &character_dict_path,
                    const std::string font_path,
                    const float segm_thres=0.3,
                    const float box_thresh=0.7,
                    bool offload_copy =true,
                    std::string precision_mode = "fp16");
        ~ppOcrEngine();
        std::vector<std::string> forward(cv::Mat &srcimg);
        cv::Mat visualize_text(std::vector<std::string> texts,std::vector<cv::Point> points, cv::Mat &img);
        void visualize_boxes(cv::Mat &srcimg,
                             const std::vector<std::vector<std::vector<int>>> &boxes);
    };
}