// ocr_engine.hpp

#pragma once
#include<iostream>
#include <migraphx/program.hpp>
#include <migraphx/onnx.hpp>
#include <migraphx/gpu/target.hpp>
#include <migraphx/quantization.hpp>
#include <hip/hip_runtime_api.h>
#include <opencv2/opencv.hpp>
#include "CommonDefinition.h"
#include "Filesystem.h"
#include "SimpleLog.h"
#include "clipper.h"
#include "cv_put_Text.hpp"

namespace ppocr{
    /**
     * @brief Text-recognition stage: preprocesses a text image, runs the
     *        recognition network and converts the network output into a string.
     *
     * All scalar and raw-pointer members carry in-class default initializers so
     * that they never hold indeterminate values, regardless of which of them the
     * out-of-line constructor assigns.
     *
     * NOTE(review): this class stores raw buffer pointers and declares a
     * destructor, but its copy constructor/assignment are implicitly generated,
     * so a copy would alias the same pointers. Avoid copying instances (hold
     * them through a smart pointer); consider deleting the copy operations once
     * every caller is confirmed not to copy.
     */
    class CTCDecode
    {
    private:
        // Inference image buffer.
        float* data = nullptr;
        // Named migraphx arguments -- presumably the parameter map passed to the program; confirm in the implementation.
        std::unordered_map<std::string, migraphx::argument> device_data;
        migraphx::program net;
        int batch_size = 0;
        int net_input_width = 0;
        int net_input_height = 0;
        int net_input_channel = 0;
        bool offload_copy = true;
        std::string precision_mode;

        // Named migraphx arguments plus raw input/output buffers
        // (presumably used when buffers are managed manually -- confirm).
        std::unordered_map<std::string, migraphx::argument> dev_argument;
        void* input_buffer_device = nullptr;
        void* output_buffer_device = nullptr;
        void* output_buffer_host = nullptr;

        migraphx::shape input_shape;
        migraphx::shape output_shape;
        std::string input_name;
        std::string output_name;

        // Post-processing: n_channel is the model output channel count,
        // feature_size is the feature size of one channel.
        int n_channel = 0;
        int feature_size = 0;
        // Character table (presumably loaded from character_dict_path -- confirm).
        std::vector<std::string> k_words;

    public:
        /**
         * @brief Creates the recognizer.
         * @param rec_model_path      path of the recognition model
         * @param precision_mode      precision mode string (default "fp16")
         * @param image_width         input image width (default 480)
         * @param image_height        input image height (default 48)
         * @param channel             input channel count (default 3)
         * @param batch_size          batch size (default 1)
         * @param offload_copy        offload-copy flag (default true)
         * @param character_dict_path path of the character dictionary file
         */
        CTCDecode(std::string rec_model_path,
                  std::string precision_mode = "fp16",
                  int image_width = 480,
                  int image_height = 48,
                  int channel = 3,
                  int batch_size = 1,
                  bool offload_copy = true,
                  std::string character_dict_path = "./ppocr_keys_v5.txt");

        ~CTCDecode();

        /**
         * @brief Text recognition and decoding API. According to the original
         *        comment, up to 90 characters can be predicted, drawn from a
         *        set of 18385 characters.
         * @param img input image
         * @return the decoded string
         */
        std::string forward(cv::Mat& img);

    private:
        /**
         * @brief Preprocessing.
         * pixel = (src_img*scale-0.5)/0.5;
         * scale = 1.0/255
         * @param img   text image
         * @param data  preprocessing output
         * @param img_w model input width
         * @param img_h model input height
         * @return true on success, false on failure
         */
        bool preproc(cv::Mat img, float* data, int img_w = 480, int img_h = 48);

        /**
         * @brief Post-processes the model prediction: takes the most probable
         *        character of each row and assembles a sentence of at most 90
         *        characters. The model output shape is [1,90,18385] (per the
         *        original comment).
         * @param feature model output
         * @return the text on success, "" on failure
         */
        std::string postprocess(float* feature);

        /**
         * @brief Decoding: associates the model prediction with the character set.
         * @param probs     maximum probabilities predicted by the model
         * @param indexs    indices of those maximum probabilities
         * @param mean_prob mean probability of the predicted sentence
         *                  (non-const reference, presumably an output -- confirm)
         * @return the text on success, "" on failure
         */
        std::string decode(std::vector<float>& probs, std::vector<int>& indexs, float& mean_prob);
    };
 
    /**
     * @brief Text-detection stage: preprocesses an image, runs the detection
     *        network and extracts text-region boxes from the network output.
     *
     * All scalar and raw-pointer members carry in-class default initializers so
     * that they never hold indeterminate values, regardless of which of them the
     * out-of-line constructor assigns.
     *
     * NOTE(review): this class stores raw buffer pointers and declares a
     * destructor, but its copy constructor/assignment are implicitly generated,
     * so a copy would alias the same pointers. Avoid copying instances (hold
     * them through a smart pointer); consider deleting the copy operations once
     * every caller is confirmed not to copy.
     */
    class OcrDet
    {
    private:
        std::string precision_mode;
        bool offload_copy = true;
        migraphx::program net;
        migraphx::shape input_shape;
        migraphx::shape output_shape;
        std::string input_name;
        std::string output_name;
        int det_batch_size = 0;
        int data_size = 0;
        float segm_thres = 0.0f;
        float box_thres = 0.0f;

        int net_input_width = 0;
        int net_input_height = 0;
        int net_input_channel = 0;

        float* data = nullptr;

        // Device memory is allocated when offload_copy is true (per the original comment).
        // NOTE(review): with MIGraphX, offload_copy=true normally means the runtime
        // performs the host/device copies itself, so manually allocated device
        // buffers would be expected when it is false -- confirm against the constructor.
        std::unordered_map<std::string, migraphx::argument> dev_argument;
        void* input_buffer_device = nullptr;
        void* output_buffer_device = nullptr;
        void* output_buffer_host = nullptr;

        // Post-processing.
        int n_channel = 0;
        int feature_size = 0;   // Feature size of a single channel, e.g. for model output [1,3,32,32], feature_size = 32x32.
        int output_width = 0;
        int output_height = 0;
        int max_candidates = 0; // Maximum number of candidate regions to detect.

    public:
        /**
         * @brief Creates the detector.
         * @param det_model_path path of the detection model
         * @param precision_mode precision mode string (default "fp16")
         * @param offload_copy   offload-copy flag (default true)
         * @param segm_thres     segmentation threshold (default 0.3)
         * @param box_thresh     box threshold (default 0.7)
         */
        OcrDet(std::string det_model_path,
               std::string precision_mode = "fp16",
               bool offload_copy = true,
               float segm_thres = 0.3,
               float box_thresh = 0.7);

        ~OcrDet();

        /**
         * @brief Text-detection model inference API.
         * @param img            original image
         * @param text_roi_boxes text-region coordinates, one entry per region, in
         *                       the corner order top-left, top-right, bottom-right,
         *                       bottom-left:
         *                       [[[tl.x, tl.y], [tr.x, tr.y], [br.x, br.y], [bl.x, bl.y]], ...]
         *                       (NOTE(review): the original comment listed an extra
         *                       empty "[]" entry between the second and third
         *                       corner -- confirm the exact layout.)
         * @return true on success, false on failure
         */
        bool forward(cv::Mat& img, std::vector<std::vector<std::vector<int>>>& text_roi_boxes);

    private:
        /**
         * @brief Preprocessing.
         * pixel = (scale*src_img*mean/std);
         * scale = 1.0/255
         * mean = [0.485, 0.456, 0.406]
         * std = [0.229, 0.224, 0.225]
         * NOTE(review): the formula is reproduced from the original comment;
         * mean/std normalization is usually (scale*src_img - mean)/std -- confirm
         * against the implementation.
         * @param img  text image
         * @param data preprocessing output
         * @return on success, the scaling ratios of the w and h dimensions (per the
         *         original comment; note that the return type is cv::Size -- confirm)
         */
        cv::Size preproc(cv::Mat img, float* data);

        /**
         * @brief Post-processing: obtains the text regions from the binary map
         *        predicted by the model.
         * @param feature tensor predicted by the model (dbnet is used for text
         *                detection here, per the original comment)
         * @param boxes   text-region coordinates
         * @return 0 on success, -1 on failure
         */
        int postprocess(float* feature, std::vector<std::vector<std::vector<int>>>& boxes);

        /**
         * @brief Post-processing: text-region extraction.
         * @param pred   binary map (dbnet segments the text regions; the map
         *               corresponds to the text regions, per the original comment)
         * @param bitmap binary map (result of morphological operations on pred;
         *               used together with pred to compute the mean box score)
         * @param box_thresh           box score threshold (presumably regions
         *                             scoring below it are dropped -- confirm)
         * @param det_db_unclip_ratio  unclip ratio (presumably forwarded to unClip -- confirm)
         * @param use_polygon_score    presumably selects polygon-based scoring
         *                             instead of the fast box score -- confirm
         * @return the text-region coordinates (the original comment stated
         *         "0 / -1", which does not match the declared return type)
         */
        std::vector<std::vector<std::vector<int>>> boxes_from_bitmap(
            const cv::Mat pred, const cv::Mat bitmap, const float& box_thresh,
            const float& det_db_unclip_ratio, const bool& use_polygon_score);

        /**
         * @brief Converts a cv::Mat into nested float vectors.
         *        (Exact row/column layout is defined in the implementation.)
         */
        std::vector<std::vector<float>> Mat2Vector(cv::Mat mat);

        /**
         * @brief Computes the mean score of a polygon region.
         * @param contour contour points of the text region
         * @param pred    binary map predicted by the model
         * @return score
         */
        float polygon_score_acc(std::vector<cv::Point> contour, cv::Mat pred);

        /**
         * @brief Expands the region predicted by the model inwards or outwards by
         *        unclip_ratio, in order to find a better-fitting text region.
         * @param box          text-region coordinates
         * @param unclip_ratio expansion ratio
         * @return the processed text region
         */
        cv::RotatedRect unClip(std::vector<std::vector<float>> box,
                               const float& unclip_ratio);

        /**
         * @brief Computes the offset distance.
         *  distance = area * unclip_ratio / dist;
         *  area = sum over i of (x_i*y_{i+1} - x_{i+1}*y_i)
         *  dist = sqrtf(dx * dx + dy * dy)
         *
         * @param box          text-region coordinates
         * @param unclip_ratio scaling ratio
         * @param distance     offset distance (output)
         */
        void get_contour_area(const std::vector<std::vector<float>>& box,
                              float unclip_ratio, float& distance);

        /**
         * @brief Filters out invalid text regions. The boxes are first mapped back
         *        to the original image, then invalid regions are filtered.
         * @param boxes   text-region coordinates
         * @param ratio_h scaling ratio in the vertical direction
         * @param ratio_w scaling ratio in the horizontal direction
         * @param srcimg  original image
         * @return valid text-region coordinates
         */
        std::vector<std::vector<std::vector<int>>> filter_det_res(
            std::vector<std::vector<std::vector<int>>> boxes,
            float ratio_h, float ratio_w, cv::Mat srcimg);

        /**
         * @brief Sorts the text-region points from top to bottom and from left to
         *        right (per the original comment).
         * @param pts text-region coordinates
         * @return valid text-region coordinates
         */
        std::vector<std::vector<int>> order_points_clockwise(std::vector<std::vector<int>> pts);

        /**
         * @brief Gets the coordinates of the minimal rectangle.
         * @param box  minimal bounding rectangle of the text region
         * @param ssid the largest side of box (non-const reference, presumably an
         *             output -- confirm)
         * @return valid text-region coordinates
         */
        std::vector<std::vector<float>> get_mini_boxes(cv::RotatedRect box, float& ssid);

        /**
         * @brief Computes the mean score of the box region on the prediction map.
         * @param box_array text region predicted by the model
         * @param pred      binary map predicted by the model
         * @return score
         */
        float box_score_fast(std::vector<std::vector<float>> box_array, cv::Mat pred);

        /**
         * @brief Undocumented in the original header.
         *        NOTE(review): presumably runs recognition on the given boxes of
         *        srcimg -- confirm against the implementation.
         */
        bool text_recognition(const cv::Mat& srcimg,
                              const std::vector<std::vector<std::vector<int>>>& boxes);
    };

    /**
     * @brief Top-level OCR engine that bundles a text detector (OcrDet), a text
     *        recognizer (CTCDecode) and a text-drawing helper (PutText).
     */
    class ppOcrEngine
    {
    public:
        /**
         * @brief Creates the engine.
         * @param det_model_path      path of the detection model
         * @param rec_model_path      path of the recognition model
         * @param character_dict_path path of the character dictionary file
         * @param front               presumably the font used when drawing text
         *                            (the name looks like a typo of "font") -- confirm
         * @param segm_thres          segmentation threshold (default 0.3)
         * @param box_thresh          box threshold (default 0.7)
         * @param offload_copy        offload-copy flag (default true)
         * @param precision_mode      precision mode string (default "fp16")
         */
        ppOcrEngine(const std::string& det_model_path,
                    const std::string& rec_model_path,
                    const std::string& character_dict_path,
                    const std::string front,
                    const float segm_thres = 0.3,
                    const float box_thresh = 0.7,
                    bool offload_copy = true,
                    std::string precision_mode = "fp16");

        ~ppOcrEngine();

        /**
         * @brief Runs the engine on one image.
         * @param srcimg source image
         * @return a list of strings (presumably the recognized text of each
         *         detected region -- confirm)
         */
        std::vector<std::string> forward(cv::Mat& srcimg);

        /**
         * @brief Visualizes texts on an image.
         * @param texts  strings to visualize
         * @param points presumably the anchor position of each string -- confirm
         * @param img    image to visualize on
         * @return the resulting image
         */
        cv::Mat visualize_text(std::vector<std::string> texts,
                               std::vector<cv::Point> points,
                               cv::Mat& img);

        /**
         * @brief Visualizes text-region boxes on an image.
         * @param srcimg image to visualize on
         * @param boxes  text-region coordinates
         */
        void visualize_boxes(cv::Mat& srcimg,
                             const std::vector<std::vector<std::vector<int>>>& boxes);

    private:
        std::shared_ptr<OcrDet> text_detector;      // detection stage
        std::shared_ptr<CTCDecode> text_recognizer; // recognition stage
        std::shared_ptr<PutText> ft2;               // text-drawing helper
    };

}