# 概述 PP-OCRv5是PP-OCR新一代文字识别解决方案，该方案聚焦于多场景、多文字类型的文字识别。在文字类型方面，PP-OCRv5支持简体中文、中文拼音、繁体中文、英文、日文5大主流文字类型；在场景方面，PP-OCRv5升级了中英复杂手写体、竖排文本、生僻字等多种挑战性场景的识别能力。在内部多场景复杂评估集上，PP-OCRv5较PP-OCRv4端到端提升13个百分点。本sample适配了PP-OCRv5字符检测和识别模型，并使用MIGraphX的C++接口实现推理。 ## 模型简介 ### 文本检测 文本检测使用了DBNet(论文地址：https://arxiv.org/pdf/1911.08947)，网络结构：![alt text](Images/DBNet.png)，模型输出概率图，并用Vatti Clipping算法对字符区域多边形进行简化处理，sample中借助Clipping 库。 sample中使用动态shape（N,3,H,W），最大输入shape是[1,3,640,640]，模型地址：Resource/Models/ppocrv5_server_det_infer.onnx ### 文本识别 文本识别使用了CRNN+CTCDecode(论文地址：https://arxiv.org/pdf/2009.09941)，网络结构：![alt text](Images/CRNN.png)，sample中使用了动态shape（N,3,48,W），最大输入shape是[1,3,48,720]，模型地址：Resource/Models/ppocrv5_server_rec_infer.onnx ## 预处理 ### 检测模型预处理 检测模型输入数据预处理： - 图片等比缩放，填充（沿着右、下填充） - 图片归一化，减均值、除以标准差 - transpose，MIGraphX的输入数据排布顺序为[N,C,H,W] ```c++ cv::Size OcrDet::preproc(cv::Mat img,float* data) { float scale = 1.0/255.0; std::vector s_mean={0.485, 0.456, 0.406}; std::vector s_stdv={0.229, 0.224, 0.225}; if(img.empty()) { std::cout<<"Source image is empty!\n"; return cv::Size(1.0,1.0); } cv::Mat res_img; cv::Size scale_r; scale_r.width = float(net_input_width)/float(img.cols); scale_r.height = float(net_input_height)/float(img.rows); //等比缩放 cv::resize(img,res_img,cv::Size(net_input_width,net_input_height)); int iw = res_img.cols; int ih = res_img.rows; memset(data,0.0,3*iw*ih*sizeof(float)); //HWC->CHW for(int i=0;i(i, j)[2])*scale-s_mean[2])/s_stdv[2]; data[i*net_input_width+j+net_input_height*net_input_width] = (float(res_img.at(i, j)[1])*scale-s_mean[1])/s_stdv[1]; data[i*net_input_width+j] = (float(res_img.at(i, j)[0])*scale-s_mean[0])/s_stdv[0]; } } return scale_r ; } ``` ### 字符识别模型预处理 字符识别模型输入数据预处理： - 等比缩放，保留H维度的原始比例，填充(沿着右、下) - 图片归一化，均值和标准差默认为0.5 - transpose，MIGraphX的输入数据排布顺序为[N,C,H,W] ```c++ bool CTCDecode::preproc(cv::Mat img,float* data,int img_w,int img_h) { if (img.empty()) { std::cout<<"WARNING image is empty!\n"; return false; } float scale=1.0/255.; int iw=img.cols; int ih=img.rows; float 
ratio=min(img_h*1.0/ih,img_w*1.0/iw); int nw=static_cast (iw*ratio); int nh=img_h; cv::Mat res_mat; cv::resize(img,res_mat,cv::Size(nw,nh)); cv::Mat template_mat=cv::Mat(img_h,img_w,CV_8UC3,cv::Scalar(0,0,0)); int xdet=img_w-nw; int ydet=img_h-nh; cv::copyMakeBorder(res_mat, template_mat, 0,ydet, 0, xdet, 0); memset(data,0.0,this->batch_size*3*img_w*img_h*sizeof(float)); for(int b =0 ; b < this->batch_size;b++ ) { for(int i=0;i(i, j)[2]*scale-0.5)/0.5; data[i*img_w+j+img_h*img_w] = (template_mat.at(i, j)[1]*scale-0.5)/0.5; data[i*img_w+j+2*img_h*img_w] =( template_mat.at(i, j)[0]*scale-0.5)/0.5; } } } return true ; } ``` ## 类介绍 ppOcrEngine 封装了对外提供的API，OcrDet为文本检测类，CTCDecode为文本识别类。文本检测和文本识别在ppOcrEngine中是两个智能指针变量。在forward中，首先调用text_detector检测图片中的所有字符区域，然后分别将检测到的区域传入text_recognizer，识别字符区域的内容。 ```c++ class ppOcrEngine { private: std::shared_ptr text_detector; std::shared_ptr text_recognizer; public: ppOcrEngine(const std::string &det_model_path, const std::string &rec_model_path, const std::string &character_dict_path, const float segm_thres=0.3, const float box_thresh=0.7, bool offload_copy =true, std::string precision_mode = "fp32") ; /** * @brief OCR engine初始化 * @param det_model_path 字符检测模型路径 * @param rec_model_path 识别模型路径 * @param character_dict_path 字符字典路径 * @param segm_thres 像素分割阈值 * @param box_thresh 字符区域box阈值 * @param offload_copy 内存拷贝存模式，支持两种数据拷贝方式：*offload_copy=true、offload_copy=false。当offload_copy为true时，不需*要进行内存拷贝，如果为false，需要先预分配输入输出的设备内存，并在推理* *前，将预处理数据拷贝到设备内存，推理后将模型输出从设备内存中拷贝出来 * @param precision_mode 精度模式，支持：fp32、fp16 * * @return NONE */ ~ppOcrEngine(); std::vector forward(cv::Mat &srcimg); }; class CTCDecode { private: //inference image float* data; std::unordered_map device_data; migraphx::program net; int batch_size; int net_input_width; int net_input_height; int net_input_channel; bool offload_copy; std::string precision_mode; std::unordered_map dev_argument; void* input_buffer_device; void* output_buffer_device; void* output_buffer_host; 
migraphx::shape input_shape; migraphx::shape output_shape; std::string input_name; std::string output_name; //postprocess: n_channel->model output channel,feature_size--> feature size one channel int n_channel; int feature_size; std::vector k_words; public: CTCDecode(std::string rec_model_path, std::string precision_mode="fp32", int image_width=480, int image_height=48, int channel=3, int batch_size = 1, bool offload_copy = true, std::string character_dict_path="./ppocr_keys_v5.txt"); ~CTCDecode(); /** * @brief 字符识别编码，可支持，最长可支持预测90个字符，18385个字符 */ std::string forward(cv::Mat& img); private: /** * @brief 预处理 * pixel = (src_img*scale-0.5)/0.5; * scale = 1.0/255 * @param img 字符图片 * @param data 预处理输出 * @param img_w 模型输入宽 * @param img_h 模型输入高 * @return 成功：true,失败：false */ bool preproc(cv::Mat img,float* data,int img_w=480,int img_h=48); /** * @brief 模型预测后处理，获取每行中概率最大的字符，组成一句长度最大为90个字符的句子，模型预测输出shape=[1,90,18385] * @param feature model output * @return 成功：text,失败："" */ std::string postprocess(float* feature); /** * @brief 解码，将模型预测输出与字符集关联起来 * @param probs 模型预测的最大概率 * @param indexs 模型预测的最大概率的索引值 * @param mean_prob 预测句子的平均概率 * @return 成功：text,失败："" */ std::string decode(std::vector& probs,std::vector& indexs,float& mean_prob); }; class OcrDet { private: std::string precision_mode; bool offload_copy; migraphx::program net; migraphx::shape input_shape; migraphx::shape output_shape; std::string input_name; std::string output_name; int det_batch_size; int data_size ; float segm_thres; float box_thres; int net_input_width; int net_input_height; int net_input_channel; float* data; //Allocate device buffer and host buffer,if offload_copy is false std::unordered_map dev_argument; void* input_buffer_device; void* output_buffer_device; void* output_buffer_host; //postprocess int n_channel; int feature_size; //single channel feature map size. int output_width; int output_height; int max_candidates;//maximun number of candidates contours. 
public: OcrDet(std::string det_model_path, std::string precision_mode="float32", bool offload_copy = true, float segm_thres = 0.3, float box_thresh = 0.7); ~OcrDet(); bool forward(cv::Mat& img,std::vector>>& text_roi_boxes); private: /** * @brief 预处理 * pixel = (scale*src_img*mean/std); * scale = 1.0/255 * mean = [0.485, 0.456, 0.406] * std = [0.229, 0.224, 0.225] * @param img 字符图片 * @param data 预处理输出 * @return 成功：w,h维度的缩放比例 */ cv::Size preproc(cv::Mat img,float* data); /** * @brief 后处理，通过模型预测的二值图获取文本区域 * @param feature 模型预测tensor（这里字符检测使用了dbnet） * @param boxes 字符区域坐标 * @return 成功：0,失败：-1 */ int postprocess(float* feature, std::vector>> &boxes); int boxes_from_bitmap(cv::Mat& bit_map,std::vector& box); std::vector>>boxes_from_bitmap( const cv::Mat pred, const cv::Mat bitmap, const float &box_thresh, const float &det_db_unclip_ratio, const bool &use_polygon_score); std::vector> Mat2Vector(cv::Mat mat); /** * @brief 统计多边形区域的平均得分 * @param contour 字符区域的轮廓点集合 * @param pred 模型预测二值图 * @return score */ float polygon_score_acc(std::vector contour,cv::Mat pred); /** * @brief 对模型预测的区域进行向内或向外扩散，扩散比例是unclip_ratio ，目的是找到更加合适的字符区域 * @param box 字符区域坐标 * @param pred 模型预测二值图 * @return 处理后的字符区域 */ cv::RotatedRect unClip(std::vector> box, const float &unclip_ratio); /** * @brief 计算偏移距离 * distance = area * unclip_ratio / dist; * area = ∑(x_i*y_{i+1} - x_{i+1}*y_i) * dist = sqrtf(dx * dx + dy * dy) * * @param box 字符区域坐标 * @param unclip_ratio 缩放比例 * @param distance 偏移距离 * @return NONE */ void get_contour_area(const std::vector> &box, float unclip_ratio, float &distance) ; /** * @brief 无效字符区域过滤。首先将boxes映射回原始图像，然后过滤无效区域 * @param boxes 字符区域坐标 * @param ratio_h 垂直方向缩放比例 * @param ratio_w 水平方向缩放比例 * @param srcimg 原始图像 * * @return 字符区域有效坐标 */ std::vector>> filter_det_res(std::vector>> boxes, float ratio_h, float ratio_w, cv::Mat srcimg); /** * @brief 对字符区域按照从上到下，从左到右的顺序排序 * @param pts 字符区域坐标 * * @return 字符区域有效坐标 */ std::vector> order_points_clockwise(std::vector> pts); /** * @brief 获取最小矩形坐标 * 
@param box 字符区域最小外接矩形的坐标 * @param ssid box的最大边 * @return 字符区域有效坐标 */ std::vector> get_mini_boxes(cv::RotatedRect box,float &ssid) ; /** * @brief 计算bitmap上的t_rect区域的平均分数 * @param box_array 模型预测的字符区域 * @param pred 模型预测二值图 * @return score */ float box_score_fast(std::vector> box_array,cv::Mat pred) ; void visualize_boxes(const cv::Mat &srcimg, const std::vector>> &boxes) ; bool text_recognition(const cv::Mat &srcimg, const std::vector>> &boxes); }; ``` ## 推理 ### 字符检测模型推理 ```c++ bool OcrDet::forward(cv::Mat& img,std::vector>>& text_roi_boxes) { std::vector>> boxes; //输入数据预处理 cv::Size ratio = preproc(img,data); /* 支持两种数据拷贝方式：offload_copy=true、offload_copy=false。当offload_copy为true时，不需要进行内存拷贝，如果为false，需要先预分配输入输出的设备内存，并在推理前，将预处理数据拷贝到设备内存，推理后将模型输出从设备内存中拷贝出来，在做后处理。 */ if( this->offload_copy ==false ) { hipMemcpy(input_buffer_device, (void*)data, this->input_shape.bytes(), hipMemcpyHostToDevice); std::vector results = net.eval(dev_argument); hipMemcpy(output_buffer_host, (void*)output_buffer_device, output_shape.bytes(), hipMemcpyDeviceToHost); postprocess((float *)output_buffer_host,boxes); std::cout<<"copy mode ..."< inputData; inputData[input_name] = migraphx::argument{input_shape, (float *)data}; std::vector results = net.eval(inputData); migraphx::argument result = results[0] ; //get output data postprocess((float *)result.data(),boxes); std::cout<<"offload copy mode ..."<offload_copy ==false ) { hipMemcpy(input_buffer_device, (void*)data, this->input_shape.bytes(), hipMemcpyHostToDevice); std::vector results = net.eval(dev_argument); hipMemcpy(output_buffer_host, (void*)output_buffer_device, output_shape.bytes(), hipMemcpyDeviceToHost); //模型后处理，获取字符的最大概率和索引，并根据索引在字符库中查找对应的字符，然后合成一个句子 std::string text = postprocess((float *)output_buffer_device); return text; }else{ std::unordered_map inputData; inputData[input_name] = migraphx::argument{input_shape, (float *)data}; std::vector results = net.eval(inputData); migraphx::argument result = results[0] ; std::string text = 
postprocess((float *)result.data()); // std::cout<<"ctc: offload copy mode ..."<