"tools/vscode:/vscode.git/clone" did not exist on "e1458ec8e36190766c42dab4912a8e97b6ee5097"
Commit 417a4ca0 authored by liuhy's avatar liuhy
Browse files

1、新增warm up功能 2、新增图片叠加OCR字符功能

parent 369751c2
...@@ -10,6 +10,7 @@ set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} -std=c++17) ...@@ -10,6 +10,7 @@ set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} -std=c++17)
set(CMAKE_BUILD_TYPE release) set(CMAKE_BUILD_TYPE release)
set(INCLUDE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/Src/ set(INCLUDE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/Src/
/usr/include/freetype2
$ENV{DTKROOT}/include/ $ENV{DTKROOT}/include/
${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility ${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility
${CMAKE_CURRENT_SOURCE_DIR}/3rdParty/opencv/include) ${CMAKE_CURRENT_SOURCE_DIR}/3rdParty/opencv/include)
...@@ -17,6 +18,7 @@ include_directories(${INCLUDE_PATH}) ...@@ -17,6 +18,7 @@ include_directories(${INCLUDE_PATH})
# 添加依赖库路径 # 添加依赖库路径
set(LIBRARY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/3rdParty/opencv/lib set(LIBRARY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/3rdParty/opencv/lib
/usr/lib/x86_64-linux-gnu
$ENV{DTKROOT}/lib/) $ENV{DTKROOT}/lib/)
link_directories(${LIBRARY_PATH}) link_directories(${LIBRARY_PATH})
...@@ -24,6 +26,7 @@ link_directories(${LIBRARY_PATH}) ...@@ -24,6 +26,7 @@ link_directories(${LIBRARY_PATH})
set(LIBRARY opencv_core set(LIBRARY opencv_core
opencv_imgproc opencv_imgproc
opencv_imgcodecs opencv_imgcodecs
freetype
opencv_dnn opencv_dnn
migraphx migraphx
migraphx_gpu migraphx_gpu
...@@ -36,6 +39,7 @@ link_libraries(${LIBRARY}) ...@@ -36,6 +39,7 @@ link_libraries(${LIBRARY})
set(SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/Src/main.cpp set(SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/Src/main.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility/Filesystem.cpp ${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility/Filesystem.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility/clipper.cpp ${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility/clipper.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility/cv_put_Text.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/ocr_engine.cpp) ${CMAKE_CURRENT_SOURCE_DIR}/Src/ocr_engine.cpp)
# 添加可执行目标 # 添加可执行目标
......
Doc/Images/CRNN.png

112 KB | W: | H:

Doc/Images/CRNN.png

96.4 KB | W: | H:

Doc/Images/CRNN.png
Doc/Images/CRNN.png
Doc/Images/CRNN.png
Doc/Images/CRNN.png
  • 2-up
  • Swipe
  • Onion skin
Doc/Images/DBNet.png

597 KB | W: | H:

Doc/Images/DBNet.png

311 KB | W: | H:

Doc/Images/DBNet.png
Doc/Images/DBNet.png
Doc/Images/DBNet.png
Doc/Images/DBNet.png
  • 2-up
  • Swipe
  • Onion skin
...@@ -4,11 +4,15 @@ PP-OCRv5 是PP-OCR新一代文字识别解决方案,该方案聚焦于多场 ...@@ -4,11 +4,15 @@ PP-OCRv5 是PP-OCR新一代文字识别解决方案,该方案聚焦于多场
## 模型简介 ## 模型简介
### 文本检测 ### 文本检测
文本检测使用了dbnet(论文地址:https://arxiv.org/pdf/1911.08947),网络结构:![alt text](Images/DBNet.png),模型输出概率图,并用Vatti Clipping算法对字符区域多边形简化处理,sample中借助Clipping 库。 sample中使用动态shape(N,3,H,C),最大输入shape是[1,3,640,640],模型地址:Resource/Models/ppocrv5_server_det_infer.onnx 文本检测使用了dbnet( 论文地址:https://arxiv.org/pdf/1911.08947 ),网络结构:
![alt text](Images/DBNet.png)
模型输出概率图,并用Vatti Clipping算法对字符区域多边形简化处理,sample中借助Clipping 库。 sample模型输入shape为[1,3,640,640],模型路径:Resource/Models/ppocrv5_server_det_infer.onnx
### 文本识别 ### 文本识别
文本识别使用了CRNN+CTCDecode(https://arxiv.org/pdf/2009.09941),网络结构:![alt text](Images/CRNN.png),sample中使用了动态shape (N,3,48,W),最大输入shape是[1,3,48,720],模型地址:Resource/Models/ppocrv5_server_rec_infer.onnx 文本识别使用了CRNN+CTCDecode( https://arxiv.org/pdf/2009.09941 ),网络结构:
![(Images/CRNN.png)](Images/CRNN.png)
sample中模型输入shape为[1,3,48,720],模型路径:Resource/Models/ppocrv5_server_rec_infer.onnx
## 预处理 ## 预处理
### 检测模型预处理 ### 检测模型预处理
检测模型输入数据预处理: 检测模型输入数据预处理:
...@@ -110,7 +114,7 @@ class ppOcrEngine { ...@@ -110,7 +114,7 @@ class ppOcrEngine {
const float segm_thres=0.3, const float segm_thres=0.3,
const float box_thresh=0.7, const float box_thresh=0.7,
bool offload_copy =true, bool offload_copy =true,
std::string precision_mode = "fp32") ; std::string precision_mode = "fp16") ;
/** /**
* @brief OCR engine初始化 * @brief OCR engine初始化
* @param det_model_path 字符检测模型路径 * @param det_model_path 字符检测模型路径
...@@ -119,7 +123,7 @@ class ppOcrEngine { ...@@ -119,7 +123,7 @@ class ppOcrEngine {
* @param segm_thres 像素分割阈值 * @param segm_thres 像素分割阈值
* @param box_thresh 字符区域box阈值 * @param box_thresh 字符区域box阈值
* @param offload_copy 内存拷贝存模式, 支持两种数据拷贝方式:*offload_copy=true、offload_copy=false。当offload_copy为true时,不需*要进行内存拷贝,如果为false,需要先预分配输入输出的设备内存,并在推理* *前,将预处理数据拷贝到设备内存,推理后将模型输出从设备内存中拷贝出来 * @param offload_copy 内存拷贝存模式, 支持两种数据拷贝方式:*offload_copy=true、offload_copy=false。当offload_copy为true时,不需*要进行内存拷贝,如果为false,需要先预分配输入输出的设备内存,并在推理* *前,将预处理数据拷贝到设备内存,推理后将模型输出从设备内存中拷贝出来
* @param precision_mode 精度模式,支持:fp32、fp16 * @param precision_mode 精度模式,支持:fp32、fp16,默认支持fp16
* *
* @return NONE * @return NONE
*/ */
...@@ -130,36 +134,11 @@ class ppOcrEngine { ...@@ -130,36 +134,11 @@ class ppOcrEngine {
class CTCDecode class CTCDecode
{ {
private: private:
//inference image ...
float* data;
std::unordered_map<std::string, migraphx::argument> device_data;
migraphx::program net;
int batch_size;
int net_input_width;
int net_input_height;
int net_input_channel;
bool offload_copy;
std::string precision_mode;
std::unordered_map<std::string, migraphx::argument> dev_argument;
void* input_buffer_device;
void* output_buffer_device;
void* output_buffer_host;
migraphx::shape input_shape;
migraphx::shape output_shape;
std::string input_name;
std::string output_name;
//postprocess: n_channel->model output channel,feature_size--> feature size one channel
int n_channel;
int feature_size;
std::vector<std::string> k_words;
public: public:
CTCDecode(std::string rec_model_path, CTCDecode(std::string rec_model_path,
std::string precision_mode="fp32", std::string precision_mode="fp16",
int image_width=480, int image_width=480,
int image_height=48, int image_height=48,
int channel=3, int channel=3,
...@@ -169,73 +148,21 @@ class ppOcrEngine { ...@@ -169,73 +148,21 @@ class ppOcrEngine {
~CTCDecode(); ~CTCDecode();
/** /**
* @brief 字符识别编码,可支持,最长可支持预测90个字符,18385个字符 * @brief 字符识别、编码API 字符识别编码,可支持,最长可支持预测90个字符,18385个字符
* @param img 输入图片
* @return 编码后的字符串
*/ */
std::string forward(cv::Mat& img); std::string forward(cv::Mat& img);
private: private:
/** ...
* @brief 预处理
* pixel = (src_img*scale-0.5)/0.5;
* scale = 1.0/255
* @param img 字符图片
* @param data 预处理输出
* @param img_w 模型输入宽
* @param img_h 模型输入高
* @return 成功:true,失败:false
*/
bool preproc(cv::Mat img,float* data,int img_w=480,int img_h=48);
/**
* @brief 模型预测后处理,获取每行中概率最大的字符,组成一句长度最大为90个字符的句子,模型预测输出shape=[1,90,18385]
* @param feature model output
* @return 成功:text,失败:""
*/
std::string postprocess(float* feature);
/**
* @brief 解码,将模型预测输出与字符集关联起来
* @param probs 模型预测的最大概率
* @param indexs 模型预测的最大概率的索引值
* @param mean_prob 预测句子的平均概率
* @return 成功:text,失败:""
*/
std::string decode(std::vector<float>& probs,std::vector<int>& indexs,float& mean_prob);
}; };
class OcrDet class OcrDet
{ {
private: private:
std::string precision_mode; ...
bool offload_copy;
migraphx::program net;
migraphx::shape input_shape;
migraphx::shape output_shape;
std::string input_name;
std::string output_name;
int det_batch_size;
int data_size ;
float segm_thres;
float box_thres;
int net_input_width;
int net_input_height;
int net_input_channel;
float* data;
//Allocate device buffer and host buffer,if offload_copy is false
std::unordered_map<std::string, migraphx::argument> dev_argument;
void* input_buffer_device;
void* output_buffer_device;
void* output_buffer_host;
//postprocess
int n_channel;
int feature_size; //single channel feature map size.
int output_width;
int output_height;
int max_candidates;//maximun number of candidates contours.
public: public:
OcrDet(std::string det_model_path, OcrDet(std::string det_model_path,
...@@ -244,113 +171,19 @@ class ppOcrEngine { ...@@ -244,113 +171,19 @@ class ppOcrEngine {
float segm_thres = 0.3, float segm_thres = 0.3,
float box_thresh = 0.7); float box_thresh = 0.7);
~OcrDet(); ~OcrDet();
bool forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes);
private:
/**
* @brief 预处理
* pixel = (scale*src_img*mean/std);
* scale = 1.0/255
* mean = [0.485, 0.456, 0.406]
* std = [0.229, 0.224, 0.225]
* @param img 字符图片
* @param data 预处理输出
* @return 成功:w,h维度的缩放比例
*/
cv::Size preproc(cv::Mat img,float* data);
/**
* @brief 后处理,通过模型预测的二值图获取文本区域
* @param feature 模型预测tensor(这里字符检测使用了dbnet)
* @param boxes 字符区域坐标
* @return 成功:0,失败:-1
*/
int postprocess(float* feature, std::vector<std::vector<std::vector<int>>> &boxes);
int boxes_from_bitmap(cv::Mat& bit_map,std::vector<T_BOX>& box);
std::vector<std::vector<std::vector<int>>>boxes_from_bitmap(
const cv::Mat pred, const cv::Mat bitmap, const float &box_thresh,
const float &det_db_unclip_ratio, const bool &use_polygon_score);
std::vector<std::vector<float>> Mat2Vector(cv::Mat mat);
/**
* @brief 统计多边形区域的平均得分
* @param contour 字符区域的轮廓点集合
* @param pred 模型预测二值图
* @return score
*/
float polygon_score_acc(std::vector<cv::Point> contour,cv::Mat pred);
/**
* @brief 对模型预测的区域进行向内或向外扩散,扩散比例是unclip_ratio ,目的是找到更加合适的字符区域
* @param box 字符区域坐标
* @param pred 模型预测二值图
* @return 处理后的字符区域
*/
cv::RotatedRect unClip(std::vector<std::vector<float>> box,
const float &unclip_ratio);
/**
* @brief 计算偏移距离
* distance = area * unclip_ratio / dist;
* area = ∑(x_i*y_{i+1} - x_{i+1}*y_i)
* dist = sqrtf(dx * dx + dy * dy)
*
* @param box 字符区域坐标
* @param unclip_ratio 缩放比例
* @param distance 偏移距离
* @return NONE
*/
void get_contour_area(const std::vector<std::vector<float>> &box,
float unclip_ratio, float &distance) ;
/**
* @brief 无效字符区域过滤。首先将boxes映射回原始图像,然后过滤无效区域
* @param boxes 字符区域坐标
* @param ratio_h 垂直方向缩放比例
* @param ratio_w 水平方向缩放比例
* @param srcimg 原始图像
*
* @return 字符区域有效坐标
*/
std::vector<std::vector<std::vector<int>>> filter_det_res(std::vector<std::vector<std::vector<int>>> boxes,
float ratio_h, float ratio_w, cv::Mat srcimg);
/** /**
* @brief 对字符区域按照从上到下,从左到右的顺序排序 * @brief 字符检测模型推理API
* @param pts 字符区域坐标 * @param img 原始图片
* * @param text_roi_boxes 字符区域坐标,格式:[[[tl.x, tl.y], [tr.x, tr.y],[], [br.x, br.y], [bl.x, bl.y]]]]
* @return 字符区域有效坐标 * | | | |
*/ * 左上坐标 右上坐标 右下坐标 左下坐标
std::vector<std::vector<int>> order_points_clockwise(std::vector<std::vector<int>> pts); * @return 成功返回true,失败返回false
/**
* @brief 获取最小矩形坐标
* @param box 字符区域最小外接矩形的坐标
* @param ssid box的最大边
* @return 字符区域有效坐标
*/ */
std::vector<std::vector<float>> get_mini_boxes(cv::RotatedRect box,float &ssid) ; bool forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes);
/**
* @brief 计算bitmap上的t_rect区域的平均分数
* @param box_array 模型预测的字符区域
* @param pred 模型预测二值图
* @return score
*/
float box_score_fast(std::vector<std::vector<float>> box_array,cv::Mat pred) ;
void visualize_boxes(const cv::Mat &srcimg,
const std::vector<std::vector<std::vector<int>>> &boxes) ;
bool text_recognition(const cv::Mat &srcimg, private:
const std::vector<std::vector<std::vector<int>>> &boxes); ...
}; };
...@@ -358,119 +191,84 @@ class ppOcrEngine { ...@@ -358,119 +191,84 @@ class ppOcrEngine {
## 推理 ## 推理
### 字符检测模型推理 - 字符检测
- 字符识别、解码
- 字符框可视化
- OCR结果可视化
```c++ ```c++
bool OcrDet::forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes) std::vector<std::string> ppOcrEngine::forward(cv::Mat &srcimg){
{ std::vector<std::vector<std::vector<int>>> text_roi_boxes;
std::vector<std::vector<std::vector<int>>> boxes;
//输入数据预处理 std::vector<std::string> text_vec;
cv::Size ratio = preproc(img,data); auto start = std::chrono::high_resolution_clock::now();
/* //字符区域检测
支持两种数据拷贝方式:offload_copy=true、offload_copy=false。当offload_copy为true时,不需要进行内存拷贝,如果为false,需要先预分配输入输出的设备内存,并在推理前,将预处理数据拷贝到设备内存,推理后将模型输出从设备内存中拷贝出来,在做后处理。 text_detector->forward(srcimg,text_roi_boxes);
*/ if(text_roi_boxes.size() == 0)
if( this->offload_copy ==false )
{ {
hipMemcpy(input_buffer_device, std::cout<<"Not found text roi !\n";
(void*)data, return std::vector<std::string>();
this->input_shape.bytes(),
hipMemcpyHostToDevice);
std::vector<migraphx::argument> results = net.eval(dev_argument);
hipMemcpy(output_buffer_host,
(void*)output_buffer_device,
output_shape.bytes(),
hipMemcpyDeviceToHost);
postprocess((float *)output_buffer_host,boxes);
std::cout<<"copy mode ..."<<std::endl;
}else{
std::unordered_map<std::string, migraphx::argument> inputData;
inputData[input_name] = migraphx::argument{input_shape, (float *)data};
std::vector<migraphx::argument> results = net.eval(inputData);
migraphx::argument result = results[0] ; //get output data
postprocess((float *)result.data(),boxes);
std::cout<<"offload copy mode ..."<<std::endl;
} }
//计算等比缩放比例 std::vector<cv::Point> points;
float ratio_w = float(net_input_width) / float(img.cols); //字符识别+编码
float ratio_h = float(net_input_height) / float(img.rows); for (int n = 0; n < text_roi_boxes.size(); n++) {
//过滤无效框
text_roi_boxes = filter_det_res(boxes, ratio_h, ratio_w, img); cv::Rect rect;
//可视化检测结果 cv::Mat text_roi_mat;
visualize_boxes(img,text_roi_boxes); rect.x = text_roi_boxes[n][0][0];
// TextRecognition(img,boxes); rect.y = text_roi_boxes[n][0][1];
return true; rect.width = text_roi_boxes[n][2][0] - text_roi_boxes[n][0][0];
} rect.height = text_roi_boxes[n][2][1] - text_roi_boxes[n][0][1];
if(rect.width <3 || rect.height<3)
{
``` continue;
### 字符识别推理 }
```c++ text_roi_mat = srcimg(rect).clone();
std::string CTCDecode::forward(cv::Mat& img)
{
//预处理
preproc(img,data,net_input_width,net_input_height);
/*
支持两种数据拷贝方式:offload_copy=true、offload_copy=false。当offload_copy为true时,不需要进行内存拷贝,如果为false,需要先预分配输入输出的设备内存,并在推理前,将预处理数据拷贝到设备内存,推理后将模型输出从设备内存中拷贝出来,在做后处理。
*/
if( this->offload_copy ==false ) std::string text = text_recognizer->forward(text_roi_mat);
{ text_vec.push_back(text);
hipMemcpy(input_buffer_device, points.push_back(cv::Point(rect.x,rect.y));
(void*)data, }
this->input_shape.bytes(), auto end = std::chrono::high_resolution_clock::now();
hipMemcpyHostToDevice); auto duration_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
std::cout<<"[Time info] elapsed: "<< duration_ms.count() <<" ms\n";
//字符框可视化
visualize_boxes(srcimg,text_roi_boxes);
//OCR可视化
cv::Mat res_img = visualize_text(text_vec,points, srcimg);
...
}
std::vector<migraphx::argument> results = net.eval(dev_argument);
hipMemcpy(output_buffer_host,
(void*)output_buffer_device,
output_shape.bytes(),
hipMemcpyDeviceToHost);
//模型后处理,获取字符的最大概率和索引,并根据索引在字符库中查找对应的字符,然后合成一个句子
std::string text = postprocess((float *)output_buffer_device);
return text;
}else{
std::unordered_map<std::string, migraphx::argument> inputData;
inputData[input_name] = migraphx::argument{input_shape, (float *)data};
std::vector<migraphx::argument> results = net.eval(inputData);
migraphx::argument result = results[0] ;
std::string text = postprocess((float *)result.data());
// std::cout<<"ctc: offload copy mode ..."<<std::endl;
return text;
}
}
``` ```
# Ocrv5 API调用说明 # Ocrv5 API调用说明
API调用步骤如下: API调用步骤如下:
- 类实例化 - 类实例化
- 读取测试图片
- 识别接口调用 - 识别接口调用
例: 例:
```c++ ```c++
int main(int argc, char** argv) int main(int argc, char** argv){
{
std::string det_model_onnx = "../Resource/Models/ppocrv5_server_det_infer.onnx"; std::string det_model_onnx = "../Resource/Models/ppocrv5_server_det_infer.onnx";
std::string rec_model_onnx = "../Resource/Models/ppocrv5_server_rec_infer.onnx"; std::string rec_model_onnx = "../Resource/Models/ppocrv5_server_rec_infer.onnx";
std::string img_path = "../Resource/Images/20250703205038.png"; std::string img_path = "../Resource/Images/demo.png";
std::string character_dict_path = "../Resource/ppocr_keys_v5.txt"; std::string character_dict_path = "../Resource/ppocr_keys_v5.txt";
std::string front = "../Resource/fonts/SimHei.ttf";
float segm_thres=0.3; float segm_thres=0.3;
float box_thresh=0.3; float box_thresh=0.3;
ppOcrEngine ocr_engine(det_model_onnx, ppOcrEngine ocr_engine(det_model_onnx,
rec_model_onnx, rec_model_onnx,
character_dict_path, character_dict_path,
front,
segm_thres, segm_thres,
box_thresh, box_thresh,
true, true,
"fp32"); "fp16");
cv::Mat img=cv::imread(img_path); cv::Mat img=cv::imread(img_path);
ocr_engine.forward(img); ocr_engine.forward(img);
return 0; return 0;
} }
``` ```
sample支持两种精度推理(fp32和fp16),默认是fp32),精度和内存拷贝方式分别通过ocr_engine的构造函数传入参数来设置。 sample支持两种精度推理(fp32和fp16),默认是fp16),精度和内存拷贝方式分别通过ocr_engine的构造函数传入参数来设置。
\ No newline at end of file \ No newline at end of file
...@@ -4,10 +4,14 @@ PP-OCRv5 是PP-OCR新一代文字识别解决方案,该方案聚焦于多场 ...@@ -4,10 +4,14 @@ PP-OCRv5 是PP-OCR新一代文字识别解决方案,该方案聚焦于多场
## 模型简介 ## 模型简介
### 文本检测 ### 文本检测
文本检测使用了dbnet(论文地址:https://arxiv.org/pdf/1911.08947),网络结构:![alt text](Images/DBNet.png),模型输出概率图,并用Vatti Clipping算法对字符区域多边形简化处理。 sample中使用动态shape(N,3,H,C),最大输入shape是[1,3,640,640],模型地址:Resource/Models/ppocrv5_server_det_infer.onnx 文本检测使用了dbnet( 论文地址:https://arxiv.org/pdf/1911.08947 ),网络结构:
![alt text](Images/DBNet.png)
模型输出概率图,并用Vatti Clipping算法对字符区域多边形简化处理,sample中借助Clipping 库。 sample中模型输入shape为[1,3,640,640],模型路径:Resource/Models/ppocrv5_server_det_infer.onnx
### 文本识别 ### 文本识别
文本识别使用了CRNN+CTCDecode(https://arxiv.org/pdf/2009.09941),网络结构:![alt text](Images/CRNN.png),sample中使用了动态shape (N,3,48,W),最大输入shape是[1,3,48,720],模型地址:Resource/Models/ppocrv5_server_rec_infer.onnx 文本识别使用了CRNN+CTCDecode( https://arxiv.org/pdf/2009.09941 ),网络结构:
![(Images/CRNN.png)](Images/CRNN.png)
sample中模型输入shape为[1,3,48,720],模型路径:Resource/Models/ppocrv5_server_rec_infer.onnx
## 预处理 ## 预处理
### 检测模型预处理 ### 检测模型预处理
...@@ -80,28 +84,20 @@ def preprocess(self, img, max_wh_ratio): ...@@ -80,28 +84,20 @@ def preprocess(self, img, max_wh_ratio):
imgH, imgW = self.rec_input_size imgH, imgW = self.rec_input_size
max_h,max_w = self.rec_input_size max_h,max_w = self.rec_input_size
h, w = img.shape[:2] h, w = img.shape[:2]
# re_size = (max_w,max_h)
#保留H的原始维度 #保留H的原始维度
if h <= max_h: if h <= max_h:
ratio = max_h / h ratio = max_h / h
w = int(w*ratio) w = int(w*ratio)
if w <= max_w: if w <= max_w:
re_size =(w,max_h) re_size =(w,max_h)
else: else:
re_size = (max_w,max_h) re_size = (max_w,max_h)
else: else:
ratio = max_h/h ratio = max_h/h
w,h = int(w*ratio),max_h w,h = int(w*ratio),max_h
if w <= max_w: if w <= max_w:
re_size = (w,h) re_size = (w,h)
else: else:
re_size = (max_w,h) re_size = (max_w,h)
...@@ -112,12 +108,9 @@ def preprocess(self, img, max_wh_ratio): ...@@ -112,12 +108,9 @@ def preprocess(self, img, max_wh_ratio):
resized_image = resized_image.transpose((2, 0, 1)) / 255 resized_image = resized_image.transpose((2, 0, 1)) / 255
resized_image -= 0.5 resized_image -= 0.5
resized_image /= 0.5 resized_image /= 0.5
#填充,沿着右、下填充 #填充,沿着右、下填充
padding_im = np.zeros((3, imgH, imgW), dtype=np.float32) padding_im = np.zeros((3, imgH, imgW), dtype=np.float32)
padding_im[:, :, 0:re_size[0]] = resized_image padding_im[:, :, 0:re_size[0]] = resized_image
return padding_im return padding_im
``` ```
## 类介绍 ## 类介绍
...@@ -154,7 +147,7 @@ class PPOcrV5(): ...@@ -154,7 +147,7 @@ class PPOcrV5():
**kwargs :设置字符检测模型后处理相关参数 **kwargs :设置字符检测模型后处理相关参数
Returns: Returns:
return_type: NONE。 return_type: 无返回值
Examples: Examples:
det_onnx_path = "PATH/TO/det_onnx_model.onnx" det_onnx_path = "PATH/TO/det_onnx_model.onnx"
...@@ -198,7 +191,7 @@ class TextDetector(object): ...@@ -198,7 +191,7 @@ class TextDetector(object):
**kwargs :设置字符检测模型后处理相关参数 **kwargs :设置字符检测模型后处理相关参数
Returns: Returns:
return_type: NONE return_type: 无返回值
Examples: Examples:
self.db_detector = TextDetector( self.db_detector = TextDetector(
...@@ -216,7 +209,6 @@ class TextDetector(object): ...@@ -216,7 +209,6 @@ class TextDetector(object):
""" """
class TextRecgnizer(object): class TextRecgnizer(object):
"""Support SVTR_LCNet """
def __init__( def __init__(
self, self,
rec_model_path, rec_model_path,
...@@ -240,7 +232,7 @@ class TextRecgnizer(object): ...@@ -240,7 +232,7 @@ class TextRecgnizer(object):
**kwargs :设置字符识别模型后处理相关参数 **kwargs :设置字符识别模型后处理相关参数
Returns: Returns:
return_type: NONE return_type: 无返回值
Examples: Examples:
self.text_extractor = TextRecgnizer(rec_model_path=rec_model_path, self.text_extractor = TextRecgnizer(rec_model_path=rec_model_path,
...@@ -252,18 +244,15 @@ class TextRecgnizer(object): ...@@ -252,18 +244,15 @@ class TextRecgnizer(object):
class BaseRecLabelDecode(object): class BaseRecLabelDecode(object):
def __init__(self, character_dict_path=None, def __init__(self, character_dict_path=None,
use_space_char=False) use_space_char=False)
"""Convert between text-label and text-index """
字符识别(crnn+ctc)。 字符识别(crnn+ctc)。
Args: Args:
character_dict_path :字符集文件路径。 character_dict_path :字符集文件路径。
use_space_char :字符集中是否包含空格。 use_space_char :字符集中是否包含空格。
Returns: Returns:
return_type: NONE。 return_type: 无返回值。
Examples:
Examples:
""" """
class CTCLabelDecode(BaseRecLabelDecode): class CTCLabelDecode(BaseRecLabelDecode):
...@@ -277,140 +266,28 @@ class TextRecgnizer(object): ...@@ -277,140 +266,28 @@ class TextRecgnizer(object):
character_dict_path :字符集文件路径。 character_dict_path :字符集文件路径。
use_space_char :字符集中是否包含空格。 use_space_char :字符集中是否包含空格。
Returns: Returns:
return_type: NONE return_type: 无返回值
Examples: Examples:
""" """
``` ```
## 推理 ## 推理
### 字符检测模型推理
```python
def __call__(self, src_img):
data = self.preprocess(src_img)
"""支持两种数据拷贝方式:offload_copy=true、offload_copy=false。当offload_copy为true时,不需要进行内存拷贝,如果为false,需要先预分配输入输出的设备内存,并在推理前,将预处理数据拷贝到设备内存,推理后将模型输出从设备内存中拷贝出来,在做后处理。"""
if self.offload_copy==False:
self.d_mem[self.det_input_name] =migraphx.to_gpu(migraphx.argument(data["image"]))
results = self.db_model.run(self.d_mem)
else:
results = self.db_model.run({self.det_input_name:data["image"]})
if self.offload_copy==False :
#从gpu拷贝推理结果到cpu
result=migraphx.from_gpu(results[0])
print("offload copy model")
result = np.array(result)
else:
result = results[0]
shape_list = np.expand_dims(data["shape"], axis=0)
pred = np.array(result)
pred = pred[:, 0, :, :]
#获取大于阈值的概率
segmentation = pred > self.thresh
boxes_batch = []
for batch_index in range(pred.shape[0]):
src_h, src_w, ratio_h, ratio_w = shape_list[batch_index]
if self.dilation_kernel is not None:
mask = cv2.dilate(
np.array(segmentation[batch_index]).astype(np.uint8),
self.dilation_kernel,
)
else:
mask = segmentation[batch_index]
#根据预测的bitmap获取文本区域
if self.box_type == "poly":
boxes, scores = self.polygons_from_bitmap(
pred[batch_index], mask, ratio_w,ratio_h, src_w, src_h
)
elif self.box_type == "quad":
boxes, scores = self.boxes_from_bitmap(
pred[batch_index], mask, ratio_w,ratio_h, src_w, src_h
)
else:
raise ValueError("box_type can only be one of ['quad', 'poly']")
boxes_batch.append(boxes)
#文本区域按照从上到下,从左到右的顺序排序
det_box_batch = self.sorted_boxes(boxes_batch)
#文本区域按坐标映射到原始图像
dt_boxes,det_rects = self.box_standardization(det_box_batch,shape_list)
return dt_boxes,det_rects
```
### 字符识别推理
```python ```python
def __call__(self, batch_img_list): def __call__(self, src_img):
if len(batch_img_list) == 0: import time
return [] start = time.time()
width_list = [] #字符检测
#遍历图片列表(字符roi存放在图片列表中),为了支持多batch推理,这里还会将batch_size张图片进行拼接np.concatenate(batch_norm_imgs) dt_boxs,dt_rects = self.db_detector(src_img)
for b in range(len(batch_img_list)): res_img = self.vis_boxes(dt_boxs,src_img)
for img in batch_img_list[b]: #字符区域图片裁剪
width_list.append(img.shape[1] / float(img.shape[0])) batch_img_list = self.detection_roi_crop(src_img,dt_rects)
#字符特征提取
indices = np.argsort(np.array(width_list)) batch_outputs_pre ,batch_max_wh_ratio_pre = self.text_extractor(batch_img_list)
#字符编码
input_batch = self.rec_batch_num batch_text_list, batch_label_list = self.ctc_decoder(batch_outputs_pre,return_word_box=False,wh_ratio_list = batch_max_wh_ratio_pre)
batch_outputs_pre = []
batch_max_wh_ratio_pre = []
for b in range(len(batch_img_list)):
im_count = len(batch_img_list[b])
batch_outputs = []
batch_max_wh_ratio = []
for beg_img_no in range(0, im_count, input_batch):
end_img_no = min(im_count, beg_img_no + input_batch)
# for ino in range(beg_img_no, end_img_no):
# h, w = batch_img_list[b][indices[ino]].shape[0:2]
# wh_ratio = w * 1.0 / h
# max_wh_ratio = max(max_wh_ratio, wh_ratio)
batch_norm_imgs = []
max_wh_ratio = list()
# N batch
for ino in range(beg_img_no, end_img_no):
#单张图片预处理
norm_img = self.preprocess(batch_img_list[b][indices[ino]], max_wh_ratio)
norm_img = norm_img[np.newaxis, :].astype(np.float32)
batch_norm_imgs.append(norm_img)
batch_max_wh_ratio.append(max_wh_ratio)
#batch_size张图片进行拼接
if self.rec_batch_num >1:
norm_img_batch = np.concatenate(batch_norm_imgs)
norm_img_batch = norm_img_batch.copy()
else:
norm_img_batch = np.array([batch_norm_imgs.copy()])
if self.offload_copy==False:
print("offload copy model")
self.d_mem[self.rec_input_name] =migraphx.to_gpu(migraphx.argument(norm_img_batch))
results = self.rec_model.run(self.d_mem)
output = np.array(results[0])
else:
results = self.rec_model.run({self.rec_input_name:norm_img_batch})
output = results[0]
# batch_outputs.append(np.array(output))
#将所有batch的输出结果append到batch_outputs中方便后处理
[batch_outputs.append(out) for out in np.array(output)]
batch_outputs_pre.append(np.array(batch_outputs))
batch_max_wh_ratio_pre.append(batch_max_wh_ratio)
return batch_outputs_pre ,batch_max_wh_ratio_pre
``` ```
# Ocrv5 API调用说明 # Ocrv5 API调用说明
API调用步骤如下: API调用步骤如下:
...@@ -425,8 +302,8 @@ if __name__ == '__main__': ...@@ -425,8 +302,8 @@ if __name__ == '__main__':
rec_onnx_path = "../Resource/Models/ppocrv5_server_rec_infer.onnx" rec_onnx_path = "../Resource/Models/ppocrv5_server_rec_infer.onnx"
image_path = "../Resource/Images/lite_demo.png" image_path = "../Resource/Images/lite_demo.png"
img = cv2.imread(image_path) img = cv2.imread(image_path)
ppocrv5 = PPOcrV5(det_onnx_path,rec_onnx_path,offload_copy=True,precision_mode="fp32") ppocrv5 = PPOcrV5(det_onnx_path,rec_onnx_path,offload_copy=True,precision_mode="fp16")
res_img = ppocrv5(img) res_img = ppocrv5(img)
cv2.imwrite("res.jpg",res_img) cv2.imwrite("res.jpg",res_img)
``` ```
sample支持两种精度推理(fp32和fp16),默认是fp32),精度和内存拷贝方式分别通过precision_mode和offload_copy参数控制。 sample支持两种精度推理(fp32和fp16),默认是fp16),精度和内存拷贝方式分别通过precision_mode和offload_copy参数控制。
\ No newline at end of file \ No newline at end of file
This diff is collapsed.
...@@ -81,6 +81,7 @@ pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple ...@@ -81,6 +81,7 @@ pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
``` ```
python inference.py python inference.py
``` ```
结果图片保存在当前目录下:res.jpg
offload_copy和precision_mode设置可参考[Tutorial_Python.md](Doc/Tutorial_Python.md),在main中示例。 offload_copy和precision_mode设置可参考[Tutorial_Python.md](Doc/Tutorial_Python.md),在main中示例。
### C++版本推理 ### C++版本推理
...@@ -104,7 +105,6 @@ cd <path_to_ppocrv5_migraphx> ...@@ -104,7 +105,6 @@ cd <path_to_ppocrv5_migraphx>
sh ./3rdParty/InstallOpenCVDependences.sh sh ./3rdParty/InstallOpenCVDependences.sh
``` ```
#### 安装OpenCV并构建工程 #### 安装OpenCV并构建工程
``` ```
...@@ -116,30 +116,9 @@ rbuild build -d depend ...@@ -116,30 +116,9 @@ rbuild build -d depend
- 进入到opencv-3.4.11_mini目录下创建build目录,cd build - 进入到opencv-3.4.11_mini目录下创建build目录,cd build
- 执行以下命令: - 执行以下命令:
``` ```
cmake -D CMAKE_BUILD_TYPE=RELEASE -D CMAKE_INSTALL_PREFIX=./opencv_dep -D INSTALL_C_EXAMPLES=ON -D INSTALL_PYTHON_EXAMPLES=ON -D OPENCV_GENERATE_PKGCONFIG=ON -D BUILD_EXAMPLES=ON -D OPENCV_EXTRA_MODULES_PATH=../modules/ .. cmake -D CMAKE_BUILD_TYPE=RELEASE -D CMAKE_INSTALL_PREFIX=./opencv_dep -D INSTALL_C_EXAMPLES=ON -D INSTALL_PYTHON_EXAMPLES=ON -D OPENCV_GENERATE_PKGCONFIG=ON -D BUILD_EXAMPLES=ON -D OPENCV_EXTRA_MODULES_PATH=../modules/ ..
``` ```
- 执行make -j8 && make install,编译的头文件和库目录存放在opencv_dep,将opencv_dep目录拷贝到3rdParty,并命名为opencv - 执行make -j8 && make install,编译的头文件和库目录存放在opencv_dep,将opencv_dep目录拷贝到3rdParty,并命名为opencv
#### 设置环境变量
将依赖库依赖加入环境变量LD_LIBRARY_PATH,在~/.bashrc中添加如下语句:
当操作系统是ubuntu系统时:
```
export LD_LIBRARY_PATH=<path_to_ppocrv5_migraphx>/depend/lib/:$LD_LIBRARY_PATH
```
当操作系统是centos系统时:
```
export LD_LIBRARY_PATH=<path_to_ppocrv5_migraphx>/depend/lib64/:$LD_LIBRARY_PATH
```
然后执行:
```
source ~/.bashrc
```
#### 运行示例 #### 运行示例
...@@ -155,58 +134,112 @@ cmake .. && make ...@@ -155,58 +134,112 @@ cmake .. && make
#运行 #运行
./ppOcrV5cd ./ppOcrV5cd
``` ```
结果图片保存在当前目录下:res.jpg
## result ## result
### Python版本 ### Python版本
输出结果中,每个值分别对应每个label的实际概率 输出结果中展示了识别到的字符,每个字符后面跟着一个置信度,置信度值越大,识别结果越准确
``` ```
产品信息/参数, 0.954 '0', 0.991
发足够的滋养, 1.000 纯臻营养护发素, 1.000
纯臻宫乔护发素, 0.883 '0'.'9''9''3''6''0''4', 0.999
花费了'0'.'4''5''7''3''3''5'秒, 0.993 '1', 0.998
【净含量】:'2''2''0'ml, 0.993 产品信息/参数, 0.934
'0'.'9''9''2''7''2''8', 0.999
'2', 0.999
('4''5'元/每公斤,'1''0''0'公斤起订), 0.970
'0'.'9''7''4''1''7', 0.999
'3', 0.999
每瓶'2''2'元,'1''0''0''0'瓶起订), 0.998 每瓶'2''2'元,'1''0''0''0'瓶起订), 0.998
【品名】:纯臻营养护发素, 0.998 '0'.'9''9''3''9''7''6', 0.999
【品牌】:代加工方式/'0'EMODM, 0.968 '4', 0.998
糖、椰油酰胺丙基甜菜碱、泛醒, 0.997 【品牌】:代加工方式/'0'EMODM, 0.959
【适用人群】:适合所有肤质, 0.998 '0'.'9''8''5''1''3''3', 0.998
【产品编号】:YM'-'X'-''3''0''1''1''0'.'9''6''8''9''9', 0.993 '5', 0.998
('4''5'元/每公斤,'1''0''0'公斤起订), 0.972 【品名】:纯臻营养护发素, 0.997
【主要成分】:鲸蜡硬脂醇、燕麦B'-'葡聚, 0.966 '0'.'9''9''5''0''0''7', 0.999
【主要功能】:可紧致头发磷层,从而达到, 0.994 '6', 0.995
即时持久改善头发光泽的效果,给干燥的头, 0.997 【产品编号】:YM'-'X'-''3''0''1''1''0'.'9''6''8''9''9', 0.973
The detectionvisualizedimagsavedin./vis.jpg, 0.940 '7', 0.999
[Time info] elapsed:3.5736 【净含量】:'2''2''0'ml, 0.994
'0'.'9''9''6''5''7''7', 0.999
'8', 0.998
【适用人群】:适合所有肤质, 0.997
'0'.'9''9''5''8''4''2', 0.999
'9', 0.997
【主要成分】:鲸蜡硬脂醇、燕麦B'-'葡聚, 0.976
'0'.'9''6''1''9''2''8', 0.999
'1''0', 1.000
糖、椰油酰胺丙基甜菜碱、泛醒, 0.996
'0'.'9''2''5''8''9''8', 0.999
'1''1', 0.999
(成品包材), 0.998
'0'.'9''7''2''5''7''3', 0.999
'1''2', 1.000
【主要功能】:可紧致头发磷层,从而达到, 0.992
'0'.'9''9''4''4''4''8', 0.999
'1''3', 0.999
即时持久改善头发光泽的效果,给干燥的头, 0.989
'0'.'9''9''0''1''9''8', 0.999
'1''4', 0.999
发足够的滋养, 0.999
'0'.'9''9''7''6''6''8', 0.999
花费了'0'.'4''5''7''3''3''5'秒, 0.993
[Time info] elapsed:578.6152 ms
``` ```
### C++版本 ### C++版本
``` ```
ocr res :[生成一幅画,负向提示词为:画中不要出现人物。正负提示词结合会] ocr res :花费了'0'.'4''5''7''3''3''5'秒 0.984009
ocr res :[Text_encode_'2'.副文本编码器,补充描述性细节(如材质、光照、] ocr res :'0'.'9''9''7' 0.773633
ocr res :[图片的准确性,过滤掉不需要的元素,例如正向提示词为:提示模型] ocr res :发足够的滋养 0.96818
ocr res :[编码器特征融合提升模型的理解能力。] ocr res :'1' 0.697754
ocr res :[正负 prompt 设置:正向 prompt 和负向 prompt 结合可以提升生成] ocr res :'0''0'.'9''9''0''1''9' 0.656647
ocr res :[语义表示捕获提示词的基础含义和全局语境(如对象、动作),与副] ocr res :即时持久改善头发光泽的效果,给干燥的头 0.996608
ocr res :[的图像不会发生变化,随机种子可以增加生成图像的多样性。] ocr res : 0
ocr res :[Text_encode.主文本编码器,将prompt序列转换为一个综合的] ocr res :【主要功能】:可紧致头发磷层,从而达到 0.993421
ocr res :[响初始噪声和生成结果的确定性,固定种子后,同一个prompt生成] ocr res :'0'.'9''9''4''4' 0.677327
ocr res :[声转化为目标图像。] ocr res : 0
ocr res :[随机数设置:随机数种子是控制生成过程随机性的关键参数,直接影] ocr res :'0'.'9''7''2' 0.637158
ocr res :[Scheduler:调度器,控制图像生成,决定了如何逐步将随机噪] ocr res :(成品包材) 0.901937
ocr res :[程和图像生成过程中有着至关重要的作用。] ocr res :'1' 0.32251
ocr res :[在stable'-'dffusion'-'xl'-'base'-''1'.'0'模型中主要包含一下子组件:] ocr res :糖、椰油酰胺丙基甜菜碱、泛醒 0.993478
ocr res :[Pipeline的配置参数控制图像生成的质量和速度,在扩散模型预测过] ocr res :'0'.'9''2''5' 0.586279
ocr res :[具配置文件中的定义手动加载各个子组件。] ocr res :'1''0' 0.547241
ocr res :[这里使用了扩散模型加载器统一加载了所有的子组件,也可以更] ocr res :【主要成分】:鲸蜡硬脂醇、燕麦B'-'葡聚 0.975303
ocr res :[·'2'.'3'pipeline 配置] ocr res :'0'.'9''1''9' 0.568408
Time taken by task: 3475 ms ocr res : 0
ocr res :'0'.'9''9''5''2' 0.613647
ocr res :【适用人群】:适合所有肤质 0.996882
ocr res :'8' 0.378906
ocr res :'0'.'9''9' 0.595581
ocr res :【净含量】:'2''2''0'ml 0.835671
ocr res :'7' 0.356689
ocr res :【产品编号】:YM'-'X'-''3''0''1''1''0'.'9''6''8''9''9' 0.993695
ocr res :'6' 0.214355
ocr res :'0'.'9''9''5' 0.478052
ocr res :【品名】:纯臻营养护发素 0.996175
ocr res :'5' 0.594727
ocr res : 0
ocr res :'0'.'9''8''5' 0.55166
ocr res :【品牌】:代加工方式/'0'EMODM 0.917768
ocr res :每瓶'2''2'元,'1''0''0''0'瓶起订) 0.974644
ocr res :'0'.'9''9''3''9''7''6' 0.736755
ocr res :'3' 0.486572
ocr res :('4''5'元/每公斤,'1''0''0'公斤起订) 0.940028
ocr res :'0'.'9'm'7' 0.534668
ocr res :'2' 0.961426
ocr res : 0
ocr res :'0'.'9''9''2' 0.524121
ocr res :产品信息/参数 0.913853
ocr res :纯臻营养护发素'0'.'9''9''3''6''0''4' 0.964128
ocr res :'0' 0.380127
ocr res :The detection visualized imagesavedin./vis.jpg 0.94302
[Time info] elapsed: 389 ms
``` ```
### 精度 ### 精度
......
#include "cv_put_Text.hpp"
// Loads the FreeType library and the font face used by putText().
//   font_path: path to a TrueType font file (e.g. SimHei.ttf for CJK glyphs).
// On any failure the process exits, so a fully constructed PutText always
// holds valid `ft` and `face` handles and the destructor can release them.
PutText::PutText(const char* font_path) {
    // Initialize the FreeType library.
    if (FT_Init_FreeType(&ft)) {
        std::cerr << "Error: Could not init FreeType !" << std::endl;
        // Bug fix: the original `return`ed here, leaving `ft` and `face`
        // uninitialized; the destructor would then call FT_Done_* on
        // indeterminate handles (undefined behavior). Exit instead, which
        // is consistent with the font-loading failure path below.
        exit(-1);
    }
    // Load face index 0 from the font file.
    if (FT_New_Face(ft, font_path, 0, &face)) {
        // Bug fix: error string said "front" instead of "font".
        std::cerr << "Error: Load font failed!" << std::endl;
        exit(-1);
    }
}
PutText::~PutText() {
    // Release FreeType resources. Guard each handle: the constructor may
    // have returned early without creating the face (or the library).
    if (face) {
        FT_Done_Face(face);
    }
    if (ft) {
        FT_Done_FreeType(ft);
    }
}
void PutText::putText(cv::Mat& img, const std::string& text, int x, int y, int fontSize, cv::Scalar color) {
    // Overlay `text` (UTF-8, Chinese supported) onto `img` starting at
    // pixel (x, y), alpha-blending each glyph's 8-bit coverage bitmap
    // into the BGR image using `color`.
    if (img.empty()) {
        std::cerr << "Empty image!";
        return;
    }
    if (!face) {
        // Font face never loaded (constructor failed) — nothing to draw with.
        std::cerr << "Error: No font face loaded!" << std::endl;
        return;
    }
    // Set the glyph size in pixels.
    FT_Set_Pixel_Sizes(face, 0, fontSize);
    int pen_x = x;
    int pen_y = y + fontSize; // shift the baseline so (x, y) is the text's top edge
    const size_t len = text.size();
    // Walk the string one UTF-8 code point at a time.
    for (size_t i = 0; i < len; ) {
        // Decode a 1..3 byte UTF-8 sequence; malformed or truncated
        // sequences (including 4-byte ones) are skipped byte-by-byte.
        const unsigned char lead = static_cast<unsigned char>(text[i]);
        unsigned long unicode = 0;
        if (lead < 0x80) {
            unicode = lead;
            i += 1;
        } else if ((lead & 0xE0) == 0xC0 && i + 1 < len) {
            unicode = ((lead & 0x1FUL) << 6)
                    | (static_cast<unsigned char>(text[i + 1]) & 0x3F);
            i += 2;
        } else if ((lead & 0xF0) == 0xE0 && i + 2 < len) {
            unicode = ((lead & 0x0FUL) << 12)
                    | ((static_cast<unsigned char>(text[i + 1]) & 0x3FUL) << 6)
                    | (static_cast<unsigned char>(text[i + 2]) & 0x3F);
            i += 3;
        } else {
            i++; // invalid or truncated UTF-8 — skip the lead byte, don't overread
            continue;
        }
        // Render the glyph for this code point.
        if (FT_Load_Char(face, unicode, FT_LOAD_RENDER)) {
            std::cerr << "Error: Could not load glyph" << std::endl;
            continue;
        }
        // Blend the coverage bitmap into the image. Rows are addressed with
        // `pitch` (bytes per row), which may differ from `width` when the
        // renderer pads rows — indexing with `width` garbles glyphs.
        const FT_Bitmap& bitmap = face->glyph->bitmap;
        for (unsigned int row = 0; row < bitmap.rows; ++row) {
            for (unsigned int col = 0; col < bitmap.width; ++col) {
                const unsigned char intensity =
                    bitmap.buffer[static_cast<int>(row) * bitmap.pitch + static_cast<int>(col)];
                if (intensity == 0) {
                    continue;
                }
                const int py = pen_y - face->glyph->bitmap_top + static_cast<int>(row);
                const int px = pen_x + face->glyph->bitmap_left + static_cast<int>(col);
                // Clip glyphs that fall partially outside the image —
                // unchecked img.at<> here is out-of-bounds UB.
                if (py < 0 || py >= img.rows || px < 0 || px >= img.cols) {
                    continue;
                }
                const float alpha = intensity / 255.0f;
                cv::Vec3b& pixel = img.at<cv::Vec3b>(py, px);
                pixel[0] = cv::saturate_cast<uchar>(color[0] * alpha + pixel[0] * (1.0f - alpha));
                pixel[1] = cv::saturate_cast<uchar>(color[1] * alpha + pixel[1] * (1.0f - alpha));
                pixel[2] = cv::saturate_cast<uchar>(color[2] * alpha + pixel[2] * (1.0f - alpha));
            }
        }
        // Advance the pen; advance.x is in 1/64-pixel units.
        pen_x += face->glyph->advance.x >> 6;
    }
}
\ No newline at end of file
#pragma once
#include <ft2build.h>
#include FT_FREETYPE_H
#include <opencv2/opencv.hpp>
class PutText {
private:
    // Null-initialized so the destructor is safe even when the
    // constructor fails before creating these handles.
    FT_Library ft{nullptr};   // FreeType library handle
    FT_Face face{nullptr};    // loaded font face (null if loading failed)
public:
    /**
     * @brief Construct the renderer and load a TrueType font.
     * @param font_path path to a .ttf file (e.g. SimHei.ttf for Chinese)
     */
    PutText(const char* font_path);
    // NOTE(review): owns FreeType handles but is copyable — copying would
    // double-free in ~PutText; consider deleting copy operations.
    ~PutText();
    /**
     * @brief Draw text (UTF-8, Chinese supported) onto an image.
     * @param img image to draw on (modified in place)
     * @param text UTF-8 encoded string to overlay
     * @param x horizontal pixel coordinate where the text starts
     * @param y vertical pixel coordinate of the text's top edge
     * @param fontSize glyph height in pixels
     * @param color text color (BGR order), default green
     *
     * @return none
     */
    void putText(cv::Mat& img, const std::string& text, int x, int y, int fontSize=2, cv::Scalar color=cv::Scalar(0, 255, 0));
};
#include "ocr_engine.hpp" #include "ocr_engine.hpp"
using namespace ppocr; using namespace ppocr;
int main(int argc, char** argv) int main(int argc, char** argv){
{
std::string det_model_onnx = "../Resource/Models/ppocrv5_server_det_infer.onnx"; std::string det_model_onnx = "../Resource/Models/ppocrv5_server_det_infer.onnx";
std::string rec_model_onnx = "../Resource/Models/ppocrv5_server_rec_infer.onnx"; std::string rec_model_onnx = "../Resource/Models/ppocrv5_server_rec_infer.onnx";
std::string img_path = "../Resource/Images/20250703205038.png"; std::string img_path = "../Resource/Images/demo.png";
std::string character_dict_path = "../Resource/ppocr_keys_v5.txt"; std::string character_dict_path = "../Resource/ppocr_keys_v5.txt";
std::string front = "../Resource/fonts/SimHei.ttf";
float segm_thres=0.3; float segm_thres=0.3;
float box_thresh=0.3; float box_thresh=0.3;
ppOcrEngine ocr_engine(det_model_onnx, ppOcrEngine ocr_engine(det_model_onnx,
rec_model_onnx, rec_model_onnx,
character_dict_path, character_dict_path,
front,
segm_thres, segm_thres,
box_thresh, box_thresh,
true, true,
"fp32"); "fp16");
cv::Mat img=cv::imread(img_path); cv::Mat img=cv::imread(img_path);
ocr_engine.forward(img); ocr_engine.forward(img);
return 0; return 0;
} }
\ No newline at end of file
This diff is collapsed.
...@@ -10,15 +10,9 @@ ...@@ -10,15 +10,9 @@
#include "Filesystem.h" #include "Filesystem.h"
#include "SimpleLog.h" #include "SimpleLog.h"
#include "clipper.h" #include "clipper.h"
#include "cv_put_Text.hpp"
namespace ppocr{ namespace ppocr{
struct _TEXT_BOX
{
cv::Rect t_rect;
float score;
};
using T_BOX = struct _TEXT_BOX;
class CTCDecode class CTCDecode
{ {
private: private:
...@@ -38,7 +32,6 @@ namespace ppocr{ ...@@ -38,7 +32,6 @@ namespace ppocr{
void* output_buffer_device; void* output_buffer_device;
void* output_buffer_host; void* output_buffer_host;
migraphx::shape input_shape; migraphx::shape input_shape;
migraphx::shape output_shape; migraphx::shape output_shape;
std::string input_name; std::string input_name;
...@@ -51,7 +44,7 @@ namespace ppocr{ ...@@ -51,7 +44,7 @@ namespace ppocr{
public: public:
CTCDecode(std::string rec_model_path, CTCDecode(std::string rec_model_path,
std::string precision_mode="fp32", std::string precision_mode="fp16",
int image_width=480, int image_width=480,
int image_height=48, int image_height=48,
int channel=3, int channel=3,
...@@ -61,7 +54,9 @@ namespace ppocr{ ...@@ -61,7 +54,9 @@ namespace ppocr{
~CTCDecode(); ~CTCDecode();
/** /**
* @brief 字符识别编码,可支持,最长可支持预测90个字符,18385个字符 * @brief 字符识别、编码API 字符识别编码,可支持,最长可支持预测90个字符,18385个字符
* @param img 输入图片
* @return 编码后的字符串
*/ */
std::string forward(cv::Mat& img); std::string forward(cv::Mat& img);
...@@ -93,7 +88,6 @@ namespace ppocr{ ...@@ -93,7 +88,6 @@ namespace ppocr{
* @return 成功:text,失败:"" * @return 成功:text,失败:""
*/ */
std::string decode(std::vector<float>& probs,std::vector<int>& indexs,float& mean_prob); std::string decode(std::vector<float>& probs,std::vector<int>& indexs,float& mean_prob);
}; };
class OcrDet class OcrDet
...@@ -117,25 +111,33 @@ namespace ppocr{ ...@@ -117,25 +111,33 @@ namespace ppocr{
float* data; float* data;
//Allocate device buffer and host buffer,if offload_copy is false //当offload_copy为true时,分配设备内存
std::unordered_map<std::string, migraphx::argument> dev_argument; std::unordered_map<std::string, migraphx::argument> dev_argument;
void* input_buffer_device; void* input_buffer_device;
void* output_buffer_device; void* output_buffer_device;
void* output_buffer_host; void* output_buffer_host;
//postprocess //后处理
int n_channel; int n_channel;
int feature_size; //single channel feature map size. int feature_size; //单个通道的特征大小,例如模型输出[1,3,32,32],feature_size= 32x32.
int output_width; int output_width;
int output_height; int output_height;
int max_candidates;//maximun number of candidates contours. int max_candidates;//最大检测的候选区域.
public: public:
OcrDet(std::string det_model_path, OcrDet(std::string det_model_path,
std::string precision_mode="float32", std::string precision_mode="fp16",
bool offload_copy = true, bool offload_copy = true,
float segm_thres = 0.3, float segm_thres = 0.3,
float box_thresh = 0.7); float box_thresh = 0.7);
~OcrDet(); ~OcrDet();
/**
* @brief 字符检测模型推理API
* @param img 原始图片
* @param text_roi_boxes 字符区域坐标,格式:[[[tl.x, tl.y], [tr.x, tr.y],[], [br.x, br.y], [bl.x, bl.y]]]]
* | | | |
* 左上坐标 右上坐标 右下坐标 左下坐标
* @return 成功返回true,失败返回false
*/
bool forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes); bool forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes);
private: private:
...@@ -159,17 +161,18 @@ namespace ppocr{ ...@@ -159,17 +161,18 @@ namespace ppocr{
*/ */
int postprocess(float* feature, std::vector<std::vector<std::vector<int>>> &boxes); int postprocess(float* feature, std::vector<std::vector<std::vector<int>>> &boxes);
/**
* @brief 后处理,文本区域提取
* @param pred 二值图(这里字符检测使用了dbnet分割字符区域,二值图对应了文本区域)
int boxes_from_bitmap(cv::Mat& bit_map,std::vector<T_BOX>& box); * @param bitmap 二值图(pred做形态学运算输出bitmap,结合pred结算平均边框得分)
* @return 成功:0,失败:-1
*/
std::vector<std::vector<std::vector<int>>>boxes_from_bitmap( std::vector<std::vector<std::vector<int>>>boxes_from_bitmap(
const cv::Mat pred, const cv::Mat bitmap, const float &box_thresh, const cv::Mat pred, const cv::Mat bitmap, const float &box_thresh,
const float &det_db_unclip_ratio, const bool &use_polygon_score); const float &det_db_unclip_ratio, const bool &use_polygon_score);
std::vector<std::vector<float>> Mat2Vector(cv::Mat mat); std::vector<std::vector<float>> Mat2Vector(cv::Mat mat);
/** /**
* @brief 统计多边形区域的平均得分 * @brief 统计多边形区域的平均得分
* @param contour 字符区域的轮廓点集合 * @param contour 字符区域的轮廓点集合
...@@ -238,9 +241,6 @@ namespace ppocr{ ...@@ -238,9 +241,6 @@ namespace ppocr{
*/ */
float box_score_fast(std::vector<std::vector<float>> box_array,cv::Mat pred) ; float box_score_fast(std::vector<std::vector<float>> box_array,cv::Mat pred) ;
void visualize_boxes(const cv::Mat &srcimg,
const std::vector<std::vector<std::vector<int>>> &boxes) ;
bool text_recognition(const cv::Mat &srcimg, bool text_recognition(const cv::Mat &srcimg,
const std::vector<std::vector<std::vector<int>>> &boxes); const std::vector<std::vector<std::vector<int>>> &boxes);
...@@ -250,16 +250,21 @@ namespace ppocr{ ...@@ -250,16 +250,21 @@ namespace ppocr{
private: private:
std::shared_ptr<OcrDet> text_detector; std::shared_ptr<OcrDet> text_detector;
std::shared_ptr<CTCDecode> text_recognizer; std::shared_ptr<CTCDecode> text_recognizer;
std::shared_ptr<PutText> ft2 ;
public: public:
ppOcrEngine(const std::string &det_model_path, ppOcrEngine(const std::string &det_model_path,
const std::string &rec_model_path, const std::string &rec_model_path,
const std::string &character_dict_path, const std::string &character_dict_path,
const std::string front,
const float segm_thres=0.3, const float segm_thres=0.3,
const float box_thresh=0.7, const float box_thresh=0.7,
bool offload_copy =true, bool offload_copy =true,
std::string precision_mode = "fp32") ; std::string precision_mode = "fp16") ;
~ppOcrEngine(); ~ppOcrEngine();
std::vector<std::string> forward(cv::Mat &srcimg); std::vector<std::string> forward(cv::Mat &srcimg);
cv::Mat visualize_text(std::vector<std::string> texts,std::vector<cv::Point> points, cv::Mat &img);
void visualize_boxes(cv::Mat &srcimg,
const std::vector<std::vector<std::vector<int>>> &boxes) ;
}; };
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment