Commit 417a4ca0 authored by liuhy's avatar liuhy
Browse files

1、新增warm up功能 2、新增图片叠加OCR字符功能

parent 369751c2
...@@ -10,6 +10,7 @@ set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} -std=c++17) ...@@ -10,6 +10,7 @@ set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} -std=c++17)
set(CMAKE_BUILD_TYPE release) set(CMAKE_BUILD_TYPE release)
set(INCLUDE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/Src/ set(INCLUDE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/Src/
/usr/include/freetype2
$ENV{DTKROOT}/include/ $ENV{DTKROOT}/include/
${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility ${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility
${CMAKE_CURRENT_SOURCE_DIR}/3rdParty/opencv/include) ${CMAKE_CURRENT_SOURCE_DIR}/3rdParty/opencv/include)
...@@ -17,6 +18,7 @@ include_directories(${INCLUDE_PATH}) ...@@ -17,6 +18,7 @@ include_directories(${INCLUDE_PATH})
# 添加依赖库路径 # 添加依赖库路径
set(LIBRARY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/3rdParty/opencv/lib set(LIBRARY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/3rdParty/opencv/lib
/usr/lib/x86_64-linux-gnu
$ENV{DTKROOT}/lib/) $ENV{DTKROOT}/lib/)
link_directories(${LIBRARY_PATH}) link_directories(${LIBRARY_PATH})
...@@ -24,6 +26,7 @@ link_directories(${LIBRARY_PATH}) ...@@ -24,6 +26,7 @@ link_directories(${LIBRARY_PATH})
set(LIBRARY opencv_core set(LIBRARY opencv_core
opencv_imgproc opencv_imgproc
opencv_imgcodecs opencv_imgcodecs
freetype
opencv_dnn opencv_dnn
migraphx migraphx
migraphx_gpu migraphx_gpu
...@@ -36,6 +39,7 @@ link_libraries(${LIBRARY}) ...@@ -36,6 +39,7 @@ link_libraries(${LIBRARY})
set(SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/Src/main.cpp set(SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/Src/main.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility/Filesystem.cpp ${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility/Filesystem.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility/clipper.cpp ${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility/clipper.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility/cv_put_Text.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/ocr_engine.cpp) ${CMAKE_CURRENT_SOURCE_DIR}/Src/ocr_engine.cpp)
# 添加可执行目标 # 添加可执行目标
......
Doc/Images/CRNN.png

112 KB | W: | H:

Doc/Images/CRNN.png

96.4 KB | W: | H:

Doc/Images/CRNN.png
Doc/Images/CRNN.png
Doc/Images/CRNN.png
Doc/Images/CRNN.png
  • 2-up
  • Swipe
  • Onion skin
Doc/Images/DBNet.png

597 KB | W: | H:

Doc/Images/DBNet.png

311 KB | W: | H:

Doc/Images/DBNet.png
Doc/Images/DBNet.png
Doc/Images/DBNet.png
Doc/Images/DBNet.png
  • 2-up
  • Swipe
  • Onion skin
...@@ -4,10 +4,14 @@ PP-OCRv5 是PP-OCR新一代文字识别解决方案,该方案聚焦于多场 ...@@ -4,10 +4,14 @@ PP-OCRv5 是PP-OCR新一代文字识别解决方案,该方案聚焦于多场
## 模型简介 ## 模型简介
### 文本检测 ### 文本检测
文本检测使用了dbnet(论文地址:https://arxiv.org/pdf/1911.08947),网络结构:![alt text](Images/DBNet.png),模型输出概率图,并用Vatti Clipping算法对字符区域多边形简化处理,sample中借助Clipping 库。 sample中使用动态shape(N,3,H,C),最大输入shape是[1,3,640,640],模型地址:Resource/Models/ppocrv5_server_det_infer.onnx 文本检测使用了dbnet( 论文地址:https://arxiv.org/pdf/1911.08947 ),网络结构:
![alt text](Images/DBNet.png)
模型输出概率图,并用Vatti Clipping算法对字符区域多边形简化处理,sample中借助Clipping 库。 sample模型输入shape为[1,3,640,640],模型路径:Resource/Models/ppocrv5_server_det_infer.onnx
### 文本识别 ### 文本识别
文本识别使用了CRNN+CTCDecode(https://arxiv.org/pdf/2009.09941),网络结构:![alt text](Images/CRNN.png),sample中使用了动态shape (N,3,48,W),最大输入shape是[1,3,48,720],模型地址:Resource/Models/ppocrv5_server_rec_infer.onnx 文本识别使用了CRNN+CTCDecode( https://arxiv.org/pdf/2009.09941 ),网络结构:
![(Images/CRNN.png)](Images/CRNN.png)
sample中模型输入shape为[1,3,48,720],模型路径:Resource/Models/ppocrv5_server_rec_infer.onnx
## 预处理 ## 预处理
### 检测模型预处理 ### 检测模型预处理
...@@ -110,7 +114,7 @@ class ppOcrEngine { ...@@ -110,7 +114,7 @@ class ppOcrEngine {
const float segm_thres=0.3, const float segm_thres=0.3,
const float box_thresh=0.7, const float box_thresh=0.7,
bool offload_copy =true, bool offload_copy =true,
std::string precision_mode = "fp32") ; std::string precision_mode = "fp16") ;
/** /**
* @brief OCR engine初始化 * @brief OCR engine初始化
* @param det_model_path 字符检测模型路径 * @param det_model_path 字符检测模型路径
...@@ -119,7 +123,7 @@ class ppOcrEngine { ...@@ -119,7 +123,7 @@ class ppOcrEngine {
* @param segm_thres 像素分割阈值 * @param segm_thres 像素分割阈值
* @param box_thresh 字符区域box阈值 * @param box_thresh 字符区域box阈值
* @param offload_copy 内存拷贝存模式, 支持两种数据拷贝方式:*offload_copy=true、offload_copy=false。当offload_copy为true时,不需*要进行内存拷贝,如果为false,需要先预分配输入输出的设备内存,并在推理* *前,将预处理数据拷贝到设备内存,推理后将模型输出从设备内存中拷贝出来 * @param offload_copy 内存拷贝存模式, 支持两种数据拷贝方式:*offload_copy=true、offload_copy=false。当offload_copy为true时,不需*要进行内存拷贝,如果为false,需要先预分配输入输出的设备内存,并在推理* *前,将预处理数据拷贝到设备内存,推理后将模型输出从设备内存中拷贝出来
* @param precision_mode 精度模式,支持:fp32、fp16 * @param precision_mode 精度模式,支持:fp32、fp16,默认支持fp16
* *
* @return NONE * @return NONE
*/ */
...@@ -130,36 +134,11 @@ class ppOcrEngine { ...@@ -130,36 +134,11 @@ class ppOcrEngine {
class CTCDecode class CTCDecode
{ {
private: private:
//inference image ...
float* data;
std::unordered_map<std::string, migraphx::argument> device_data;
migraphx::program net;
int batch_size;
int net_input_width;
int net_input_height;
int net_input_channel;
bool offload_copy;
std::string precision_mode;
std::unordered_map<std::string, migraphx::argument> dev_argument;
void* input_buffer_device;
void* output_buffer_device;
void* output_buffer_host;
migraphx::shape input_shape;
migraphx::shape output_shape;
std::string input_name;
std::string output_name;
//postprocess: n_channel->model output channel,feature_size--> feature size one channel
int n_channel;
int feature_size;
std::vector<std::string> k_words;
public: public:
CTCDecode(std::string rec_model_path, CTCDecode(std::string rec_model_path,
std::string precision_mode="fp32", std::string precision_mode="fp16",
int image_width=480, int image_width=480,
int image_height=48, int image_height=48,
int channel=3, int channel=3,
...@@ -169,73 +148,21 @@ class ppOcrEngine { ...@@ -169,73 +148,21 @@ class ppOcrEngine {
~CTCDecode(); ~CTCDecode();
/** /**
* @brief 字符识别编码,可支持,最长可支持预测90个字符,18385个字符 * @brief 字符识别、编码API 字符识别编码,可支持,最长可支持预测90个字符,18385个字符
* @param img 输入图片
* @return 编码后的字符串
*/ */
std::string forward(cv::Mat& img); std::string forward(cv::Mat& img);
private: private:
/** ...
* @brief 预处理
* pixel = (src_img*scale-0.5)/0.5;
* scale = 1.0/255
* @param img 字符图片
* @param data 预处理输出
* @param img_w 模型输入宽
* @param img_h 模型输入高
* @return 成功:true,失败:false
*/
bool preproc(cv::Mat img,float* data,int img_w=480,int img_h=48);
/**
* @brief 模型预测后处理,获取每行中概率最大的字符,组成一句长度最大为90个字符的句子,模型预测输出shape=[1,90,18385]
* @param feature model output
* @return 成功:text,失败:""
*/
std::string postprocess(float* feature);
/**
* @brief 解码,将模型预测输出与字符集关联起来
* @param probs 模型预测的最大概率
* @param indexs 模型预测的最大概率的索引值
* @param mean_prob 预测句子的平均概率
* @return 成功:text,失败:""
*/
std::string decode(std::vector<float>& probs,std::vector<int>& indexs,float& mean_prob);
}; };
class OcrDet class OcrDet
{ {
private: private:
std::string precision_mode; ...
bool offload_copy;
migraphx::program net;
migraphx::shape input_shape;
migraphx::shape output_shape;
std::string input_name;
std::string output_name;
int det_batch_size;
int data_size ;
float segm_thres;
float box_thres;
int net_input_width;
int net_input_height;
int net_input_channel;
float* data;
//Allocate device buffer and host buffer,if offload_copy is false
std::unordered_map<std::string, migraphx::argument> dev_argument;
void* input_buffer_device;
void* output_buffer_device;
void* output_buffer_host;
//postprocess
int n_channel;
int feature_size; //single channel feature map size.
int output_width;
int output_height;
int max_candidates;//maximun number of candidates contours.
public: public:
OcrDet(std::string det_model_path, OcrDet(std::string det_model_path,
...@@ -244,113 +171,19 @@ class ppOcrEngine { ...@@ -244,113 +171,19 @@ class ppOcrEngine {
float segm_thres = 0.3, float segm_thres = 0.3,
float box_thresh = 0.7); float box_thresh = 0.7);
~OcrDet(); ~OcrDet();
bool forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes);
private:
/**
* @brief 预处理
* pixel = (scale*src_img*mean/std);
* scale = 1.0/255
* mean = [0.485, 0.456, 0.406]
* std = [0.229, 0.224, 0.225]
* @param img 字符图片
* @param data 预处理输出
* @return 成功:w,h维度的缩放比例
*/
cv::Size preproc(cv::Mat img,float* data);
/**
* @brief 后处理,通过模型预测的二值图获取文本区域
* @param feature 模型预测tensor(这里字符检测使用了dbnet)
* @param boxes 字符区域坐标
* @return 成功:0,失败:-1
*/
int postprocess(float* feature, std::vector<std::vector<std::vector<int>>> &boxes);
int boxes_from_bitmap(cv::Mat& bit_map,std::vector<T_BOX>& box);
std::vector<std::vector<std::vector<int>>>boxes_from_bitmap(
const cv::Mat pred, const cv::Mat bitmap, const float &box_thresh,
const float &det_db_unclip_ratio, const bool &use_polygon_score);
std::vector<std::vector<float>> Mat2Vector(cv::Mat mat);
/**
* @brief 统计多边形区域的平均得分
* @param contour 字符区域的轮廓点集合
* @param pred 模型预测二值图
* @return score
*/
float polygon_score_acc(std::vector<cv::Point> contour,cv::Mat pred);
/**
* @brief 对模型预测的区域进行向内或向外扩散,扩散比例是unclip_ratio ,目的是找到更加合适的字符区域
* @param box 字符区域坐标
* @param pred 模型预测二值图
* @return 处理后的字符区域
*/
cv::RotatedRect unClip(std::vector<std::vector<float>> box,
const float &unclip_ratio);
/** /**
* @brief 计算偏移距离 * @brief 字符检测模型推理API
* distance = area * unclip_ratio / dist; * @param img 原始图片
* area = ∑(x_i*y_{i+1} - x_{i+1}*y_i) * @param text_roi_boxes 字符区域坐标,格式:[[[tl.x, tl.y], [tr.x, tr.y],[], [br.x, br.y], [bl.x, bl.y]]]]
* dist = sqrtf(dx * dx + dy * dy) * | | | |
* * 左上坐标 右上坐标 右下坐标 左下坐标
* @param box 字符区域坐标 * @return 成功返回true,失败返回false
* @param unclip_ratio 缩放比例
* @param distance 偏移距离
* @return NONE
*/ */
void get_contour_area(const std::vector<std::vector<float>> &box, bool forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes);
float unclip_ratio, float &distance) ;
/**
* @brief 无效字符区域过滤。首先将boxes映射回原始图像,然后过滤无效区域
* @param boxes 字符区域坐标
* @param ratio_h 垂直方向缩放比例
* @param ratio_w 水平方向缩放比例
* @param srcimg 原始图像
*
* @return 字符区域有效坐标
*/
std::vector<std::vector<std::vector<int>>> filter_det_res(std::vector<std::vector<std::vector<int>>> boxes,
float ratio_h, float ratio_w, cv::Mat srcimg);
/**
* @brief 对字符区域按照从上到下,从左到右的顺序排序
* @param pts 字符区域坐标
*
* @return 字符区域有效坐标
*/
std::vector<std::vector<int>> order_points_clockwise(std::vector<std::vector<int>> pts);
/**
* @brief 获取最小矩形坐标
* @param box 字符区域最小外接矩形的坐标
* @param ssid box的最大边
* @return 字符区域有效坐标
*/
std::vector<std::vector<float>> get_mini_boxes(cv::RotatedRect box,float &ssid) ;
/**
* @brief 计算bitmap上的t_rect区域的平均分数
* @param box_array 模型预测的字符区域
* @param pred 模型预测二值图
* @return score
*/
float box_score_fast(std::vector<std::vector<float>> box_array,cv::Mat pred) ;
void visualize_boxes(const cv::Mat &srcimg,
const std::vector<std::vector<std::vector<int>>> &boxes) ;
bool text_recognition(const cv::Mat &srcimg, private:
const std::vector<std::vector<std::vector<int>>> &boxes); ...
}; };
...@@ -358,119 +191,84 @@ class ppOcrEngine { ...@@ -358,119 +191,84 @@ class ppOcrEngine {
## 推理 ## 推理
### 字符检测模型推理 - 字符检测
- 字符识别、解码
- 字符框可视化
- OCR结果可视化
```c++ ```c++
bool OcrDet::forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes) std::vector<std::string> ppOcrEngine::forward(cv::Mat &srcimg){
std::vector<std::vector<std::vector<int>>> text_roi_boxes;
std::vector<std::string> text_vec;
auto start = std::chrono::high_resolution_clock::now();
//字符区域检测
text_detector->forward(srcimg,text_roi_boxes);
if(text_roi_boxes.size() == 0)
{ {
std::vector<std::vector<std::vector<int>>> boxes; std::cout<<"Not found text roi !\n";
//输入数据预处理 return std::vector<std::string>();
cv::Size ratio = preproc(img,data);
/*
支持两种数据拷贝方式:offload_copy=true、offload_copy=false。当offload_copy为true时,不需要进行内存拷贝,如果为false,需要先预分配输入输出的设备内存,并在推理前,将预处理数据拷贝到设备内存,推理后将模型输出从设备内存中拷贝出来,在做后处理。
*/
if( this->offload_copy ==false )
{
hipMemcpy(input_buffer_device,
(void*)data,
this->input_shape.bytes(),
hipMemcpyHostToDevice);
std::vector<migraphx::argument> results = net.eval(dev_argument);
hipMemcpy(output_buffer_host,
(void*)output_buffer_device,
output_shape.bytes(),
hipMemcpyDeviceToHost);
postprocess((float *)output_buffer_host,boxes);
std::cout<<"copy mode ..."<<std::endl;
}else{
std::unordered_map<std::string, migraphx::argument> inputData;
inputData[input_name] = migraphx::argument{input_shape, (float *)data};
std::vector<migraphx::argument> results = net.eval(inputData);
migraphx::argument result = results[0] ; //get output data
postprocess((float *)result.data(),boxes);
std::cout<<"offload copy mode ..."<<std::endl;
}
//计算等比缩放比例
float ratio_w = float(net_input_width) / float(img.cols);
float ratio_h = float(net_input_height) / float(img.rows);
//过滤无效框
text_roi_boxes = filter_det_res(boxes, ratio_h, ratio_w, img);
//可视化检测结果
visualize_boxes(img,text_roi_boxes);
// TextRecognition(img,boxes);
return true;
} }
std::vector<cv::Point> points;
``` //字符识别+编码
### 字符识别推理 for (int n = 0; n < text_roi_boxes.size(); n++) {
```c++
std::string CTCDecode::forward(cv::Mat& img) cv::Rect rect;
cv::Mat text_roi_mat;
rect.x = text_roi_boxes[n][0][0];
rect.y = text_roi_boxes[n][0][1];
rect.width = text_roi_boxes[n][2][0] - text_roi_boxes[n][0][0];
rect.height = text_roi_boxes[n][2][1] - text_roi_boxes[n][0][1];
if(rect.width <3 || rect.height<3)
{ {
//预处理 continue;
preproc(img,data,net_input_width,net_input_height);
/*
支持两种数据拷贝方式:offload_copy=true、offload_copy=false。当offload_copy为true时,不需要进行内存拷贝,如果为false,需要先预分配输入输出的设备内存,并在推理前,将预处理数据拷贝到设备内存,推理后将模型输出从设备内存中拷贝出来,在做后处理。
*/
if( this->offload_copy ==false )
{
hipMemcpy(input_buffer_device,
(void*)data,
this->input_shape.bytes(),
hipMemcpyHostToDevice);
std::vector<migraphx::argument> results = net.eval(dev_argument);
hipMemcpy(output_buffer_host,
(void*)output_buffer_device,
output_shape.bytes(),
hipMemcpyDeviceToHost);
//模型后处理,获取字符的最大概率和索引,并根据索引在字符库中查找对应的字符,然后合成一个句子
std::string text = postprocess((float *)output_buffer_device);
return text;
}else{
std::unordered_map<std::string, migraphx::argument> inputData;
inputData[input_name] = migraphx::argument{input_shape, (float *)data};
std::vector<migraphx::argument> results = net.eval(inputData);
migraphx::argument result = results[0] ;
std::string text = postprocess((float *)result.data());
// std::cout<<"ctc: offload copy mode ..."<<std::endl;
return text;
} }
text_roi_mat = srcimg(rect).clone();
std::string text = text_recognizer->forward(text_roi_mat);
text_vec.push_back(text);
points.push_back(cv::Point(rect.x,rect.y));
} }
auto end = std::chrono::high_resolution_clock::now();
auto duration_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
std::cout<<"[Time info] elapsed: "<< duration_ms.count() <<" ms\n";
//字符框可视化
visualize_boxes(srcimg,text_roi_boxes);
//OCR可视化
cv::Mat res_img = visualize_text(text_vec,points, srcimg);
...
}
``` ```
# Ocrv5 API调用说明 # Ocrv5 API调用说明
API调用步骤如下: API调用步骤如下:
- 类实例化 - 类实例化
- 读取测试图片
- 识别接口调用 - 识别接口调用
例: 例:
```c++ ```c++
int main(int argc, char** argv) int main(int argc, char** argv){
{
std::string det_model_onnx = "../Resource/Models/ppocrv5_server_det_infer.onnx"; std::string det_model_onnx = "../Resource/Models/ppocrv5_server_det_infer.onnx";
std::string rec_model_onnx = "../Resource/Models/ppocrv5_server_rec_infer.onnx"; std::string rec_model_onnx = "../Resource/Models/ppocrv5_server_rec_infer.onnx";
std::string img_path = "../Resource/Images/20250703205038.png"; std::string img_path = "../Resource/Images/demo.png";
std::string character_dict_path = "../Resource/ppocr_keys_v5.txt"; std::string character_dict_path = "../Resource/ppocr_keys_v5.txt";
std::string front = "../Resource/fonts/SimHei.ttf";
float segm_thres=0.3; float segm_thres=0.3;
float box_thresh=0.3; float box_thresh=0.3;
ppOcrEngine ocr_engine(det_model_onnx, ppOcrEngine ocr_engine(det_model_onnx,
rec_model_onnx, rec_model_onnx,
character_dict_path, character_dict_path,
front,
segm_thres, segm_thres,
box_thresh, box_thresh,
true, true,
"fp32"); "fp16");
cv::Mat img=cv::imread(img_path); cv::Mat img=cv::imread(img_path);
ocr_engine.forward(img); ocr_engine.forward(img);
return 0; return 0;
} }
``` ```
sample支持两种精度推理(fp32和fp16,默认是fp32),精度和内存拷贝方式分别通过ocr_engine的构造函数传入参数来设置。 sample支持两种精度推理(fp32和fp16,默认是fp16),精度和内存拷贝方式分别通过ocr_engine的构造函数传入参数来设置。
\ No newline at end of file \ No newline at end of file
...@@ -4,10 +4,14 @@ PP-OCRv5 是PP-OCR新一代文字识别解决方案,该方案聚焦于多场 ...@@ -4,10 +4,14 @@ PP-OCRv5 是PP-OCR新一代文字识别解决方案,该方案聚焦于多场
## 模型简介 ## 模型简介
### 文本检测 ### 文本检测
文本检测使用了dbnet(论文地址:https://arxiv.org/pdf/1911.08947),网络结构:![alt text](Images/DBNet.png),模型输出概率图,并用Vatti Clipping算法对字符区域多边形简化处理。 sample中使用动态shape(N,3,H,C),最大输入shape是[1,3,640,640],模型地址:Resource/Models/ppocrv5_server_det_infer.onnx 文本检测使用了dbnet( 论文地址:https://arxiv.org/pdf/1911.08947 ),网络结构:
![alt text](Images/DBNet.png)
模型输出概率图,并用Vatti Clipping算法对字符区域多边形简化处理,sample中借助Clipping 库。 sample中模型输入shape为[1,3,640,640],模型路径:Resource/Models/ppocrv5_server_det_infer.onnx
### 文本识别 ### 文本识别
文本识别使用了CRNN+CTCDecode(https://arxiv.org/pdf/2009.09941),网络结构:![alt text](Images/CRNN.png),sample中使用了动态shape (N,3,48,W),最大输入shape是[1,3,48,720],模型地址:Resource/Models/ppocrv5_server_rec_infer.onnx 文本识别使用了CRNN+CTCDecode( https://arxiv.org/pdf/2009.09941 ),网络结构:
![(Images/CRNN.png)](Images/CRNN.png)
sample中模型输入shape为[1,3,48,720],模型路径:Resource/Models/ppocrv5_server_rec_infer.onnx
## 预处理 ## 预处理
### 检测模型预处理 ### 检测模型预处理
...@@ -80,28 +84,20 @@ def preprocess(self, img, max_wh_ratio): ...@@ -80,28 +84,20 @@ def preprocess(self, img, max_wh_ratio):
imgH, imgW = self.rec_input_size imgH, imgW = self.rec_input_size
max_h,max_w = self.rec_input_size max_h,max_w = self.rec_input_size
h, w = img.shape[:2] h, w = img.shape[:2]
# re_size = (max_w,max_h)
#保留H的原始维度 #保留H的原始维度
if h <= max_h: if h <= max_h:
ratio = max_h / h ratio = max_h / h
w = int(w*ratio) w = int(w*ratio)
if w <= max_w: if w <= max_w:
re_size =(w,max_h) re_size =(w,max_h)
else: else:
re_size = (max_w,max_h) re_size = (max_w,max_h)
else: else:
ratio = max_h/h ratio = max_h/h
w,h = int(w*ratio),max_h w,h = int(w*ratio),max_h
if w <= max_w: if w <= max_w:
re_size = (w,h) re_size = (w,h)
else: else:
re_size = (max_w,h) re_size = (max_w,h)
...@@ -112,12 +108,9 @@ def preprocess(self, img, max_wh_ratio): ...@@ -112,12 +108,9 @@ def preprocess(self, img, max_wh_ratio):
resized_image = resized_image.transpose((2, 0, 1)) / 255 resized_image = resized_image.transpose((2, 0, 1)) / 255
resized_image -= 0.5 resized_image -= 0.5
resized_image /= 0.5 resized_image /= 0.5
#填充,沿着右、下填充 #填充,沿着右、下填充
padding_im = np.zeros((3, imgH, imgW), dtype=np.float32) padding_im = np.zeros((3, imgH, imgW), dtype=np.float32)
padding_im[:, :, 0:re_size[0]] = resized_image padding_im[:, :, 0:re_size[0]] = resized_image
return padding_im return padding_im
``` ```
## 类介绍 ## 类介绍
...@@ -154,7 +147,7 @@ class PPOcrV5(): ...@@ -154,7 +147,7 @@ class PPOcrV5():
**kwargs :设置字符检测模型后处理相关参数 **kwargs :设置字符检测模型后处理相关参数
Returns: Returns:
return_type: NONE。 return_type: 无返回值
Examples: Examples:
det_onnx_path = "PATH/TO/det_onnx_model.onnx" det_onnx_path = "PATH/TO/det_onnx_model.onnx"
...@@ -198,7 +191,7 @@ class TextDetector(object): ...@@ -198,7 +191,7 @@ class TextDetector(object):
**kwargs :设置字符检测模型后处理相关参数 **kwargs :设置字符检测模型后处理相关参数
Returns: Returns:
return_type: NONE return_type: 无返回值
Examples: Examples:
self.db_detector = TextDetector( self.db_detector = TextDetector(
...@@ -216,7 +209,6 @@ class TextDetector(object): ...@@ -216,7 +209,6 @@ class TextDetector(object):
""" """
class TextRecgnizer(object): class TextRecgnizer(object):
"""Support SVTR_LCNet """
def __init__( def __init__(
self, self,
rec_model_path, rec_model_path,
...@@ -240,7 +232,7 @@ class TextRecgnizer(object): ...@@ -240,7 +232,7 @@ class TextRecgnizer(object):
**kwargs :设置字符识别模型后处理相关参数 **kwargs :设置字符识别模型后处理相关参数
Returns: Returns:
return_type: NONE return_type: 无返回值
Examples: Examples:
self.text_extractor = TextRecgnizer(rec_model_path=rec_model_path, self.text_extractor = TextRecgnizer(rec_model_path=rec_model_path,
...@@ -252,18 +244,15 @@ class TextRecgnizer(object): ...@@ -252,18 +244,15 @@ class TextRecgnizer(object):
class BaseRecLabelDecode(object): class BaseRecLabelDecode(object):
def __init__(self, character_dict_path=None, def __init__(self, character_dict_path=None,
use_space_char=False) use_space_char=False)
"""Convert between text-label and text-index """
字符识别(crnn+ctc)。 字符识别(crnn+ctc)。
Args: Args:
character_dict_path :字符集文件路径。 character_dict_path :字符集文件路径。
use_space_char :字符集中是否包含空格。 use_space_char :字符集中是否包含空格。
Returns: Returns:
return_type: NONE。 return_type: 无返回值。
Examples: Examples:
""" """
class CTCLabelDecode(BaseRecLabelDecode): class CTCLabelDecode(BaseRecLabelDecode):
...@@ -277,139 +266,27 @@ class TextRecgnizer(object): ...@@ -277,139 +266,27 @@ class TextRecgnizer(object):
character_dict_path :字符集文件路径。 character_dict_path :字符集文件路径。
use_space_char :字符集中是否包含空格。 use_space_char :字符集中是否包含空格。
Returns: Returns:
return_type: NONE return_type: 无返回值
Examples: Examples:
""" """
``` ```
## 推理 ## 推理
### 字符检测模型推理
```python
def __call__(self, src_img):
data = self.preprocess(src_img)
"""支持两种数据拷贝方式:offload_copy=true、offload_copy=false。当offload_copy为true时,不需要进行内存拷贝,如果为false,需要先预分配输入输出的设备内存,并在推理前,将预处理数据拷贝到设备内存,推理后将模型输出从设备内存中拷贝出来,在做后处理。"""
if self.offload_copy==False:
self.d_mem[self.det_input_name] =migraphx.to_gpu(migraphx.argument(data["image"]))
results = self.db_model.run(self.d_mem)
else:
results = self.db_model.run({self.det_input_name:data["image"]})
if self.offload_copy==False :
#从gpu拷贝推理结果到cpu
result=migraphx.from_gpu(results[0])
print("offload copy model")
result = np.array(result)
else:
result = results[0]
shape_list = np.expand_dims(data["shape"], axis=0)
pred = np.array(result)
pred = pred[:, 0, :, :]
#获取大于阈值的概率
segmentation = pred > self.thresh
boxes_batch = []
for batch_index in range(pred.shape[0]):
src_h, src_w, ratio_h, ratio_w = shape_list[batch_index]
if self.dilation_kernel is not None:
mask = cv2.dilate(
np.array(segmentation[batch_index]).astype(np.uint8),
self.dilation_kernel,
)
else:
mask = segmentation[batch_index]
#根据预测的bitmap获取文本区域
if self.box_type == "poly":
boxes, scores = self.polygons_from_bitmap(
pred[batch_index], mask, ratio_w,ratio_h, src_w, src_h
)
elif self.box_type == "quad":
boxes, scores = self.boxes_from_bitmap(
pred[batch_index], mask, ratio_w,ratio_h, src_w, src_h
)
else:
raise ValueError("box_type can only be one of ['quad', 'poly']")
boxes_batch.append(boxes)
#文本区域按照从上到下,从左到右的顺序排序
det_box_batch = self.sorted_boxes(boxes_batch)
#文本区域按坐标映射到原始图像
dt_boxes,det_rects = self.box_standardization(det_box_batch,shape_list)
return dt_boxes,det_rects
```
### 字符识别推理
```python ```python
def __call__(self, batch_img_list): def __call__(self, src_img):
if len(batch_img_list) == 0: import time
return [] start = time.time()
width_list = [] #字符检测
#遍历图片列表(字符roi存放在图片列表中),为了支持多batch推理,这里还会将batch_size张图片进行拼接np.concatenate(batch_norm_imgs) dt_boxs,dt_rects = self.db_detector(src_img)
for b in range(len(batch_img_list)): res_img = self.vis_boxes(dt_boxs,src_img)
for img in batch_img_list[b]: #字符区域图片裁剪
width_list.append(img.shape[1] / float(img.shape[0])) batch_img_list = self.detection_roi_crop(src_img,dt_rects)
#字符特征提取
indices = np.argsort(np.array(width_list)) batch_outputs_pre ,batch_max_wh_ratio_pre = self.text_extractor(batch_img_list)
#字符编码
input_batch = self.rec_batch_num batch_text_list, batch_label_list = self.ctc_decoder(batch_outputs_pre,return_word_box=False,wh_ratio_list = batch_max_wh_ratio_pre)
batch_outputs_pre = []
batch_max_wh_ratio_pre = []
for b in range(len(batch_img_list)):
im_count = len(batch_img_list[b])
batch_outputs = []
batch_max_wh_ratio = []
for beg_img_no in range(0, im_count, input_batch):
end_img_no = min(im_count, beg_img_no + input_batch)
# for ino in range(beg_img_no, end_img_no):
# h, w = batch_img_list[b][indices[ino]].shape[0:2]
# wh_ratio = w * 1.0 / h
# max_wh_ratio = max(max_wh_ratio, wh_ratio)
batch_norm_imgs = []
max_wh_ratio = list()
# N batch
for ino in range(beg_img_no, end_img_no):
#单张图片预处理
norm_img = self.preprocess(batch_img_list[b][indices[ino]], max_wh_ratio)
norm_img = norm_img[np.newaxis, :].astype(np.float32)
batch_norm_imgs.append(norm_img)
batch_max_wh_ratio.append(max_wh_ratio)
#batch_size张图片进行拼接
if self.rec_batch_num >1:
norm_img_batch = np.concatenate(batch_norm_imgs)
norm_img_batch = norm_img_batch.copy()
else:
norm_img_batch = np.array([batch_norm_imgs.copy()])
if self.offload_copy==False:
print("offload copy model")
self.d_mem[self.rec_input_name] =migraphx.to_gpu(migraphx.argument(norm_img_batch))
results = self.rec_model.run(self.d_mem)
output = np.array(results[0])
else:
results = self.rec_model.run({self.rec_input_name:norm_img_batch})
output = results[0]
# batch_outputs.append(np.array(output))
#将所有batch的输出结果append到batch_outputs中方便后处理
[batch_outputs.append(out) for out in np.array(output)]
batch_outputs_pre.append(np.array(batch_outputs))
batch_max_wh_ratio_pre.append(batch_max_wh_ratio)
return batch_outputs_pre ,batch_max_wh_ratio_pre
``` ```
# Ocrv5 API调用说明 # Ocrv5 API调用说明
...@@ -425,8 +302,8 @@ if __name__ == '__main__': ...@@ -425,8 +302,8 @@ if __name__ == '__main__':
rec_onnx_path = "../Resource/Models/ppocrv5_server_rec_infer.onnx" rec_onnx_path = "../Resource/Models/ppocrv5_server_rec_infer.onnx"
image_path = "../Resource/Images/lite_demo.png" image_path = "../Resource/Images/lite_demo.png"
img = cv2.imread(image_path) img = cv2.imread(image_path)
ppocrv5 = PPOcrV5(det_onnx_path,rec_onnx_path,offload_copy=True,precision_mode="fp32") ppocrv5 = PPOcrV5(det_onnx_path,rec_onnx_path,offload_copy=True,precision_mode="fp16")
res_img = ppocrv5(img) res_img = ppocrv5(img)
cv2.imwrite("res.jpg",res_img) cv2.imwrite("res.jpg",res_img)
``` ```
sample支持两种精度推理(fp32和fp16,默认是fp32),精度和内存拷贝方式分别通过precision_mode和offload_copy参数控制。 sample支持两种精度推理(fp32和fp16,默认是fp16),精度和内存拷贝方式分别通过precision_mode和offload_copy参数控制。
\ No newline at end of file \ No newline at end of file
This diff is collapsed.
...@@ -81,6 +81,7 @@ pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple ...@@ -81,6 +81,7 @@ pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
``` ```
python inference.py python inference.py
``` ```
结果图片保存在当前目录下:res.jpg
offload_copy和precision_mode设置可参考[Tutorial_Python.md](Doc/Tutorial_Python.md),在main中示例。 offload_copy和precision_mode设置可参考[Tutorial_Python.md](Doc/Tutorial_Python.md),在main中示例。
### C++版本推理 ### C++版本推理
...@@ -104,7 +105,6 @@ cd <path_to_ppocrv5_migraphx> ...@@ -104,7 +105,6 @@ cd <path_to_ppocrv5_migraphx>
sh ./3rdParty/InstallOpenCVDependences.sh sh ./3rdParty/InstallOpenCVDependences.sh
``` ```
#### 安装OpenCV并构建工程 #### 安装OpenCV并构建工程
``` ```
...@@ -119,27 +119,6 @@ rbuild build -d depend ...@@ -119,27 +119,6 @@ rbuild build -d depend
cmake -D CMAKE_BUILD_TYPE=RELEASE -D CMAKE_INSTALL_PREFIX=./opencv_dep -D INSTALL_C_EXAMPLES=ON -D INSTALL_PYTHON_EXAMPLES=ON -D OPENCV_GENERATE_PKGCONFIG=ON -D BUILD_EXAMPLES=ON -D OPENCV_EXTRA_MODULES_PATH=../modules/ .. cmake -D CMAKE_BUILD_TYPE=RELEASE -D CMAKE_INSTALL_PREFIX=./opencv_dep -D INSTALL_C_EXAMPLES=ON -D INSTALL_PYTHON_EXAMPLES=ON -D OPENCV_GENERATE_PKGCONFIG=ON -D BUILD_EXAMPLES=ON -D OPENCV_EXTRA_MODULES_PATH=../modules/ ..
``` ```
- 执行make -j8 && make install,编译的头文件和库目录存放在opencv_dep,将opencv_dep目录拷贝到3rdParty,并命名为opencv - 执行make -j8 && make install,编译的头文件和库目录存放在opencv_dep,将opencv_dep目录拷贝到3rdParty,并命名为opencv
#### 设置环境变量
将依赖库依赖加入环境变量LD_LIBRARY_PATH,在~/.bashrc中添加如下语句:
当操作系统是ubuntu系统时:
```
export LD_LIBRARY_PATH=<path_to_ppocrv5_migraphx>/depend/lib/:$LD_LIBRARY_PATH
```
当操作系统是centos系统时:
```
export LD_LIBRARY_PATH=<path_to_ppocrv5_migraphx>/depend/lib64/:$LD_LIBRARY_PATH
```
然后执行:
```
source ~/.bashrc
```
#### 运行示例 #### 运行示例
...@@ -155,58 +134,112 @@ cmake .. && make ...@@ -155,58 +134,112 @@ cmake .. && make
#运行 #运行
./ppOcrV5cd ./ppOcrV5cd
``` ```
结果图片保存在当前目录下:res.jpg
## result ## result
### Python版本 ### Python版本
输出结果中,每个值分别对应每个label的实际概率 输出结果中展示了识别到的字符,每个字符后面跟着一个置信度,置信度值越大,识别结果越准确
``` ```
产品信息/参数, 0.954 '0', 0.991
发足够的滋养, 1.000 纯臻营养护发素, 1.000
纯臻宫乔护发素, 0.883 '0'.'9''9''3''6''0''4', 0.999
花费了'0'.'4''5''7''3''3''5'秒, 0.993 '1', 0.998
【净含量】:'2''2''0'ml, 0.993 产品信息/参数, 0.934
'0'.'9''9''2''7''2''8', 0.999
'2', 0.999
('4''5'元/每公斤,'1''0''0'公斤起订), 0.970
'0'.'9''7''4''1''7', 0.999
'3', 0.999
每瓶'2''2'元,'1''0''0''0'瓶起订), 0.998 每瓶'2''2'元,'1''0''0''0'瓶起订), 0.998
【品名】:纯臻营养护发素, 0.998 '0'.'9''9''3''9''7''6', 0.999
【品牌】:代加工方式/'0'EMODM, 0.968 '4', 0.998
糖、椰油酰胺丙基甜菜碱、泛醒, 0.997 【品牌】:代加工方式/'0'EMODM, 0.959
【适用人群】:适合所有肤质, 0.998 '0'.'9''8''5''1''3''3', 0.998
【产品编号】:YM'-'X'-''3''0''1''1''0'.'9''6''8''9''9', 0.993 '5', 0.998
('4''5'元/每公斤,'1''0''0'公斤起订), 0.972 【品名】:纯臻营养护发素, 0.997
【主要成分】:鲸蜡硬脂醇、燕麦B'-'葡聚, 0.966 '0'.'9''9''5''0''0''7', 0.999
【主要功能】:可紧致头发磷层,从而达到, 0.994 '6', 0.995
即时持久改善头发光泽的效果,给干燥的头, 0.997 【产品编号】:YM'-'X'-''3''0''1''1''0'.'9''6''8''9''9', 0.973
The detectionvisualizedimagsavedin./vis.jpg, 0.940 '7', 0.999
[Time info] elapsed:3.5736 【净含量】:'2''2''0'ml, 0.994
'0'.'9''9''6''5''7''7', 0.999
'8', 0.998
【适用人群】:适合所有肤质, 0.997
'0'.'9''9''5''8''4''2', 0.999
'9', 0.997
【主要成分】:鲸蜡硬脂醇、燕麦B'-'葡聚, 0.976
'0'.'9''6''1''9''2''8', 0.999
'1''0', 1.000
糖、椰油酰胺丙基甜菜碱、泛醒, 0.996
'0'.'9''2''5''8''9''8', 0.999
'1''1', 0.999
(成品包材), 0.998
'0'.'9''7''2''5''7''3', 0.999
'1''2', 1.000
【主要功能】:可紧致头发磷层,从而达到, 0.992
'0'.'9''9''4''4''4''8', 0.999
'1''3', 0.999
即时持久改善头发光泽的效果,给干燥的头, 0.989
'0'.'9''9''0''1''9''8', 0.999
'1''4', 0.999
发足够的滋养, 0.999
'0'.'9''9''7''6''6''8', 0.999
花费了'0'.'4''5''7''3''3''5'秒, 0.993
[Time info] elapsed:578.6152 ms
``` ```
### C++版本 ### C++版本
``` ```
ocr res :[生成一幅画,负向提示词为:画中不要出现人物。正负提示词结合会] ocr res :花费了'0'.'4''5''7''3''3''5'秒 0.984009
ocr res :[Text_encode_'2'.副文本编码器,补充描述性细节(如材质、光照、] ocr res :'0'.'9''9''7' 0.773633
ocr res :[图片的准确性,过滤掉不需要的元素,例如正向提示词为:提示模型] ocr res :发足够的滋养 0.96818
ocr res :[编码器特征融合提升模型的理解能力。] ocr res :'1' 0.697754
ocr res :[正负 prompt 设置:正向 prompt 和负向 prompt 结合可以提升生成] ocr res :'0''0'.'9''9''0''1''9' 0.656647
ocr res :[语义表示捕获提示词的基础含义和全局语境(如对象、动作),与副] ocr res :即时持久改善头发光泽的效果,给干燥的头 0.996608
ocr res :[的图像不会发生变化,随机种子可以增加生成图像的多样性。] ocr res : 0
ocr res :[Text_encode.主文本编码器,将prompt序列转换为一个综合的] ocr res :【主要功能】:可紧致头发磷层,从而达到 0.993421
ocr res :[响初始噪声和生成结果的确定性,固定种子后,同一个prompt生成] ocr res :'0'.'9''9''4''4' 0.677327
ocr res :[声转化为目标图像。] ocr res : 0
ocr res :[随机数设置:随机数种子是控制生成过程随机性的关键参数,直接影] ocr res :'0'.'9''7''2' 0.637158
ocr res :[Scheduler:调度器,控制图像生成,决定了如何逐步将随机噪] ocr res :(成品包材) 0.901937
ocr res :[程和图像生成过程中有着至关重要的作用。] ocr res :'1' 0.32251
ocr res :[在stable'-'dffusion'-'xl'-'base'-''1'.'0'模型中主要包含一下子组件:] ocr res :糖、椰油酰胺丙基甜菜碱、泛醒 0.993478
ocr res :[Pipeline的配置参数控制图像生成的质量和速度,在扩散模型预测过] ocr res :'0'.'9''2''5' 0.586279
ocr res :[具配置文件中的定义手动加载各个子组件。] ocr res :'1''0' 0.547241
ocr res :[这里使用了扩散模型加载器统一加载了所有的子组件,也可以更] ocr res :【主要成分】:鲸蜡硬脂醇、燕麦B'-'葡聚 0.975303
ocr res :[·'2'.'3'pipeline 配置] ocr res :'0'.'9''1''9' 0.568408
Time taken by task: 3475 ms ocr res : 0
ocr res :'0'.'9''9''5''2' 0.613647
ocr res :【适用人群】:适合所有肤质 0.996882
ocr res :'8' 0.378906
ocr res :'0'.'9''9' 0.595581
ocr res :【净含量】:'2''2''0'ml 0.835671
ocr res :'7' 0.356689
ocr res :【产品编号】:YM'-'X'-''3''0''1''1''0'.'9''6''8''9''9' 0.993695
ocr res :'6' 0.214355
ocr res :'0'.'9''9''5' 0.478052
ocr res :【品名】:纯臻营养护发素 0.996175
ocr res :'5' 0.594727
ocr res : 0
ocr res :'0'.'9''8''5' 0.55166
ocr res :【品牌】:代加工方式/'0'EMODM 0.917768
ocr res :每瓶'2''2'元,'1''0''0''0'瓶起订) 0.974644
ocr res :'0'.'9''9''3''9''7''6' 0.736755
ocr res :'3' 0.486572
ocr res :('4''5'元/每公斤,'1''0''0'公斤起订) 0.940028
ocr res :'0'.'9'm'7' 0.534668
ocr res :'2' 0.961426
ocr res : 0
ocr res :'0'.'9''9''2' 0.524121
ocr res :产品信息/参数 0.913853
ocr res :纯臻营养护发素'0'.'9''9''3''6''0''4' 0.964128
ocr res :'0' 0.380127
ocr res :The detection visualized imagesavedin./vis.jpg 0.94302
[Time info] elapsed: 389 ms
``` ```
### 精度 ### 精度
......
#include "cv_put_Text.hpp"
/**
 * @brief Construct a text renderer backed by FreeType.
 * @param font_path Path to a TrueType font file (e.g. SimHei.ttf for CJK glyphs).
 *
 * On library-init failure the object is left with null handles so the
 * destructor is safe to run; on font-load failure the process exits.
 */
PutText::PutText(const char* font_path) {
    // Null-initialize both handles up front: if initialization fails below,
    // the destructor must not call FT_Done_* on indeterminate pointers.
    ft = nullptr;
    face = nullptr;
    // 初始化 FreeType
    if (FT_Init_FreeType(&ft)) {
        std::cerr << "Error: Could not init FreeType !" << std::endl;
        ft = nullptr;
        return;
    }
    // 加载字体文件( 这里使用 SimHei.ttf 字体文件)
    if (FT_New_Face(ft, font_path, 0, &face)) {
        std::cerr << "Error: Load font failed!" << std::endl;
        exit(-1);
    }
}
/**
 * @brief Release FreeType resources.
 *
 * Handles are guarded against null so the destructor is safe even when
 * construction failed before the face/library was created.
 */
PutText::~PutText() {
    if (face) {
        FT_Done_Face(face);
    }
    if (ft) {
        FT_Done_FreeType(ft);
    }
}
void PutText::putText(cv::Mat& img, const std::string& text, int x, int y, int fontSize, cv::Scalar color) {
    // Overlay UTF-8 text (including CJK) onto a 3-channel BGR image using
    // FreeType-rendered glyphs. (x, y) is the top-left corner of the text
    // line; fontSize is the glyph height in pixels.
    if(img.empty())
    {
        std::cerr << "Empty image!";
        return ;
    }
    // Select the glyph size in pixels (width 0 = derive from height).
    FT_Set_Pixel_Sizes(face, 0, fontSize);
    int start_point_x = x;
    int start_point_y = y + fontSize; // baseline adjustment: y is the top of the line
    const size_t len = text.size();
    // Walk the string one UTF-8 code point at a time.
    for (size_t i = 0; i < len; ) {
        // Decode the next UTF-8 sequence (1- to 4-byte forms). Length checks
        // guard against reading past the end of a truncated sequence.
        unsigned long unicode = 0;
        unsigned char lead = static_cast<unsigned char>(text[i]);
        if ((lead & 0x80) == 0) {
            unicode = lead;
            i += 1;
        } else if ((lead & 0xE0) == 0xC0 && i + 1 < len) {
            unicode = ((lead & 0x1F) << 6) | (text[i + 1] & 0x3F);
            i += 2;
        } else if ((lead & 0xF0) == 0xE0 && i + 2 < len) {
            unicode = ((lead & 0x0F) << 12) | ((text[i + 1] & 0x3F) << 6) | (text[i + 2] & 0x3F);
            i += 3;
        } else if ((lead & 0xF8) == 0xF0 && i + 3 < len) {
            // 4-byte sequence: code points beyond the BMP (e.g. emoji).
            unsigned long cp = ((unsigned long)(lead & 0x07) << 18) |
                               ((unsigned long)(text[i + 1] & 0x3F) << 12) |
                               ((unsigned long)(text[i + 2] & 0x3F) << 6) |
                               (unsigned long)(text[i + 3] & 0x3F);
            unicode = cp;
            i += 4;
        } else {
            i++; // invalid or truncated UTF-8: skip this byte
            continue;
        }
        // Render the glyph for this code point.
        if (FT_Load_Char(face, unicode, FT_LOAD_RENDER)) {
            std::cerr << "Error: Could not load glyph" << std::endl;
            continue;
        }
        // Alpha-blend the 8-bit glyph bitmap into the image.
        FT_Bitmap& bitmap = face->glyph->bitmap;
        for (unsigned int row = 0; row < bitmap.rows; ++row) {
            for (unsigned int col = 0; col < bitmap.width; ++col) {
                // Index rows by `pitch` (bytes per bitmap row), which may
                // differ from `width` depending on alignment.
                unsigned char intensity = bitmap.buffer[row * bitmap.pitch + col];
                if (intensity == 0)
                    continue;
                int px = start_point_x + face->glyph->bitmap_left + static_cast<int>(col);
                int py = start_point_y - face->glyph->bitmap_top + static_cast<int>(row);
                // Clip glyph pixels that fall outside the image to avoid
                // out-of-bounds writes when text is drawn near the borders.
                if (px < 0 || px >= img.cols || py < 0 || py >= img.rows)
                    continue;
                float alpha = intensity / 255.0f;
                cv::Vec3b& pixel = img.at<cv::Vec3b>(py, px);
                pixel[0] = cv::saturate_cast<uchar>(color[0] * alpha + pixel[0] * (1.0f - alpha));
                pixel[1] = cv::saturate_cast<uchar>(color[1] * alpha + pixel[1] * (1.0f - alpha));
                pixel[2] = cv::saturate_cast<uchar>(color[2] * alpha + pixel[2] * (1.0f - alpha));
            }
        }
        // Advance the pen by the glyph's horizontal advance (26.6 fixed point).
        start_point_x += face->glyph->advance.x >> 6;
    }
}
\ No newline at end of file
#pragma once
#include <ft2build.h>
#include FT_FREETYPE_H
#include <opencv2/opencv.hpp>
/**
 * @brief Renders UTF-8 text (including Chinese) onto cv::Mat images using a
 *        font loaded through FreeType.
 */
class PutText {
private:
    FT_Library ft{nullptr};   // FreeType library handle; nullptr until init succeeds
    FT_Face face{nullptr};    // loaded font face; nullptr until the font is loaded
public:
    /**
     * @brief Initialize FreeType and load the font at @p font_path.
     * @param font_path path to a TrueType font file (e.g. SimHei.ttf)
     */
    PutText(const char* font_path);
    ~PutText();
    /**
     * @brief Draw text onto an image (supports Chinese / UTF-8).
     * @param img image to draw on, modified in place (3-channel BGR)
     * @param text UTF-8 encoded string to overlay
     * @param x x coordinate of the text's top-left corner, in pixels
     * @param y y coordinate of the text's top-left corner, in pixels
     * @param fontSize glyph height in pixels
     * @param color text color in BGR order, default green
     *
     * @return none
     */
    void putText(cv::Mat& img, const std::string& text, int x, int y, int fontSize=2, cv::Scalar color=cv::Scalar(0, 255, 0));
};
...@@ -2,24 +2,28 @@ ...@@ -2,24 +2,28 @@
using namespace ppocr; using namespace ppocr;
int main(int argc, char** argv) int main(int argc, char** argv){
{
std::string det_model_onnx = "../Resource/Models/ppocrv5_server_det_infer.onnx"; std::string det_model_onnx = "../Resource/Models/ppocrv5_server_det_infer.onnx";
std::string rec_model_onnx = "../Resource/Models/ppocrv5_server_rec_infer.onnx"; std::string rec_model_onnx = "../Resource/Models/ppocrv5_server_rec_infer.onnx";
std::string img_path = "../Resource/Images/20250703205038.png"; std::string img_path = "../Resource/Images/demo.png";
std::string character_dict_path = "../Resource/ppocr_keys_v5.txt"; std::string character_dict_path = "../Resource/ppocr_keys_v5.txt";
std::string front = "../Resource/fonts/SimHei.ttf";
float segm_thres=0.3; float segm_thres=0.3;
float box_thresh=0.3; float box_thresh=0.3;
ppOcrEngine ocr_engine(det_model_onnx, ppOcrEngine ocr_engine(det_model_onnx,
rec_model_onnx, rec_model_onnx,
character_dict_path, character_dict_path,
front,
segm_thres, segm_thres,
box_thresh, box_thresh,
true, true,
"fp32"); "fp16");
cv::Mat img=cv::imread(img_path); cv::Mat img=cv::imread(img_path);
ocr_engine.forward(img); ocr_engine.forward(img);
return 0; return 0;
} }
\ No newline at end of file
...@@ -53,15 +53,12 @@ bool XsortFp32(std::vector<float> a, std::vector<float> b) { ...@@ -53,15 +53,12 @@ bool XsortFp32(std::vector<float> a, std::vector<float> b) {
return a[0] < b[0]; return a[0] < b[0];
return false; return false;
} }
namespace ppocr namespace ppocr{
{
OcrDet::OcrDet(const std::string det_model_path, OcrDet::OcrDet(const std::string det_model_path,
std::string precision_mode, std::string precision_mode,
bool offload_copy, bool offload_copy,
float segm_thres, float segm_thres,
float box_thresh ) float box_thresh ){
{
if(!Exists(det_model_path)) if(!Exists(det_model_path))
{ {
LOG_ERROR(stdout, "onnx file not exists!\n"); LOG_ERROR(stdout, "onnx file not exists!\n");
...@@ -119,6 +116,9 @@ namespace ppocr ...@@ -119,6 +116,9 @@ namespace ppocr
options.offload_copy = offload_copy; options.offload_copy = offload_copy;
migraphx::target gpuTarget = migraphx::gpu::target{}; migraphx::target gpuTarget = migraphx::gpu::target{};
net.compile(gpuTarget, options); net.compile(gpuTarget, options);
float *warm_data = (float*)malloc(this->input_shape.bytes());
memset(warm_data, 1.0, this->input_shape.bytes());
if( this->offload_copy ==false ) if( this->offload_copy ==false )
{ {
hipMalloc(&input_buffer_device, this->input_shape.bytes()); hipMalloc(&input_buffer_device, this->input_shape.bytes());
...@@ -127,14 +127,23 @@ namespace ppocr ...@@ -127,14 +127,23 @@ namespace ppocr
dev_argument[input_name] = migraphx::argument{input_shape, input_buffer_device}; dev_argument[input_name] = migraphx::argument{input_shape, input_buffer_device};
dev_argument[output_name] = migraphx::argument{output_shape, output_buffer_device}; dev_argument[output_name] = migraphx::argument{output_shape, output_buffer_device};
}
//decode hipMemcpy(input_buffer_device,
// ocr = std::make_shared<CTCDecode>(res_mpath,100,32,3,keys_path); (void*)warm_data,
this->input_shape.bytes(),
hipMemcpyHostToDevice);
//warm up
std::vector<migraphx::argument> results = net.eval(dev_argument);
}else{
std::unordered_map<std::string, migraphx::argument> inputData;
inputData[input_name] = migraphx::argument{input_shape, (float *)warm_data};
//warm up
std::vector<migraphx::argument> results = net.eval(inputData);
}
free(warm_data);
} }
OcrDet::~OcrDet() OcrDet::~OcrDet(){
{
if(data) if(data)
{ {
free(data); free(data);
...@@ -142,6 +151,7 @@ namespace ppocr ...@@ -142,6 +151,7 @@ namespace ppocr
} }
if( offload_copy == false ) if( offload_copy == false )
{ {
//内存释放
if(input_buffer_device) if(input_buffer_device)
{ {
hipFree(input_buffer_device); hipFree(input_buffer_device);
...@@ -158,8 +168,7 @@ namespace ppocr ...@@ -158,8 +168,7 @@ namespace ppocr
} }
} }
cv::Size OcrDet::preproc(cv::Mat img,float* data) cv::Size OcrDet::preproc(cv::Mat img,float* data){
{
float scale = 1.0/255.0; float scale = 1.0/255.0;
std::vector<float> s_mean={0.485, 0.456, 0.406}; std::vector<float> s_mean={0.485, 0.456, 0.406};
std::vector<float> s_stdv={0.229, 0.224, 0.225}; std::vector<float> s_stdv={0.229, 0.224, 0.225};
...@@ -189,8 +198,7 @@ namespace ppocr ...@@ -189,8 +198,7 @@ namespace ppocr
return scale_r ; return scale_r ;
} }
std::vector<std::vector<float>> OcrDet::get_mini_boxes(cv::RotatedRect box,float &ssid) std::vector<std::vector<float>> OcrDet::get_mini_boxes(cv::RotatedRect box,float &ssid) {
{
ssid = max(box.size.width, box.size.height); ssid = max(box.size.width, box.size.height);
cv::Mat points; cv::Mat points;
cv::boxPoints(box, points); cv::boxPoints(box, points);
...@@ -252,7 +260,6 @@ namespace ppocr ...@@ -252,7 +260,6 @@ namespace ppocr
auto array = get_mini_boxes(box, ssid); auto array = get_mini_boxes(box, ssid);
auto box_for_unclip = array; auto box_for_unclip = array;
// end get_mini_box
if (ssid < min_size) { if (ssid < min_size) {
continue; continue;
...@@ -260,7 +267,7 @@ namespace ppocr ...@@ -260,7 +267,7 @@ namespace ppocr
float score; float score;
if (use_polygon_score) if (use_polygon_score)
/* compute using polygon*/ //多边形区域的平均得分作为box的分数
score = polygon_score_acc(contours[_i], pred); score = polygon_score_acc(contours[_i], pred);
else else
score = box_score_fast(array, pred); score = box_score_fast(array, pred);
...@@ -268,12 +275,11 @@ namespace ppocr ...@@ -268,12 +275,11 @@ namespace ppocr
if (score < box_thresh) if (score < box_thresh)
continue; continue;
// start for unclip //简化边界得到准确的边界
cv::RotatedRect points = unClip(box_for_unclip, det_db_unclip_ratio); cv::RotatedRect points = unClip(box_for_unclip, det_db_unclip_ratio);
if (points.size.height < 1.001 && points.size.width < 1.001) { if (points.size.height < 1.001 && points.size.width < 1.001) {
continue; continue;
} }
// end for unclip
cv::RotatedRect clipbox = points; cv::RotatedRect clipbox = points;
auto cliparray = get_mini_boxes(clipbox, ssid); auto cliparray = get_mini_boxes(clipbox, ssid);
...@@ -296,12 +302,11 @@ namespace ppocr ...@@ -296,12 +302,11 @@ namespace ppocr
} }
boxes.push_back(intcliparray); boxes.push_back(intcliparray);
} // end for }
return boxes; return boxes;
} }
std::vector<std::vector<float>> OcrDet::Mat2Vector(cv::Mat mat) std::vector<std::vector<float>> OcrDet::Mat2Vector(cv::Mat mat){
{
std::vector<std::vector<float>> img_vec; std::vector<std::vector<float>> img_vec;
std::vector<float> tmp; std::vector<float> tmp;
...@@ -316,8 +321,7 @@ namespace ppocr ...@@ -316,8 +321,7 @@ namespace ppocr
} }
float OcrDet::polygon_score_acc(std::vector<cv::Point> contour, float OcrDet::polygon_score_acc(std::vector<cv::Point> contour,
cv::Mat pred) cv::Mat pred){
{
int width = pred.cols; int width = pred.cols;
int height = pred.rows; int height = pred.rows;
std::vector<float> box_x; std::vector<float> box_x;
...@@ -364,8 +368,7 @@ namespace ppocr ...@@ -364,8 +368,7 @@ namespace ppocr
} }
float OcrDet::box_score_fast(std::vector<std::vector<float>> box_array, float OcrDet::box_score_fast(std::vector<std::vector<float>> box_array,
cv::Mat pred) cv::Mat pred) {
{
auto array = box_array; auto array = box_array;
int width = pred.cols; int width = pred.cols;
int height = pred.rows; int height = pred.rows;
...@@ -402,8 +405,7 @@ namespace ppocr ...@@ -402,8 +405,7 @@ namespace ppocr
return score; return score;
} }
cv::RotatedRect OcrDet::unClip(std::vector<std::vector<float>> box, cv::RotatedRect OcrDet::unClip(std::vector<std::vector<float>> box,
const float &unclip_ratio) const float &unclip_ratio){
{
float distance = 1.0; float distance = 1.0;
get_contour_area(box, unclip_ratio, distance); get_contour_area(box, unclip_ratio, distance);
ClipperLib::ClipperOffset offset; ClipperLib::ClipperOffset offset;
...@@ -433,8 +435,7 @@ namespace ppocr ...@@ -433,8 +435,7 @@ namespace ppocr
} }
void OcrDet::get_contour_area(const std::vector<std::vector<float>> &box, void OcrDet::get_contour_area(const std::vector<std::vector<float>> &box,
float unclip_ratio, float &distance) float unclip_ratio, float &distance) {
{
int pts_num = 4; int pts_num = 4;
float area = 0.0f; float area = 0.0f;
float dist = 0.0f; float dist = 0.0f;
...@@ -452,8 +453,7 @@ namespace ppocr ...@@ -452,8 +453,7 @@ namespace ppocr
std::vector<std::vector<std::vector<int>>> std::vector<std::vector<std::vector<int>>>
OcrDet::filter_det_res(std::vector<std::vector<std::vector<int>>> boxes, OcrDet::filter_det_res(std::vector<std::vector<std::vector<int>>> boxes,
float ratio_h, float ratio_w, cv::Mat srcimg) float ratio_h, float ratio_w, cv::Mat srcimg){
{
int oriimg_h = srcimg.rows; int oriimg_h = srcimg.rows;
int oriimg_w = srcimg.cols; int oriimg_w = srcimg.cols;
...@@ -482,8 +482,7 @@ namespace ppocr ...@@ -482,8 +482,7 @@ namespace ppocr
return root_points; return root_points;
} }
std::vector<std::vector<int>> OcrDet::order_points_clockwise(std::vector<std::vector<int>> pts) std::vector<std::vector<int>> OcrDet::order_points_clockwise(std::vector<std::vector<int>> pts){
{
std::vector<std::vector<int>> box = pts; std::vector<std::vector<int>> box = pts;
std::sort(box.begin(), box.end(), XsortInt); std::sort(box.begin(), box.end(), XsortInt);
std::vector<std::vector<int>> leftmost = {box[0], box[1]}; std::vector<std::vector<int>> leftmost = {box[0], box[1]};
...@@ -500,31 +499,8 @@ namespace ppocr ...@@ -500,31 +499,8 @@ namespace ppocr
return rect; return rect;
} }
void OcrDet::visualize_boxes(const cv::Mat &srcimg,
const std::vector<std::vector<std::vector<int>>> &boxes)
{
cv::Mat img_vis;
srcimg.copyTo(img_vis);
for (int n = 0; n < boxes.size(); n++) {
cv::Point rook_points[4];
// std::cout<<"size :"<<boxes[n].size()<<'\n';
for (int m = 0; m < boxes[n].size(); m++) {
rook_points[m] = cv::Point(int(boxes[n][m][0]), int(boxes[n][m][1]));
}
const cv::Point *ppt[1] = {rook_points};
int npt[] = {4};
cv::polylines(img_vis, ppt, npt, 1, 1, CV_RGB(0, 255, 0), 2, 8, 0);
}
cv::imwrite("./ocr_debug.png", img_vis);
std::cout << "image saved in ./ocr_result.png"
<< std::endl;
}
bool OcrDet::text_recognition(const cv::Mat &srcimg, bool OcrDet::text_recognition(const cv::Mat &srcimg,
const std::vector<std::vector<std::vector<int>>> &boxes) const std::vector<std::vector<std::vector<int>>> &boxes){
{
if(boxes.size() == 0) if(boxes.size() == 0)
{ {
std::cout<<"Not found text roi !\n"; std::cout<<"Not found text roi !\n";
...@@ -540,15 +516,11 @@ namespace ppocr ...@@ -540,15 +516,11 @@ namespace ppocr
rect.width = boxes[n][2][0] - boxes[n][0][0]; rect.width = boxes[n][2][0] - boxes[n][0][0];
rect.height = boxes[n][2][1] - boxes[n][0][1]; rect.height = boxes[n][2][1] - boxes[n][0][1];
text_mat = srcimg(rect).clone(); text_mat = srcimg(rect).clone();
// ocr->forward(text_mat);
// cv::rectangle(srcimg,rect,cv::Scalar(0,255,0),2);
} }
// cv::imwrite("region_debug.jpg",srcimg);
return true; return true;
} }
int OcrDet::postprocess(float* feature, std::vector<std::vector<std::vector<int>>> &boxes) int OcrDet::postprocess(float* feature, std::vector<std::vector<std::vector<int>>> &boxes){
{
int batch_s = 1; int batch_s = 1;
float conf_thres = 0.6; float conf_thres = 0.6;
cv::Mat thres_mat = cv::Mat(cv::Size(output_height,output_width), CV_8UC1); cv::Mat thres_mat = cv::Mat(cv::Size(output_height,output_width), CV_8UC1);
...@@ -574,8 +546,7 @@ namespace ppocr ...@@ -574,8 +546,7 @@ namespace ppocr
return 0; return 0;
} }
bool OcrDet::forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes) bool OcrDet::forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes){
{
std::vector<std::vector<std::vector<int>>> boxes; std::vector<std::vector<std::vector<int>>> boxes;
cv::Size ratio = preproc(img,data); cv::Size ratio = preproc(img,data);
...@@ -608,8 +579,7 @@ namespace ppocr ...@@ -608,8 +579,7 @@ namespace ppocr
float ratio_h = float(net_input_height) / float(img.rows); float ratio_h = float(net_input_height) / float(img.rows);
text_roi_boxes = filter_det_res(boxes, ratio_h, ratio_w, img); text_roi_boxes = filter_det_res(boxes, ratio_h, ratio_w, img);
visualize_boxes(img,text_roi_boxes); // visualize_boxes(img,text_roi_boxes);
// TextRecognition(img,boxes);
return true; return true;
} }
...@@ -620,9 +590,7 @@ namespace ppocr ...@@ -620,9 +590,7 @@ namespace ppocr
int channel, int channel,
int batch_size, int batch_size,
bool offload_copy, bool offload_copy,
std::string character_dict_path) std::string character_dict_path){
{
if(!Exists(rec_model_path)) if(!Exists(rec_model_path))
{ {
LOG_ERROR(stdout, "onnx file not exists!\n"); LOG_ERROR(stdout, "onnx file not exists!\n");
...@@ -634,7 +602,6 @@ namespace ppocr ...@@ -634,7 +602,6 @@ namespace ppocr
this->net_input_channel=channel; this->net_input_channel=channel;
this->precision_mode = precision_mode; this->precision_mode = precision_mode;
migraphx::onnx_options onnx_options; migraphx::onnx_options onnx_options;
onnx_options.map_input_dims["x"] = {1, 3, 48, 720}; onnx_options.map_input_dims["x"] = {1, 3, 48, 720};
...@@ -663,8 +630,6 @@ namespace ppocr ...@@ -663,8 +630,6 @@ namespace ppocr
this->feature_size = output_shape.lens()[2]; this->feature_size = output_shape.lens()[2];
n_channel = this->output_shape.lens()[1]; n_channel = this->output_shape.lens()[1];
std::cout<<"["<<this->output_shape.lens()[0]<<
","<<this->output_shape.lens()[1]<<","<<this->output_shape.lens()[2]<<"]\n";
this->offload_copy = offload_copy; this->offload_copy = offload_copy;
migraphx::compile_options options; migraphx::compile_options options;
...@@ -673,23 +638,37 @@ namespace ppocr ...@@ -673,23 +638,37 @@ namespace ppocr
migraphx::target gpuTarget = migraphx::gpu::target{}; migraphx::target gpuTarget = migraphx::gpu::target{};
net.compile(gpuTarget, options); net.compile(gpuTarget, options);
float *warm_data = (float*)malloc(this->input_shape.bytes());
memset(warm_data, 1.0, this->input_shape.bytes());
if( this->offload_copy ==false ) if( this->offload_copy ==false )
{ {
LOG_INFO(stdout, "Set copy mode ...\n");
hipMalloc(&input_buffer_device, this->input_shape.bytes()); hipMalloc(&input_buffer_device, this->input_shape.bytes());
hipMalloc(&output_buffer_device, this->output_shape.bytes()); hipMalloc(&output_buffer_device, this->output_shape.bytes());
output_buffer_host = (void*)malloc(this->output_shape.bytes()); output_buffer_host = (void*)malloc(this->output_shape.bytes());
dev_argument[input_name] = migraphx::argument{input_shape, input_buffer_device}; dev_argument[input_name] = migraphx::argument{input_shape, input_buffer_device};
dev_argument[output_name] = migraphx::argument{output_shape, output_buffer_device}; dev_argument[output_name] = migraphx::argument{output_shape, output_buffer_device};
}
hipMemcpy(input_buffer_device,
(void*)warm_data,
this->input_shape.bytes(),
hipMemcpyHostToDevice);
//warm up
std::vector<migraphx::argument> results = net.eval(dev_argument);
}else{
std::unordered_map<std::string, migraphx::argument> inputData;
inputData[input_name] = migraphx::argument{input_shape, (float *)warm_data};
//warm up
std::vector<migraphx::argument> results = net.eval(inputData);
}
free(warm_data);
std::ifstream infile; std::ifstream infile;
infile.open(character_dict_path,std::ios::in); infile.open(character_dict_path,std::ios::in);
assert(infile.is_open()); assert(infile.is_open());
std::string k_work=""; std::string k_work="";
k_words.clear(); k_words.clear();
//读取字典文件
while (std::getline(infile,k_work)) while (std::getline(infile,k_work))
{ {
k_words.push_back(k_work); k_words.push_back(k_work);
...@@ -697,8 +676,7 @@ namespace ppocr ...@@ -697,8 +676,7 @@ namespace ppocr
system("chcp 65001"); system("chcp 65001");
} }
CTCDecode::~CTCDecode() CTCDecode::~CTCDecode(){
{
if(data) if(data)
{ {
free(data); free(data);
...@@ -723,8 +701,7 @@ namespace ppocr ...@@ -723,8 +701,7 @@ namespace ppocr
} }
} }
bool CTCDecode::preproc(cv::Mat img,float* data,int img_w,int img_h) bool CTCDecode::preproc(cv::Mat img,float* data,int img_w,int img_h){
{
if (img.empty()) if (img.empty())
{ {
std::cout<<"WARNING image is empty!\n"; std::cout<<"WARNING image is empty!\n";
...@@ -754,25 +731,21 @@ namespace ppocr ...@@ -754,25 +731,21 @@ namespace ppocr
data[i*img_w+j] = (template_mat.at<cv::Vec3b>(i, j)[2]*scale-0.5)/0.5; data[i*img_w+j] = (template_mat.at<cv::Vec3b>(i, j)[2]*scale-0.5)/0.5;
data[i*img_w+j+img_h*img_w] = (template_mat.at<cv::Vec3b>(i, j)[1]*scale-0.5)/0.5; data[i*img_w+j+img_h*img_w] = (template_mat.at<cv::Vec3b>(i, j)[1]*scale-0.5)/0.5;
data[i*img_w+j+2*img_h*img_w] =( template_mat.at<cv::Vec3b>(i, j)[0]*scale-0.5)/0.5; data[i*img_w+j+2*img_h*img_w] =( template_mat.at<cv::Vec3b>(i, j)[0]*scale-0.5)/0.5;
} }
} }
} }
return true ; return true ;
} }
std::string CTCDecode::decode(std::vector<float>& probs,std::vector<int>& indexs,float& mean_prob) std::string CTCDecode::decode(std::vector<float>& probs,std::vector<int>& indexs,float& mean_prob){
{
int ignored_tokens=0; int ignored_tokens=0;
std::string text=""; std::string text="";
std::vector<float> n_probs; std::vector<float> n_probs;
std::vector<int> n_indexs; std::vector<int> n_indexs;
int eff_text_num=0; int eff_text_num=0;
for (int i=0;i<n_channel;i++) for (int i=0;i<n_channel;i++)
{ {
// std::cout<<"s :"<<i<<":"<<indexs[i]<<"-"<<probs[i]<<std::endl;
if(indexs[i]==ignored_tokens) if(indexs[i]==ignored_tokens)
{ {
continue; continue;
...@@ -784,7 +757,6 @@ namespace ppocr ...@@ -784,7 +757,6 @@ namespace ppocr
mean_prob+=probs[i]; mean_prob+=probs[i];
text+=k_words[indexs[i]-1]; text+=k_words[indexs[i]-1];
eff_text_num++; eff_text_num++;
} }
...@@ -801,38 +773,26 @@ namespace ppocr ...@@ -801,38 +773,26 @@ namespace ppocr
} }
std::string CTCDecode::postprocess(float* feature) std::string CTCDecode::postprocess(float* feature)
{ {
//shape 25*6625
std::vector<float> probs; std::vector<float> probs;
std::vector<int> indexs; std::vector<int> indexs;
float prob=0.; float prob=0.;
// std::cout<<"n_channel:"<<n_channel<<", feature_size:"<<feature_size<<std::endl;
for (int i=0;i<n_channel;i++) for (int i=0;i<n_channel;i++)
{ {
float* c_feat = feature+i*feature_size; float* c_feat = feature+i*feature_size;
int max_index = argmax<float*>(c_feat,c_feat+feature_size); int max_index = argmax<float*>(c_feat,c_feat+feature_size);
float max_pro = c_feat[max_index]; float max_pro = c_feat[max_index];
// std::cout<<"step:"<<i<<" max_pro:"<<max_pro<<", max_index:"<<max_index<<std::endl;
probs.push_back(max_pro); probs.push_back(max_pro);
indexs.push_back(max_index); indexs.push_back(max_index);
} }
std::string text = decode(probs,indexs,prob); std::string text = decode(probs,indexs,prob);
std::cout<<"ocr res :["<<text<<"]\n"; std::cout<<"ocr res :"<<text<<" "<<prob<<"\n";
return text; return text;
} }
std::string CTCDecode::forward(cv::Mat& img) std::string CTCDecode::forward(cv::Mat& img){
{
preproc(img,data,net_input_width,net_input_height); preproc(img,data,net_input_width,net_input_height);
// std::unordered_map<std::string, migraphx::argument> inputData;
// inputData[input_name] = migraphx::argument{input_shape, data};
// std::vector<migraphx::argument> results = net.eval(inputData);
// migraphx::argument result = results[0];
if( this->offload_copy ==false ) if( this->offload_copy ==false )
{ {
hipMemcpy(input_buffer_device, hipMemcpy(input_buffer_device,
...@@ -846,8 +806,6 @@ namespace ppocr ...@@ -846,8 +806,6 @@ namespace ppocr
(void*)output_buffer_device, (void*)output_buffer_device,
output_shape.bytes(), output_shape.bytes(),
hipMemcpyDeviceToHost); hipMemcpyDeviceToHost);
// std::cout<<"ctc: copy mode ..."<<std::endl;
std::string text = postprocess((float *)output_buffer_device); std::string text = postprocess((float *)output_buffer_device);
return text; return text;
}else{ }else{
...@@ -856,44 +814,63 @@ namespace ppocr ...@@ -856,44 +814,63 @@ namespace ppocr
std::vector<migraphx::argument> results = net.eval(inputData); std::vector<migraphx::argument> results = net.eval(inputData);
migraphx::argument result = results[0] ; migraphx::argument result = results[0] ;
std::string text = postprocess((float *)result.data()); std::string text = postprocess((float *)result.data());
// std::cout<<"ctc: offload copy mode ..."<<std::endl;
return text; return text;
} }
//get output data (first node)
// migraphx::shape outputShape = result.get_shape();
// int numberOfOutput = outputShape.elements();
// std::vector<std::size_t> outputSize = outputShape.lens();
// std::cout<<"output size:"<<outputSize.size()<<std::endl;
// for(int i = 0; i < outputSize.size(); i++)
// {
// std::cout << outputSize[i] << " ";
// }
} }
ppOcrEngine::ppOcrEngine(const std::string &det_model_path, ppOcrEngine::ppOcrEngine(const std::string &det_model_path,
const std::string &rec_model_path, const std::string &rec_model_path,
const std::string &character_dict_path, const std::string &character_dict_path,
const std::string front,
float segm_thres, float segm_thres,
float box_thresh, float box_thresh,
bool offload_copy, bool offload_copy,
std::string precision_mode ){ std::string precision_mode
){
text_detector = std::make_shared<OcrDet>(det_model_path,precision_mode,offload_copy,segm_thres,box_thresh); text_detector = std::make_shared<OcrDet>(det_model_path,precision_mode,offload_copy,segm_thres,box_thresh);
text_recognizer = std::make_shared<CTCDecode>(rec_model_path,precision_mode,720,48,3,1,offload_copy,character_dict_path); text_recognizer = std::make_shared<CTCDecode>(rec_model_path,precision_mode,720,48,3,1,offload_copy,character_dict_path);
ft2 = std::make_shared<PutText>(front.c_str());
} }
ppOcrEngine::~ppOcrEngine() ppOcrEngine::~ppOcrEngine(){
{
; ;
} }
std::vector<std::string> ppOcrEngine::forward(cv::Mat &srcimg) void ppOcrEngine::visualize_boxes(cv::Mat &srcimg,
const std::vector<std::vector<std::vector<int>>> &boxes) {
std::vector<std::vector<cv::Point>> contours;
for (const auto& box : boxes) {
std::vector<cv::Point> pts;
for (const auto& point : box) {
pts.emplace_back(point[0], point[1]);
}
contours.push_back(pts);
}
cv::polylines(
srcimg,
contours,
true, // 是否闭合
cv::Scalar(0, 255, 0), // 默认绿色
2, // 线宽
cv::LINE_8 // 8连通线
);
}
cv::Mat ppOcrEngine::visualize_text(std::vector<std::string> texts,std::vector<cv::Point> points, cv::Mat &img)
{ {
assert(texts.size()==points.size()),"error texts size != points size";
cv::Mat draw_img = cv::Mat(img.size(), CV_8UC3,cv::Scalar(255,255,255));
int width = img.cols*2;
int height = img.rows;
cv::Mat templete_img = cv::Mat(width,height, CV_8UC3,cv::Scalar(255,255,255));
for(int i = 0 ; i < texts.size(); i++)
{
ft2->putText(draw_img,texts[i],points[i].x,points[i].y,15);
}
cv::hconcat(img, draw_img, templete_img);
return templete_img;
}
std::vector<std::string> ppOcrEngine::forward(cv::Mat &srcimg){
std::vector<std::vector<std::vector<int>>> text_roi_boxes; std::vector<std::vector<std::vector<int>>> text_roi_boxes;
std::vector<std::string> text_vec; std::vector<std::string> text_vec;
...@@ -904,7 +881,8 @@ namespace ppocr ...@@ -904,7 +881,8 @@ namespace ppocr
std::cout<<"Not found text roi !\n"; std::cout<<"Not found text roi !\n";
return std::vector<std::string>(); return std::vector<std::string>();
} }
std::cout<<"text_roi_boxes.size(): "<<text_roi_boxes.size()<<"\n";
std::vector<cv::Point> points;
for (int n = 0; n < text_roi_boxes.size(); n++) { for (int n = 0; n < text_roi_boxes.size(); n++) {
cv::Rect rect; cv::Rect rect;
...@@ -920,10 +898,14 @@ namespace ppocr ...@@ -920,10 +898,14 @@ namespace ppocr
text_roi_mat = srcimg(rect).clone(); text_roi_mat = srcimg(rect).clone();
std::string text = text_recognizer->forward(text_roi_mat); std::string text = text_recognizer->forward(text_roi_mat);
text_vec.push_back(text); text_vec.push_back(text);
points.push_back(cv::Point(rect.x,rect.y));
} }
auto end = std::chrono::high_resolution_clock::now(); auto end = std::chrono::high_resolution_clock::now();
auto duration_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start); auto duration_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
std::cout<<"Time taken by task: "<< duration_ms.count() <<" ms\n"; std::cout<<"[Time info] elapsed: "<< duration_ms.count() <<" ms\n";
visualize_boxes(srcimg,text_roi_boxes);
cv::Mat res_img = visualize_text(text_vec,points, srcimg);
cv::imwrite("res.jpg",res_img);
return text_vec; return text_vec;
} }
......
...@@ -10,15 +10,9 @@ ...@@ -10,15 +10,9 @@
#include "Filesystem.h" #include "Filesystem.h"
#include "SimpleLog.h" #include "SimpleLog.h"
#include "clipper.h" #include "clipper.h"
#include "cv_put_Text.hpp"
namespace ppocr{ namespace ppocr{
struct _TEXT_BOX
{
cv::Rect t_rect;
float score;
};
using T_BOX = struct _TEXT_BOX;
class CTCDecode class CTCDecode
{ {
private: private:
...@@ -38,7 +32,6 @@ namespace ppocr{ ...@@ -38,7 +32,6 @@ namespace ppocr{
void* output_buffer_device; void* output_buffer_device;
void* output_buffer_host; void* output_buffer_host;
migraphx::shape input_shape; migraphx::shape input_shape;
migraphx::shape output_shape; migraphx::shape output_shape;
std::string input_name; std::string input_name;
...@@ -51,7 +44,7 @@ namespace ppocr{ ...@@ -51,7 +44,7 @@ namespace ppocr{
public: public:
CTCDecode(std::string rec_model_path, CTCDecode(std::string rec_model_path,
std::string precision_mode="fp32", std::string precision_mode="fp16",
int image_width=480, int image_width=480,
int image_height=48, int image_height=48,
int channel=3, int channel=3,
...@@ -61,7 +54,9 @@ namespace ppocr{ ...@@ -61,7 +54,9 @@ namespace ppocr{
~CTCDecode(); ~CTCDecode();
/** /**
* @brief 字符识别编码,可支持,最长可支持预测90个字符,18385个字符 * @brief 字符识别、编码API 字符识别编码,可支持,最长可支持预测90个字符,18385个字符
* @param img 输入图片
* @return 编码后的字符串
*/ */
std::string forward(cv::Mat& img); std::string forward(cv::Mat& img);
...@@ -93,7 +88,6 @@ namespace ppocr{ ...@@ -93,7 +88,6 @@ namespace ppocr{
* @return 成功:text,失败:"" * @return 成功:text,失败:""
*/ */
std::string decode(std::vector<float>& probs,std::vector<int>& indexs,float& mean_prob); std::string decode(std::vector<float>& probs,std::vector<int>& indexs,float& mean_prob);
}; };
class OcrDet class OcrDet
...@@ -117,25 +111,33 @@ namespace ppocr{ ...@@ -117,25 +111,33 @@ namespace ppocr{
float* data; float* data;
//Allocate device buffer and host buffer,if offload_copy is false //当offload_copy为true时,分配设备内存
std::unordered_map<std::string, migraphx::argument> dev_argument; std::unordered_map<std::string, migraphx::argument> dev_argument;
void* input_buffer_device; void* input_buffer_device;
void* output_buffer_device; void* output_buffer_device;
void* output_buffer_host; void* output_buffer_host;
//postprocess //后处理
int n_channel; int n_channel;
int feature_size; //single channel feature map size. int feature_size; //单个通道的特征大小,例如模型输出[1,3,32,32],feature_size= 32x32.
int output_width; int output_width;
int output_height; int output_height;
int max_candidates;//maximun number of candidates contours. int max_candidates;//最大检测的候选区域.
public: public:
OcrDet(std::string det_model_path, OcrDet(std::string det_model_path,
std::string precision_mode="float32", std::string precision_mode="fp16",
bool offload_copy = true, bool offload_copy = true,
float segm_thres = 0.3, float segm_thres = 0.3,
float box_thresh = 0.7); float box_thresh = 0.7);
~OcrDet(); ~OcrDet();
/**
* @brief 字符检测模型推理API
* @param img 原始图片
* @param text_roi_boxes 字符区域坐标,格式:[[[tl.x, tl.y], [tr.x, tr.y],[], [br.x, br.y], [bl.x, bl.y]]]]
* | | | |
* 左上坐标 右上坐标 右下坐标 左下坐标
* @return 成功返回true,失败返回false
*/
bool forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes); bool forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes);
private: private:
...@@ -159,11 +161,12 @@ namespace ppocr{ ...@@ -159,11 +161,12 @@ namespace ppocr{
*/ */
int postprocess(float* feature, std::vector<std::vector<std::vector<int>>> &boxes); int postprocess(float* feature, std::vector<std::vector<std::vector<int>>> &boxes);
/**
* @brief 后处理,文本区域提取
* @param pred 二值图(这里字符检测使用了dbnet分割字符区域,二值图对应了文本区域)
int boxes_from_bitmap(cv::Mat& bit_map,std::vector<T_BOX>& box); * @param bitmap 二值图(pred做形态学运算输出bitmap,结合pred结算平均边框得分)
* @return 成功:0,失败:-1
*/
std::vector<std::vector<std::vector<int>>>boxes_from_bitmap( std::vector<std::vector<std::vector<int>>>boxes_from_bitmap(
const cv::Mat pred, const cv::Mat bitmap, const float &box_thresh, const cv::Mat pred, const cv::Mat bitmap, const float &box_thresh,
const float &det_db_unclip_ratio, const bool &use_polygon_score); const float &det_db_unclip_ratio, const bool &use_polygon_score);
...@@ -238,9 +241,6 @@ namespace ppocr{ ...@@ -238,9 +241,6 @@ namespace ppocr{
*/ */
float box_score_fast(std::vector<std::vector<float>> box_array,cv::Mat pred) ; float box_score_fast(std::vector<std::vector<float>> box_array,cv::Mat pred) ;
void visualize_boxes(const cv::Mat &srcimg,
const std::vector<std::vector<std::vector<int>>> &boxes) ;
bool text_recognition(const cv::Mat &srcimg, bool text_recognition(const cv::Mat &srcimg,
const std::vector<std::vector<std::vector<int>>> &boxes); const std::vector<std::vector<std::vector<int>>> &boxes);
...@@ -250,16 +250,21 @@ namespace ppocr{ ...@@ -250,16 +250,21 @@ namespace ppocr{
private: private:
std::shared_ptr<OcrDet> text_detector; std::shared_ptr<OcrDet> text_detector;
std::shared_ptr<CTCDecode> text_recognizer; std::shared_ptr<CTCDecode> text_recognizer;
std::shared_ptr<PutText> ft2 ;
public: public:
ppOcrEngine(const std::string &det_model_path, ppOcrEngine(const std::string &det_model_path,
const std::string &rec_model_path, const std::string &rec_model_path,
const std::string &character_dict_path, const std::string &character_dict_path,
const std::string front,
const float segm_thres=0.3, const float segm_thres=0.3,
const float box_thresh=0.7, const float box_thresh=0.7,
bool offload_copy =true, bool offload_copy =true,
std::string precision_mode = "fp32") ; std::string precision_mode = "fp16") ;
~ppOcrEngine(); ~ppOcrEngine();
std::vector<std::string> forward(cv::Mat &srcimg); std::vector<std::string> forward(cv::Mat &srcimg);
cv::Mat visualize_text(std::vector<std::string> texts,std::vector<cv::Point> points, cv::Mat &img);
void visualize_boxes(cv::Mat &srcimg,
const std::vector<std::vector<std::vector<int>>> &boxes) ;
}; };
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment