Commit 417a4ca0 authored by liuhy

1. Add a warm-up step before inference. 2. Add overlaying of the recognized OCR text onto the result image.

parent 369751c2
@@ -10,6 +10,7 @@ set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} -std=c++17)
 set(CMAKE_BUILD_TYPE release)
 set(INCLUDE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/Src/
+                 /usr/include/freetype2
                  $ENV{DTKROOT}/include/
                  ${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility
                  ${CMAKE_CURRENT_SOURCE_DIR}/3rdParty/opencv/include)
@@ -17,6 +18,7 @@ include_directories(${INCLUDE_PATH})
 # Add dependency library paths
 set(LIBRARY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/3rdParty/opencv/lib
+                 /usr/lib/x86_64-linux-gnu
                  $ENV{DTKROOT}/lib/)
 link_directories(${LIBRARY_PATH})
@@ -24,6 +26,7 @@ link_directories(${LIBRARY_PATH})
 set(LIBRARY opencv_core
             opencv_imgproc
             opencv_imgcodecs
+            freetype
             opencv_dnn
             migraphx
             migraphx_gpu
@@ -36,6 +39,7 @@ link_libraries(${LIBRARY})
 set(SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/Src/main.cpp
                  ${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility/Filesystem.cpp
                  ${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility/clipper.cpp
+                 ${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility/cv_put_Text.cpp
                  ${CMAKE_CURRENT_SOURCE_DIR}/Src/ocr_engine.cpp)
 # Add executable target
...
Doc/Images/CRNN.png: image replaced (112 KB -> 96.4 KB)
Doc/Images/DBNet.png: image replaced (597 KB -> 311 KB)
@@ -4,11 +4,15 @@ PP-OCRv5 is the new generation of the PP-OCR text recognition solution, focusing on multi-scenario
 ## Model overview
 ### Text detection
-Text detection uses DBNet (paper: https://arxiv.org/pdf/1911.08947); network structure: ![alt text](Images/DBNet.png). The model outputs a probability map, and the Vatti clipping algorithm is used to simplify the text-region polygons (the sample relies on the Clipper library). The sample uses a dynamic shape (N,3,H,W) with a maximum input shape of [1,3,640,640]. Model path: Resource/Models/ppocrv5_server_det_infer.onnx
+Text detection uses DBNet (paper: https://arxiv.org/pdf/1911.08947); network structure:
+![alt text](Images/DBNet.png)
+The model outputs a probability map, and the Vatti clipping algorithm is used to simplify the text-region polygons (the sample relies on the Clipper library). The sample's model input shape is [1,3,640,640]. Model path: Resource/Models/ppocrv5_server_det_infer.onnx
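As background for the postprocessing just mentioned, the flow is: threshold the probability map, find contours, then grow each polygon outward with Vatti clipping before taking its bounding box. A minimal sketch, assuming pyclipper (the library the Python sample imports); `prob_map`, the threshold, and the unclip ratio are illustrative, not the repo's exact implementation:
```python
import cv2
import numpy as np
import pyclipper

def boxes_from_prob_map(prob_map, seg_thresh=0.3, unclip_ratio=1.5):
    # Binarize the DBNet probability map.
    bitmap = (prob_map > seg_thresh).astype(np.uint8)
    contours, _ = cv2.findContours(bitmap * 255, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    boxes = []
    for contour in contours:
        pts = contour.reshape(-1, 2)
        if len(pts) < 4:
            continue
        area = cv2.contourArea(pts)
        length = cv2.arcLength(pts, True)
        if length == 0:
            continue
        # Offset distance derived from the polygon's area and perimeter.
        distance = area * unclip_ratio / length
        # Vatti clipping: expand the polygon outward by `distance`.
        offset = pyclipper.PyclipperOffset()
        offset.AddPath(pts.tolist(), pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
        expanded = offset.Execute(distance)
        if expanded:
            rect = cv2.minAreaRect(np.array(expanded[0]))
            boxes.append(cv2.boxPoints(rect).astype(np.int32))
    return boxes
```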
 ### Text recognition
-Text recognition uses CRNN+CTCDecode (https://arxiv.org/pdf/2009.09941); network structure: ![alt text](Images/CRNN.png). The sample uses a dynamic shape (N,3,48,W) with a maximum input shape of [1,3,48,720]. Model path: Resource/Models/ppocrv5_server_rec_infer.onnx
+Text recognition uses CRNN+CTCDecode (https://arxiv.org/pdf/2009.09941); network structure:
+![alt text](Images/CRNN.png)
+The sample's model input shape is [1,3,48,720]. Model path: Resource/Models/ppocrv5_server_rec_infer.onnx
 ## Preprocessing
 ### Detection model preprocessing
 Detection model input preprocessing:
@@ -110,7 +114,7 @@ class ppOcrEngine {
                 const float segm_thres=0.3,
                 const float box_thresh=0.7,
                 bool offload_copy =true,
-                std::string precision_mode = "fp32") ;
+                std::string precision_mode = "fp16") ;
     /**
      * @brief Initialize the OCR engine
      * @param det_model_path path of the text detection model
@@ -119,7 +123,7 @@ class ppOcrEngine {
      * @param segm_thres pixel segmentation threshold
      * @param box_thresh text-region box threshold
      * @param offload_copy memory copy mode. Two modes are supported: offload_copy=true and offload_copy=false. With true, no explicit memory copies are needed; with false, device memory for inputs and outputs must be pre-allocated, the preprocessed data is copied to device memory before inference, and the model output is copied back from device memory afterwards
-     * @param precision_mode precision mode, supports fp32 and fp16
+     * @param precision_mode precision mode, supports fp32 and fp16; the default is fp16
      *
      * @return NONE
      */
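The two copy modes above map onto MIGraphX's Python API in a few lines; a condensed, non-authoritative sketch (`model`, `input_name`, `data`, and the pre-allocated `d_mem` are stand-in names, mirroring the Python tutorial later in this commit):
```python
import numpy as np
import migraphx

def run_once(model, input_name, data, offload_copy=True, d_mem=None):
    if offload_copy:
        # offload_copy=True: MIGraphX moves data between host and device internally.
        results = model.run({input_name: migraphx.argument(data)})
        return np.array(results[0])
    # offload_copy=False: d_mem holds pre-allocated device output buffers;
    # copy the preprocessed input to the GPU, run, then copy the result back.
    d_mem[input_name] = migraphx.to_gpu(migraphx.argument(data))
    results = model.run(d_mem)
    return np.array(migraphx.from_gpu(results[0]))
```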
@@ -130,36 +134,11 @@ class ppOcrEngine {
 class CTCDecode
 {
 private:
-    //inference image
-    float* data;
-    std::unordered_map<std::string, migraphx::argument> device_data;
-    migraphx::program net;
-    int batch_size;
-    int net_input_width;
-    int net_input_height;
-    int net_input_channel;
-    bool offload_copy;
-    std::string precision_mode;
-    std::unordered_map<std::string, migraphx::argument> dev_argument;
-    void* input_buffer_device;
-    void* output_buffer_device;
-    void* output_buffer_host;
-    migraphx::shape input_shape;
-    migraphx::shape output_shape;
-    std::string input_name;
-    std::string output_name;
-    //postprocess: n_channel -> model output channels, feature_size -> feature size of one channel
-    int n_channel;
-    int feature_size;
-    std::vector<std::string> k_words;
+    ...
 public:
     CTCDecode(std::string rec_model_path,
-              std::string precision_mode="fp32",
+              std::string precision_mode="fp16",
               int image_width=480,
               int image_height=48,
               int channel=3,
@@ -169,73 +148,21 @@ class ppOcrEngine {
     ~CTCDecode();
     /**
-     * @brief Text recognition and decoding; supports predicting at most 90 characters over a character set of 18385 characters
+     * @brief Text recognition and decoding API; supports predicting at most 90 characters over a character set of 18385 characters
+     * @param img input image
+     * @return the decoded string
      */
     std::string forward(cv::Mat& img);
 private:
-    /**
-     * @brief Preprocessing
-     *        pixel = (src_img*scale - 0.5)/0.5;
-     *        scale = 1.0/255
-     * @param img text image
-     * @param data preprocessing output
-     * @param img_w model input width
-     * @param img_h model input height
-     * @return true on success, false on failure
-     */
-    bool preproc(cv::Mat img,float* data,int img_w=480,int img_h=48);
-    /**
-     * @brief Model postprocessing: take the most probable character in each row and assemble a sentence of at most 90 characters; the model output shape is [1,90,18385]
-     * @param feature model output
-     * @return the text on success, "" on failure
-     */
-    std::string postprocess(float* feature);
-    /**
-     * @brief Decoding: map the model predictions onto the character set
-     * @param probs maximum probabilities predicted by the model
-     * @param indexs indices of the maximum probabilities
-     * @param mean_prob average probability of the predicted sentence
-     * @return the text on success, "" on failure
-     */
-    std::string decode(std::vector<float>& probs,std::vector<int>& indexs,float& mean_prob);
+    ...
 };
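The postprocess/decode pair described above is a standard greedy CTC decode. A minimal numpy sketch under the stated shapes, assuming a model output `feature` of [1, 90, 18385] and a character list `k_words` (the member named in the class); the blank index 0 follows the decoder code later in this commit:
```python
import numpy as np

def ctc_greedy_decode(feature, k_words, blank_id=0):
    """feature: [1, T, C] probabilities; k_words: index -> character."""
    probs = feature[0].max(axis=1)      # best probability per time step
    indexs = feature[0].argmax(axis=1)  # best character index per time step
    chars, confs = [], []
    prev = blank_id
    for idx, p in zip(indexs, probs):
        # CTC rule: drop blanks and collapse consecutive repeats.
        if idx != blank_id and idx != prev:
            chars.append(k_words[idx])
            confs.append(p)
        prev = idx
    mean_prob = float(np.mean(confs)) if confs else 0.0
    return "".join(chars), mean_prob
```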
 class OcrDet
 {
 private:
-    std::string precision_mode;
-    bool offload_copy;
-    migraphx::program net;
-    migraphx::shape input_shape;
-    migraphx::shape output_shape;
-    std::string input_name;
-    std::string output_name;
-    int det_batch_size;
-    int data_size ;
-    float segm_thres;
-    float box_thres;
-    int net_input_width;
-    int net_input_height;
-    int net_input_channel;
-    float* data;
-    //Allocate device buffer and host buffer, if offload_copy is false
-    std::unordered_map<std::string, migraphx::argument> dev_argument;
-    void* input_buffer_device;
-    void* output_buffer_device;
-    void* output_buffer_host;
-    //postprocess
-    int n_channel;
-    int feature_size; //single-channel feature map size
-    int output_width;
-    int output_height;
-    int max_candidates; //maximum number of candidate contours
+    ...
 public:
     OcrDet(std::string det_model_path,
@@ -244,113 +171,19 @@ class ppOcrEngine {
            float segm_thres = 0.3,
            float box_thresh = 0.7);
     ~OcrDet();
-    bool forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes);
-private:
-    /**
-     * @brief Preprocessing
-     *        pixel = (scale*src_img - mean)/std;
-     *        scale = 1.0/255
-     *        mean = [0.485, 0.456, 0.406]
-     *        std = [0.229, 0.224, 0.225]
-     * @param img text image
-     * @param data preprocessing output
-     * @return the scaling ratios in the w and h dimensions on success
-     */
-    cv::Size preproc(cv::Mat img,float* data);
-    /**
-     * @brief Postprocessing: extract text regions from the binary map predicted by the model
-     * @param feature predicted tensor (text detection uses DBNet here)
-     * @param boxes text-region coordinates
-     * @return 0 on success, -1 on failure
-     */
-    int postprocess(float* feature, std::vector<std::vector<std::vector<int>>> &boxes);
-    int boxes_from_bitmap(cv::Mat& bit_map,std::vector<T_BOX>& box);
-    std::vector<std::vector<std::vector<int>>>boxes_from_bitmap(
-        const cv::Mat pred, const cv::Mat bitmap, const float &box_thresh,
-        const float &det_db_unclip_ratio, const bool &use_polygon_score);
-    std::vector<std::vector<float>> Mat2Vector(cv::Mat mat);
-    /**
-     * @brief Compute the average score inside a polygon region
-     * @param contour contour points of the text region
-     * @param pred binary map predicted by the model
-     * @return score
-     */
-    float polygon_score_acc(std::vector<cv::Point> contour,cv::Mat pred);
-    /**
-     * @brief Shrink or expand the predicted region by unclip_ratio to find a better-fitting text region
-     * @param box text-region coordinates
-     * @param pred binary map predicted by the model
-     * @return the processed text region
-     */
-    cv::RotatedRect unClip(std::vector<std::vector<float>> box,
-                           const float &unclip_ratio);
-    /**
-     * @brief Compute the offset distance
-     *        distance = area * unclip_ratio / dist;
-     *        area = ∑(x_i*y_{i+1} - x_{i+1}*y_i)
-     *        dist = sqrtf(dx * dx + dy * dy)
-     * @param box text-region coordinates
-     * @param unclip_ratio scaling ratio
-     * @param distance offset distance
-     * @return NONE
-     */
-    void get_contour_area(const std::vector<std::vector<float>> &box,
-                          float unclip_ratio, float &distance) ;
-    /**
-     * @brief Filter out invalid text regions: first map the boxes back to the original image, then drop invalid ones
-     * @param boxes text-region coordinates
-     * @param ratio_h vertical scaling ratio
-     * @param ratio_w horizontal scaling ratio
-     * @param srcimg original image
-     * @return valid text-region coordinates
-     */
-    std::vector<std::vector<std::vector<int>>> filter_det_res(std::vector<std::vector<std::vector<int>>> boxes,
-                                                              float ratio_h, float ratio_w, cv::Mat srcimg);
-    /**
-     * @brief Sort the text regions top-to-bottom, left-to-right
-     * @param pts text-region coordinates
-     * @return sorted text-region coordinates
-     */
-    std::vector<std::vector<int>> order_points_clockwise(std::vector<std::vector<int>> pts);
-    /**
-     * @brief Get the minimum bounding rectangle
-     * @param box coordinates of the region's minimum bounding rectangle
-     * @param ssid the longest side of the box
-     * @return valid text-region coordinates
-     */
-    std::vector<std::vector<float>> get_mini_boxes(cv::RotatedRect box,float &ssid) ;
-    /**
-     * @brief Compute the average score of the t_rect region on the bitmap
-     * @param box_array text regions predicted by the model
-     * @param pred binary map predicted by the model
-     * @return score
-     */
-    float box_score_fast(std::vector<std::vector<float>> box_array,cv::Mat pred) ;
-    void visualize_boxes(const cv::Mat &srcimg,
-                         const std::vector<std::vector<std::vector<int>>> &boxes) ;
-    bool text_recognition(const cv::Mat &srcimg,
-                          const std::vector<std::vector<std::vector<int>>> &boxes);
+    /**
+     * @brief Text detection inference API
+     * @param img original image
+     * @param text_roi_boxes text-region coordinates, format: [[[tl.x, tl.y], [tr.x, tr.y], [br.x, br.y], [bl.x, bl.y]]]
+     *                       (top-left, top-right, bottom-right, bottom-left corners)
+     * @return true on success, false on failure
+     */
+    bool forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes);
+private:
+    ...
 };
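The get_contour_area formula documented above is the shoelace area scaled by unclip_ratio and divided by the polygon perimeter. A direct numpy transcription, offered as a sketch rather than the repo's implementation:
```python
import numpy as np

def get_contour_area(box, unclip_ratio):
    """box: list of [x, y] vertices. Returns the unclip offset distance."""
    pts = np.asarray(box, dtype=np.float32)
    nxt = np.roll(pts, -1, axis=0)  # vertex i+1, wrapping around
    # Shoelace formula: area = 0.5 * |sum(x_i*y_{i+1} - x_{i+1}*y_i)|
    area = 0.5 * abs(np.sum(pts[:, 0] * nxt[:, 1] - nxt[:, 0] * pts[:, 1]))
    # Perimeter: sum of edge lengths sqrt(dx*dx + dy*dy)
    dist = np.sum(np.hypot(*(nxt - pts).T))
    return area * unclip_ratio / dist
```
The resulting distance is what unClip expands the polygon by via the Vatti clipping offset.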
@@ -358,119 +191,84 @@ class ppOcrEngine {
 ## Inference
-### Text detection model inference
+- text detection
+- text recognition and decoding
+- text-box visualization
+- OCR result visualization
 ```c++
-bool OcrDet::forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes)
-{
-    std::vector<std::vector<std::vector<int>>> boxes;
-    //preprocess the input data
-    cv::Size ratio = preproc(img,data);
-    /*
-    Two copy modes are supported: offload_copy=true and offload_copy=false. With true, no explicit memory copies are needed; with false, device memory for inputs and outputs must be pre-allocated, the preprocessed data copied to device memory before inference, and the model output copied back from device memory afterwards for postprocessing.
-    */
-    if( this->offload_copy ==false )
-    {
-        hipMemcpy(input_buffer_device,
-                  (void*)data,
-                  this->input_shape.bytes(),
-                  hipMemcpyHostToDevice);
-        std::vector<migraphx::argument> results = net.eval(dev_argument);
-        hipMemcpy(output_buffer_host,
-                  (void*)output_buffer_device,
-                  output_shape.bytes(),
-                  hipMemcpyDeviceToHost);
-        postprocess((float *)output_buffer_host,boxes);
-        std::cout<<"copy mode ..."<<std::endl;
-    }else{
-        std::unordered_map<std::string, migraphx::argument> inputData;
-        inputData[input_name] = migraphx::argument{input_shape, (float *)data};
-        std::vector<migraphx::argument> results = net.eval(inputData);
-        migraphx::argument result = results[0] ; //get the output data
-        postprocess((float *)result.data(),boxes);
-        std::cout<<"offload copy mode ..."<<std::endl;
-    }
-    //compute the uniform scaling ratios
-    float ratio_w = float(net_input_width) / float(img.cols);
-    float ratio_h = float(net_input_height) / float(img.rows);
-    //filter out invalid boxes
-    text_roi_boxes = filter_det_res(boxes, ratio_h, ratio_w, img);
-    //visualize the detection results
-    visualize_boxes(img,text_roi_boxes);
-    // TextRecognition(img,boxes);
-    return true;
-}
-```
-### Text recognition inference
-```c++
-std::string CTCDecode::forward(cv::Mat& img)
-{
-    //preprocess
-    preproc(img,data,net_input_width,net_input_height);
-    /*
-    Two copy modes are supported: offload_copy=true and offload_copy=false (see the detection inference above).
-    */
-    if( this->offload_copy ==false )
-    {
-        hipMemcpy(input_buffer_device,
-                  (void*)data,
-                  this->input_shape.bytes(),
-                  hipMemcpyHostToDevice);
-        std::vector<migraphx::argument> results = net.eval(dev_argument);
-        hipMemcpy(output_buffer_host,
-                  (void*)output_buffer_device,
-                  output_shape.bytes(),
-                  hipMemcpyDeviceToHost);
-        //postprocess: take each character's maximum probability and index, look up the character in the dictionary by index, then assemble the sentence
-        std::string text = postprocess((float *)output_buffer_device);
-        return text;
-    }else{
-        std::unordered_map<std::string, migraphx::argument> inputData;
-        inputData[input_name] = migraphx::argument{input_shape, (float *)data};
-        std::vector<migraphx::argument> results = net.eval(inputData);
-        migraphx::argument result = results[0] ;
-        std::string text = postprocess((float *)result.data());
-        // std::cout<<"ctc: offload copy mode ..."<<std::endl;
-        return text;
-    }
-}
+std::vector<std::string> ppOcrEngine::forward(cv::Mat &srcimg){
+    std::vector<std::vector<std::vector<int>>> text_roi_boxes;
+    std::vector<std::string> text_vec;
+    auto start = std::chrono::high_resolution_clock::now();
+    //text region detection
+    text_detector->forward(srcimg,text_roi_boxes);
+    if(text_roi_boxes.size() == 0)
+    {
+        std::cout<<"Not found text roi !\n";
+        return std::vector<std::string>();
+    }
+    std::vector<cv::Point> points;
+    //text recognition + decoding
+    for (int n = 0; n < text_roi_boxes.size(); n++) {
+        cv::Rect rect;
+        cv::Mat text_roi_mat;
+        rect.x = text_roi_boxes[n][0][0];
+        rect.y = text_roi_boxes[n][0][1];
+        rect.width = text_roi_boxes[n][2][0] - text_roi_boxes[n][0][0];
+        rect.height = text_roi_boxes[n][2][1] - text_roi_boxes[n][0][1];
+        if(rect.width <3 || rect.height<3)
+        {
+            continue;
+        }
+        text_roi_mat = srcimg(rect).clone();
+        std::string text = text_recognizer->forward(text_roi_mat);
+        text_vec.push_back(text);
+        points.push_back(cv::Point(rect.x,rect.y));
+    }
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+    std::cout<<"[Time info] elapsed: "<< duration_ms.count() <<" ms\n";
+    //text box visualization
+    visualize_boxes(srcimg,text_roi_boxes);
+    //OCR result visualization
+    cv::Mat res_img = visualize_text(text_vec,points, srcimg);
+    ...
+}
 ```
 # Ocrv5 API usage
 The API is called in the following steps:
 - instantiate the class
+- read the test image
 - call the recognition interface
 Example:
 ```c++
-int main(int argc, char** argv)
-{
+int main(int argc, char** argv){
     std::string det_model_onnx = "../Resource/Models/ppocrv5_server_det_infer.onnx";
     std::string rec_model_onnx = "../Resource/Models/ppocrv5_server_rec_infer.onnx";
-    std::string img_path = "../Resource/Images/20250703205038.png";
+    std::string img_path = "../Resource/Images/demo.png";
     std::string character_dict_path = "../Resource/ppocr_keys_v5.txt";
+    std::string front = "../Resource/fonts/SimHei.ttf";
     float segm_thres=0.3;
     float box_thresh=0.3;
     ppOcrEngine ocr_engine(det_model_onnx,
                            rec_model_onnx,
                            character_dict_path,
+                           front,
                            segm_thres,
                            box_thresh,
                            true,
-                           "fp32");
+                           "fp16");
     cv::Mat img=cv::imread(img_path);
     ocr_engine.forward(img);
     return 0;
 }
 ```
-The sample supports inference in two precisions (fp32 and fp16, default fp32); the precision and the memory copy mode are set via the ocr_engine constructor arguments.
+The sample supports inference in two precisions (fp32 and fp16, default fp16); the precision and the memory copy mode are set via the ocr_engine constructor arguments.
\ No newline at end of file
@@ -4,10 +4,14 @@ PP-OCRv5 is the new generation of the PP-OCR text recognition solution, focusing on multi-scenario
 ## Model overview
 ### Text detection
-Text detection uses DBNet (paper: https://arxiv.org/pdf/1911.08947); network structure: ![alt text](Images/DBNet.png). The model outputs a probability map, and the Vatti clipping algorithm is used to simplify the text-region polygons. The sample uses a dynamic shape (N,3,H,W) with a maximum input shape of [1,3,640,640]. Model path: Resource/Models/ppocrv5_server_det_infer.onnx
+Text detection uses DBNet (paper: https://arxiv.org/pdf/1911.08947); network structure:
+![alt text](Images/DBNet.png)
+The model outputs a probability map, and the Vatti clipping algorithm is used to simplify the text-region polygons (the sample relies on the Clipping library). The sample's model input shape is [1,3,640,640]. Model path: Resource/Models/ppocrv5_server_det_infer.onnx
 ### Text recognition
-Text recognition uses CRNN+CTCDecode (https://arxiv.org/pdf/2009.09941); network structure: ![alt text](Images/CRNN.png). The sample uses a dynamic shape (N,3,48,W) with a maximum input shape of [1,3,48,720]. Model path: Resource/Models/ppocrv5_server_rec_infer.onnx
+Text recognition uses CRNN+CTCDecode (https://arxiv.org/pdf/2009.09941); network structure:
+![alt text](Images/CRNN.png)
+The sample's model input shape is [1,3,48,720]. Model path: Resource/Models/ppocrv5_server_rec_infer.onnx
 ## Preprocessing
 ### Detection model preprocessing
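Detection preprocessing follows the usual PP-OCR recipe: scale pixels to [0,1], normalize with the mean/std used in TextDetector.preprocess, pad to the fixed model input, and reorder to NCHW. A hedged numpy sketch; the aspect-ratio handling here is simplified relative to the sample:
```python
import cv2
import numpy as np

def det_preprocess(img, input_size=(640, 640),
                   mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
    h, w = img.shape[:2]
    ratio = min(input_size[0] / h, input_size[1] / w)  # uniform scale, keep aspect ratio
    resize_h, resize_w = int(h * ratio), int(w * ratio)
    resized = cv2.resize(img, (resize_w, resize_h)).astype(np.float32)
    normalized = (resized / 255.0 - mean) / std        # scale to [0,1], then normalize
    padded = np.zeros((input_size[0], input_size[1], 3), np.float32)
    padded[:resize_h, :resize_w, :] = normalized       # pad along the bottom/right
    return padded.transpose(2, 0, 1)[np.newaxis], ratio  # NCHW tensor + scale ratio
```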
@@ -80,28 +84,20 @@ def preprocess(self, img, max_wh_ratio):
     imgH, imgW = self.rec_input_size
     max_h,max_w = self.rec_input_size
     h, w = img.shape[:2]
-    # re_size = (max_w,max_h)
     #keep the original H dimension
     if h <= max_h:
         ratio = max_h / h
         w = int(w*ratio)
         if w <= max_w:
             re_size =(w,max_h)
         else:
             re_size = (max_w,max_h)
     else:
         ratio = max_h/h
         w,h = int(w*ratio),max_h
         if w <= max_w:
             re_size = (w,h)
         else:
             re_size = (max_w,h)
@@ -112,12 +108,9 @@ def preprocess(self, img, max_wh_ratio):
     resized_image = resized_image.transpose((2, 0, 1)) / 255
     resized_image -= 0.5
     resized_image /= 0.5
     #pad along the right and bottom
     padding_im = np.zeros((3, imgH, imgW), dtype=np.float32)
     padding_im[:, :, 0:re_size[0]] = resized_image
     return padding_im
 ```
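To make the branching above concrete, a 32x200 crop under rec_input_size=(48, 720) takes the first branch; the sizes here are hypothetical:
```python
h, w = 32, 200             # crop height/width (hypothetical)
max_h, max_w = 48, 720     # rec_input_size
ratio = max_h / h          # 1.5
w = int(w * ratio)         # 300, which is <= max_w
re_size = (w, max_h)       # (300, 48); the normalized image is then padded to width 720
print(re_size)
```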
 ## Classes
@@ -154,7 +147,7 @@ class PPOcrV5():
         **kwargs: parameters for detection-model postprocessing
     Returns:
-        return_type: NONE.
+        return_type: no return value
     Examples:
         det_onnx_path = "PATH/TO/det_onnx_model.onnx"
@@ -198,7 +191,7 @@ class TextDetector(object):
         **kwargs: parameters for detection-model postprocessing
     Returns:
-        return_type: NONE
+        return_type: no return value
     Examples:
         self.db_detector = TextDetector(
@@ -216,7 +209,6 @@ class TextDetector(object):
     """
 class TextRecgnizer(object):
-    """Support SVTR_LCNet
+    """
     def __init__(
         self,
         rec_model_path,
@@ -240,7 +232,7 @@ class TextRecgnizer(object):
         **kwargs: parameters for recognition-model postprocessing
     Returns:
-        return_type: NONE
+        return_type: no return value
     Examples:
         self.text_extractor = TextRecgnizer(rec_model_path=rec_model_path,
@@ -252,18 +244,15 @@ class TextRecgnizer(object):
 class BaseRecLabelDecode(object):
     def __init__(self, character_dict_path=None,
                  use_space_char=False)
-    """Convert between text-label and text-index
+    """
     Text recognition (CRNN+CTC).
     Args:
         character_dict_path: path of the character dictionary file.
         use_space_char: whether the character set contains the space character.
     Returns:
-        return_type: NONE.
+        return_type: no return value.
     Examples:
     """
 class CTCLabelDecode(BaseRecLabelDecode):
@@ -277,140 +266,28 @@ class TextRecgnizer(object):
         character_dict_path: path of the character dictionary file.
         use_space_char: whether the character set contains the space character.
     Returns:
-        return_type: NONE
+        return_type: no return value
     Examples:
     """
 ```
 ## Inference
-### Text detection model inference
-```python
-def __call__(self, src_img):
-    data = self.preprocess(src_img)
-    """Two copy modes are supported: offload_copy=true and offload_copy=false. With true, no explicit memory copies are needed; with false, device memory for inputs and outputs must be pre-allocated, the preprocessed data copied to device memory before inference, and the model output copied back from device memory afterwards for postprocessing."""
-    if self.offload_copy==False:
-        self.d_mem[self.det_input_name] =migraphx.to_gpu(migraphx.argument(data["image"]))
-        results = self.db_model.run(self.d_mem)
-    else:
-        results = self.db_model.run({self.det_input_name:data["image"]})
-    if self.offload_copy==False :
-        #copy the inference result from the GPU to the CPU
-        result=migraphx.from_gpu(results[0])
-        print("offload copy model")
-        result = np.array(result)
-    else:
-        result = results[0]
-    shape_list = np.expand_dims(data["shape"], axis=0)
-    pred = np.array(result)
-    pred = pred[:, 0, :, :]
-    #keep the probabilities above the threshold
-    segmentation = pred > self.thresh
-    boxes_batch = []
-    for batch_index in range(pred.shape[0]):
-        src_h, src_w, ratio_h, ratio_w = shape_list[batch_index]
-        if self.dilation_kernel is not None:
-            mask = cv2.dilate(
-                np.array(segmentation[batch_index]).astype(np.uint8),
-                self.dilation_kernel,
-            )
-        else:
-            mask = segmentation[batch_index]
-        #extract text regions from the predicted bitmap
-        if self.box_type == "poly":
-            boxes, scores = self.polygons_from_bitmap(
-                pred[batch_index], mask, ratio_w,ratio_h, src_w, src_h
-            )
-        elif self.box_type == "quad":
-            boxes, scores = self.boxes_from_bitmap(
-                pred[batch_index], mask, ratio_w,ratio_h, src_w, src_h
-            )
-        else:
-            raise ValueError("box_type can only be one of ['quad', 'poly']")
-        boxes_batch.append(boxes)
-    #sort the text regions top-to-bottom, left-to-right
-    det_box_batch = self.sorted_boxes(boxes_batch)
-    #map the text-region coordinates back to the original image
-    dt_boxes,det_rects = self.box_standardization(det_box_batch,shape_list)
-    return dt_boxes,det_rects
-```
-### Text recognition inference
 ```python
-def __call__(self, batch_img_list):
-    if len(batch_img_list) == 0:
-        return []
-    width_list = []
-    #iterate over the image list (the text ROIs); to support multi-batch inference, batch_size images are concatenated with np.concatenate(batch_norm_imgs)
-    for b in range(len(batch_img_list)):
-        for img in batch_img_list[b]:
-            width_list.append(img.shape[1] / float(img.shape[0]))
-    indices = np.argsort(np.array(width_list))
-    input_batch = self.rec_batch_num
-    batch_outputs_pre = []
-    batch_max_wh_ratio_pre = []
-    for b in range(len(batch_img_list)):
-        im_count = len(batch_img_list[b])
-        batch_outputs = []
-        batch_max_wh_ratio = []
-        for beg_img_no in range(0, im_count, input_batch):
-            end_img_no = min(im_count, beg_img_no + input_batch)
-            # for ino in range(beg_img_no, end_img_no):
-            #     h, w = batch_img_list[b][indices[ino]].shape[0:2]
-            #     wh_ratio = w * 1.0 / h
-            #     max_wh_ratio = max(max_wh_ratio, wh_ratio)
-            batch_norm_imgs = []
-            max_wh_ratio = list()
-            # N batch
-            for ino in range(beg_img_no, end_img_no):
-                #preprocess a single image
-                norm_img = self.preprocess(batch_img_list[b][indices[ino]], max_wh_ratio)
-                norm_img = norm_img[np.newaxis, :].astype(np.float32)
-                batch_norm_imgs.append(norm_img)
-            batch_max_wh_ratio.append(max_wh_ratio)
-            #concatenate batch_size images
-            if self.rec_batch_num >1:
-                norm_img_batch = np.concatenate(batch_norm_imgs)
-                norm_img_batch = norm_img_batch.copy()
-            else:
-                norm_img_batch = np.array([batch_norm_imgs.copy()])
-            if self.offload_copy==False:
-                print("offload copy model")
-                self.d_mem[self.rec_input_name] =migraphx.to_gpu(migraphx.argument(norm_img_batch))
-                results = self.rec_model.run(self.d_mem)
-                output = np.array(results[0])
-            else:
-                results = self.rec_model.run({self.rec_input_name:norm_img_batch})
-                output = results[0]
-            # batch_outputs.append(np.array(output))
-            #append every batch output to batch_outputs for postprocessing
-            [batch_outputs.append(out) for out in np.array(output)]
-        batch_outputs_pre.append(np.array(batch_outputs))
-        batch_max_wh_ratio_pre.append(batch_max_wh_ratio)
-    return batch_outputs_pre ,batch_max_wh_ratio_pre
+def __call__(self, src_img):
+    import time
+    start = time.time()
+    #text detection
+    dt_boxs,dt_rects = self.db_detector(src_img)
+    res_img = self.vis_boxes(dt_boxs,src_img)
+    #crop the text-region images
+    batch_img_list = self.detection_roi_crop(src_img,dt_rects)
+    #text feature extraction
+    batch_outputs_pre ,batch_max_wh_ratio_pre = self.text_extractor(batch_img_list)
+    #text decoding
+    batch_text_list, batch_label_list = self.ctc_decoder(batch_outputs_pre,return_word_box=False,wh_ratio_list = batch_max_wh_ratio_pre)
 ```
 # Ocrv5 API usage
 The API is called in the following steps:
@@ -425,8 +302,8 @@ if __name__ == '__main__':
     rec_onnx_path = "../Resource/Models/ppocrv5_server_rec_infer.onnx"
     image_path = "../Resource/Images/lite_demo.png"
     img = cv2.imread(image_path)
-    ppocrv5 = PPOcrV5(det_onnx_path,rec_onnx_path,offload_copy=True,precision_mode="fp32")
+    ppocrv5 = PPOcrV5(det_onnx_path,rec_onnx_path,offload_copy=True,precision_mode="fp16")
     res_img = ppocrv5(img)
     cv2.imwrite("res.jpg",res_img)
 ```
-The sample supports inference in two precisions (fp32 and fp16, default fp32); the precision and the memory copy mode are controlled via the precision_mode and offload_copy parameters.
+The sample supports inference in two precisions (fp32 and fp16, default fp16); the precision and the memory copy mode are controlled via the precision_mode and offload_copy parameters.
\ No newline at end of file
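Under the hood, precision selection in the Python sample is just an optional quantize pass before compilation; a condensed sketch using the same MIGraphX calls that appear in TextDetector.__init__ below (the helper name is hypothetical):
```python
import migraphx

def load_model(onnx_path, input_name, max_shape, precision_mode="fp16", offload_copy=True):
    model = migraphx.parse_onnx(onnx_path, map_input_dims={input_name: max_shape})
    if precision_mode == "fp16":
        # quantize to fp16 before compiling for the GPU
        migraphx.quantize_fp16(model)
    model.compile(t=migraphx.get_target("gpu"), offload_copy=offload_copy, device_id=0)
    return model
```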
@@ -6,8 +6,6 @@ import pyclipper
 import migraphx
 import os
 from PIL import Image
 def AllocateOutputMemory(model):
     outputData={}
     for key in model.get_outputs().keys():
@@ -15,7 +13,9 @@ def AllocateOutputMemory(model):
     return outputData
 class BaseRecLabelDecode(object):
-    """Convert between text-label and text-index"""
+    """
+    Map from feature space to text space
+    """
     def __init__(self, character_dict_path=None, use_space_char=False):
         self.beg_str = "sos"
         self.end_str = "eos"
@@ -42,7 +42,6 @@ class BaseRecLabelDecode(object):
         for i, char in enumerate(dict_character):
             self.dict[char] = i
         self.character = dict_character
     def pred_reverse(self, pred):
         pred_re = []
         c_current = ""
@@ -84,11 +83,11 @@ class BaseRecLabelDecode(object):
                 and state == "en&num"
                 and c_i + 1 < len(text)
                 and bool(re.search("[0-9]", text[c_i + 1]))
-            ):  # grouping floating number
+            ):
                 c_state = "en&num"
             if (
                 char == "-" and state == "en&num"
-            ):  # grouping word with '-', such as 'state-of-the-art'
+            ):
                 c_state = "en&num"
             if state == None:
@@ -121,20 +120,16 @@ class BaseRecLabelDecode(object):
         is_remove_duplicate=False,
         return_word_box=False,
     ):
-        """convert text-index into text-label."""
         result_list = []
         ignored_tokens = self.get_ignored_tokens()
         batch_size = len(text_index)
-        print(f"Info:{text_index.shape},{text_prob.shape}")
         for batch_idx in range(batch_size):
             selection = np.ones(len(text_index[batch_idx]), dtype=bool)
             if is_remove_duplicate:
                 selection[1:] = text_index[batch_idx][1:] != text_index[batch_idx][:-1]
             for ignored_token in ignored_tokens:
                 selection &= text_index[batch_idx] != ignored_token
-            # print(f"[debug] {len(text_index)},{batch_idx},{selection},{text_index[batch_idx][selection]},{len(self.character)}")
             char_list = [
                 self.character[text_id] for text_id in text_index[batch_idx][selection]
@@ -147,8 +142,8 @@ class BaseRecLabelDecode(object):
             conf_list = [0]
         text = "".join(char_list)
-        if self.reverse:  # for arabic rec
+        if self.reverse:
             text = self.pred_reverse(text)
         if return_word_box:
@@ -173,22 +168,24 @@ class BaseRecLabelDecode(object):
         return result_list
     def get_ignored_tokens(self):
-        return [0]  # for ctc blank
+        return [0]
 class CTCLabelDecode(BaseRecLabelDecode):
-    """Convert between text-label and text-index"""
     def __init__(self, character_dict_path=None, use_space_char=False, **kwargs):
         super(CTCLabelDecode, self).__init__(character_dict_path, use_space_char)
     def __call__(self, preds, label=None, return_word_box=False, *args, **kwargs):
+        """
+        1. Take the maximum probability per channel (PP-OCRv5 predicts over a set of 18385 characters)
+        2. Decode: map the model output from feature space to character space
+        3. Output the strings/characters
+        """
         batch_text_list = []
         batch_label_list = []
         for b in range(len(preds)):
-            print(preds[b].shape)
+            #get the maximum probability and its index
             preds_idx = preds[b].argmax(axis=2)
             preds_prob = preds[b].max(axis=2)
             text = self.decode(
                 preds_idx,
                 preds_prob,
@@ -204,8 +201,8 @@ class CTCLabelDecode(BaseRecLabelDecode):
                 continue
             label = self.decode(label)
             batch_text_list.append(text)
             batch_label_list.append(label)
         return batch_text_list, batch_label_list
@@ -215,14 +212,13 @@ class CTCLabelDecode(BaseRecLabelDecode):
         return dict_character
 class TextRecgnizer(object):
-    """Support SVTR_LCNet """
     def __init__(
         self,
         rec_model_path,
         rec_batch_num=1,
-        rec_input_size=(48, 480),#hw
+        rec_input_size=(48, 480),#(h,w)
         rec_algorithm="SVTR_LCNet",
-        precision_mode = "fp32",
+        precision_mode = "fp16",
         **kwargs
     ):
@@ -244,13 +240,27 @@ class TextRecgnizer(object):
         outputs = self.rec_model.get_outputs()
         if self.offload_copy==False:
             self.d_mem = AllocateOutputMemory(self.rec_model)
+            in_data = np.ones((rec_batch_num,3,self.rec_input_size[0],self.rec_input_size[1]),dtype=np.float32)
+            #warm up once before inference
+            self.d_mem[self.rec_input_name] =migraphx.to_gpu(migraphx.argument(in_data))
+            self.rec_model.run(self.d_mem)
+        else:
+            #warm up once before inference
+            in_data = np.ones((rec_batch_num,3,self.rec_input_size[0],self.rec_input_size[1]),dtype=np.float32)
+            self.rec_model.run({self.rec_input_name:in_data})
         print("Text recognizition model info:")
         print(f"    inputs info:{inputs}")
         print(f"    outputs info:{outputs}")
     def __call__(self, batch_img_list):
+        """
+        1. Preprocess the inputs
+        2. Assemble the batch
+        3. Run inference
+        4. Output the character-feature map
+        """
         if len(batch_img_list) == 0:
             return []
         width_list = []
@@ -258,12 +268,11 @@ class TextRecgnizer(object):
             for img in batch_img_list[b]:
                 width_list.append(img.shape[1] / float(img.shape[0]))
-        indices = np.argsort(np.array(width_list))
+        # indices = np.argsort(np.array(width_list))
         input_batch = self.rec_batch_num
         batch_outputs_pre = []
         batch_max_wh_ratio_pre = []
-        # print(f"Batch size :{input_batch}")
         for b in range(len(batch_img_list)):
             im_count = len(batch_img_list[b])
             batch_outputs = []
@@ -271,17 +280,11 @@ class TextRecgnizer(object):
             for beg_img_no in range(0, im_count, input_batch):
                 end_img_no = min(im_count, beg_img_no + input_batch)
-                # for ino in range(beg_img_no, end_img_no):
-                #     h, w = batch_img_list[b][indices[ino]].shape[0:2]
-                #     wh_ratio = w * 1.0 / h
-                #     max_wh_ratio = max(max_wh_ratio, wh_ratio)
                 batch_norm_imgs = []
                 max_wh_ratio = list()
                 # N batch
                 for ino in range(beg_img_no, end_img_no):
-                    norm_img = self.preprocess(batch_img_list[b][indices[ino]], max_wh_ratio)
+                    norm_img = self.preprocess(batch_img_list[b][ino], max_wh_ratio)
                     norm_img = norm_img[np.newaxis, :].astype(np.float32)
                     batch_norm_imgs.append(norm_img)
@@ -289,17 +292,10 @@ class TextRecgnizer(object):
                 if len(batch_norm_imgs)==0:
                     continue
                 batch_max_wh_ratio.append(max_wh_ratio)
-                # if self.rec_batch_num >1:
-                #     norm_img_batch = np.concatenate(batch_norm_imgs)
-                #     norm_img_batch = norm_img_batch.copy()
-                # else:
-                #     norm_img_batch = np.concatenate(batch_norm_imgs)
-                #     norm_img_batch = norm_img_batch.copy()
                 norm_img_batch = np.concatenate(batch_norm_imgs)
                 norm_img_batch = norm_img_batch.copy()
-                # print(f"batch shape:{norm_img_batch.shape}")
                 if self.offload_copy==False:
                     print("offload copy model")
                     self.d_mem[self.rec_input_name] =migraphx.to_gpu(migraphx.argument(norm_img_batch))
@@ -309,9 +305,6 @@ class TextRecgnizer(object):
                     results = self.rec_model.run({self.rec_input_name:norm_img_batch})
                     output = results[0]
-                # batch_outputs.append(np.array(output))
                 [batch_outputs.append(out) for out in np.array(output)]
             batch_outputs_pre.append(np.array(batch_outputs))
@@ -326,26 +319,19 @@ class TextRecgnizer(object):
         imgH, imgW = self.rec_input_size
         max_h,max_w = self.rec_input_size
         h, w = img.shape[:2]
-        # re_size = (max_w,max_h)
         #resize along the h axis
         if h <= max_h:
             ratio = max_h / h
             w = int(w*ratio)
             if w <= max_w:
                 re_size =(w,max_h)
             else:
                 re_size = (max_w,max_h)
         else:
             ratio = max_h/h
             w,h = int(w*ratio),max_h
             if w <= max_w:
                 re_size = (w,h)
             else:
                 re_size = (max_w,h)
@@ -356,9 +342,7 @@ class TextRecgnizer(object):
         resized_image -= 0.5
         resized_image /= 0.5
         padding_im = np.zeros((3, imgH, imgW), dtype=np.float32)
         padding_im[:, :, 0:re_size[0]] = resized_image
         return padding_im
 class TextDetector(object):
@@ -389,61 +373,46 @@ class TextDetector(object):
         assert score_mode in [
             "slow",
             "fast",
-        ], "Score mode must be in [slow, fast] but got: {}".format(score_mode)
+        ], "Score mode not support: {}".format(score_mode)
         self.dilation_kernel = None if not use_dilation else np.array([[1, 1], [1, 1]])
         self.offload_copy = kwargs.get("offload_copy", True)
         if os.path.exists(det_model_path) and det_model_path.endswith(".onnx"):
             self.det_input_name = "x"
             maxInput={self.det_input_name:[1,3,db_input_size[0],db_input_size[1]]}
             self.db_model = migraphx.parse_onnx(det_model_path,map_input_dims=maxInput)
             inputs = self.db_model.get_inputs()
             outputs = self.db_model.get_outputs()
-            # if self.precision_mode == "int8":
-            #     print("int8 quantization")
-            #     dic = dict()
-            #     image_path = "../Resource/Images/lite_demo.png"
-            #     img = cv2.imread(image_path)
-            #     data = self.preprocess(img)
-            #     print(data["image"].shape)
-            #     print(data["image"].dtype)
-            #     dic[self.det_input_name] = migraphx.argument(data["image"].copy())
-            #     calibration = [dic]
-            #     migraphx.quantize_int8(self.db_model, migraphx.get_target("gpu"), calibration)
             if self.precision_mode == "fp16":
                 migraphx.quantize_fp16(self.db_model)
             self.db_model.compile(t=migraphx.get_target("gpu"),offload_copy=self.offload_copy,device_id=0)
             if self.offload_copy==False:
                 self.d_mem = AllocateOutputMemory(self.db_model)
+                in_data = np.ones((1,3,db_input_size[0],db_input_size[1]),dtype=np.float32)
+                #warm up once before inference
+                self.d_mem[self.det_input_name] =migraphx.to_gpu(migraphx.argument(in_data))
+                self.db_model.run(self.d_mem)
+            else:
+                #warm up once before inference
+                in_data = np.ones((1,3,db_input_size[0],db_input_size[1]),dtype=np.float32)
+                self.db_model.run({self.det_input_name:in_data})
             print("Detection model info:")
             print(f"    inputs info:{inputs}")
             print(f"    outputs info:{outputs}")
     def polygons_from_bitmap(self, pred, _bitmap, ratio_w,ratio_h,dest_width, dest_height):
-        """
-        _bitmap: single map with shape (1, H, W),
-        whose values are binarized as {0, 1}
-        """
         bitmap = _bitmap
         height, width = bitmap.shape
         boxes = []
         scores = []
+        #extract text regions
         contours, _ = cv2.findContours(
             (bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE
         )
@@ -483,11 +452,6 @@ class TextDetector(object):
         return boxes, scores
     def boxes_from_bitmap(self, pred, _bitmap, ratio_w,ratio_h, dest_width, dest_height):
-        """
-        _bitmap: single map with shape (1, H, W),
-        whose values are binarized as {0, 1}
-        """
         bitmap = _bitmap
         height, width = bitmap.shape
@@ -563,9 +527,6 @@ class TextDetector(object):
         return box, min(bounding_box[1])
     def box_score_fast(self, bitmap, _box):
-        """
-        box_score_fast: use bbox mean score as the mean score
-        """
         h, w = bitmap.shape[:2]
         box = _box.copy()
         xmin = np.clip(np.floor(box[:, 0].min()).astype("int32"), 0, w - 1)
@@ -578,11 +539,7 @@ class TextDetector(object):
         box[:, 1] = box[:, 1] - ymin
         cv2.fillPoly(mask, box.reshape(1, -1, 2).astype("int32"), 1)
         return cv2.mean(bitmap[ymin : ymax + 1, xmin : xmax + 1], mask)[0]
     def box_score_slow(self, bitmap, contour):
-        """
-        box_score_slow: use polygon mean score as the mean score
-        """
         h, w = bitmap.shape[:2]
         contour = contour.copy()
         contour = np.reshape(contour, (-1, 2))
@@ -591,9 +548,7 @@ class TextDetector(object):
         xmax = np.clip(np.max(contour[:, 0]), 0, w - 1)
         ymin = np.clip(np.min(contour[:, 1]), 0, h - 1)
         ymax = np.clip(np.max(contour[:, 1]), 0, h - 1)
         mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
         contour[:, 0] = contour[:, 0] - xmin
         contour[:, 1] = contour[:, 1] - ymin
@@ -606,7 +561,6 @@ class TextDetector(object):
         for b in range(len(boxes_batch)):
             src_h, src_w, _, _ = shape_list[b]
             det_boxs = []
-            det_rects = []
             for box in boxes_batch[b]:
                 if isinstance(box,list):
                     box = np.array(box)
@@ -628,25 +582,28 @@ class TextDetector(object):
                 b_h = int(np.linalg.norm(box[0] - box[3]))
                 if b_w <= 3 or b_h <= 3:
                     continue
-                _rect = [int(rect[0][0]),int(rect[0][1]),int(rect[2][0]),int(rect[2][1])]
                 det_boxs.append(rect)
-                det_rects.append(_rect)
             dt_batch_boxs.append(det_boxs)
-            dt_batch_rects.append(det_rects)
-        return dt_batch_boxs,dt_batch_rects
+        return dt_batch_boxs
     def __call__(self, src_img):
+        """
+        1. Preprocess
+        2. Run inference
+        3. Postprocess: output the text-region bounding boxes
+        4. Sort the boxes top-to-bottom, left-to-right
+        5. Map the box coordinates back to the original image
+        """
         data = self.preprocess(src_img)
         if self.offload_copy==False:
-            self.d_mem[self.det_input_name] =migraphx.to_gpu(migraphx.argument())
+            self.d_mem[self.det_input_name] = migraphx.to_gpu(migraphx.argument(data["image"]))
             results = self.db_model.run(self.d_mem)
         else:
             results = self.db_model.run({self.det_input_name:data["image"]})
         if self.offload_copy==False :
             result=migraphx.from_gpu(results[0])
             print("offload copy model")
@@ -682,8 +639,8 @@ class TextDetector(object):
             boxes_batch.append(boxes)
         det_box_batch = self.sorted_boxes(boxes_batch)
-        dt_boxes,det_rects = self.box_standardization(det_box_batch,shape_list)
-        return dt_boxes,det_rects
+        dt_boxes = self.box_standardization(det_box_batch,shape_list)
+        return dt_boxes
     def preprocess(self, src_img,
                    mean: list = [0.485, 0.456, 0.406],
@@ -729,36 +686,39 @@ class TextDetector(object):
         im_pad = np.zeros((self.db_input_size[1], self.db_input_size[0], 3), np.float32)
         im_pad[:resize_h, :resize_w, :] = img
         return im_pad, [ratio_h, ratio_w]
     def sorted_boxes(self,dt_boxes):
-        """
-        Sort text boxes in order from top to bottom, left to right
-        args:
-            dt_boxes(array): detected text boxes with shape [4, 2]
-        return:
-            sorted boxes(array) with shape [4, 2]
-        """
-        batch_boxes = list()
-        # print(dt_boxes)
-        for b in range(len(dt_boxes)):
-            num_boxes = dt_boxes[b].shape[0]
-            batch_sorted_boxes = sorted(dt_boxes[b], key=lambda x: (x[0][1], x[0][0]))
-            _boxes = list(batch_sorted_boxes)
-            for i in range(num_boxes - 1):
-                for j in range(i, -1, -1):
-                    if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and (
-                        _boxes[j + 1][0][0] < _boxes[j][0][0]
-                    ):
-                        tmp = _boxes[j]
-                        _boxes[j] = _boxes[j + 1]
-                        _boxes[j + 1] = tmp
-                    else:
-                        break
-            batch_boxes.append(_boxes)
-        return batch_boxes
+        dt_boxes = dt_boxes[0]
+        boxes_np = np.array(dt_boxes, dtype=np.int32)
+        batch_boxes = list()
+        # compute each box's reference point (top-left corner) and geometric features
+        top_left = boxes_np[:, 0, :]
+        widths = boxes_np[:, 1, 0] - boxes_np[:, 0, 0]
+        heights = boxes_np[:, 2, 1] - boxes_np[:, 0, 1]
+        avg_height = np.median(heights)
+        # sort primarily by the y coordinate, secondarily by x
+        sorted_indices = np.lexsort((top_left[:, 0], top_left[:, 1]))
+        # row grouping: boxes with similar y coordinates are treated as one row
+        final_order = []
+        original_indices = []
+        current_row = [(0, sorted_indices[0])]  # (x_coord, original_idx)
+        for idx in sorted_indices[1:]:
+            # if the y gap to the previous box is less than 0.6x the row height, treat it as the same row
+            if abs(top_left[idx,1] - top_left[current_row[-1][1],1]) < avg_height * 0.6:
+                current_row.append((top_left[idx,0], idx))
+            else:
+                # sort the current row by x
+                current_row_sorted = sorted(current_row, key=lambda x: x[0])
+                final_order.extend([x[1] for x in current_row_sorted])
+                current_row = [(top_left[idx,0], idx)]
+        # append the last row
+        current_row_sorted = sorted(current_row, key=lambda x: x[0])
+        final_order.extend([x[1] for x in current_row_sorted])
+        batch_boxes.append(boxes_np[final_order])
+        # return the sorted boxes
+        return batch_boxes
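The rewritten sorted_boxes groups boxes into rows using 0.6x the median box height instead of the old fixed 10-pixel tolerance, which is more robust across font sizes. A quick check on synthetic boxes (coordinates hypothetical, `detector` an instantiated TextDetector):
```python
import numpy as np

# Three boxes: two on one visual row (y near 10), one below (y near 60),
# each as [top-left, top-right, bottom-right, bottom-left].
boxes = [[
    [[200, 12], [300, 12], [300, 42], [200, 42]],
    [[10, 10], [110, 10], [110, 40], [10, 40]],
    [[10, 60], [110, 60], [110, 90], [10, 90]],
]]
sorted_batch = detector.sorted_boxes(boxes)
print([box[0].tolist() for box in sorted_batch[0]])
# expected top-left corners, reading order: [10, 10], [200, 12], [10, 60]
```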
@@ -771,16 +731,16 @@ class PPOcrV5():
                  rec_input_size :list = (48,720),
                  seg_thresh:float=0.3,
                  box_thresh:float=0.7,
-                 precision_mode:str='fp32',
+                 precision_mode:str='fp16',
                  offload_copy:bool=True,
                  **kwargs
                  ):
         """
-        det_model_path: detection model path
-        rec_model_path: recognition model path
-        seg_thresh: dbnet segmentation threshold
-        box_thresh: box threshold
-        db_input_size: dbnet input size
+        det_model_path: path of the text detection model
+        rec_model_path: path of the text recognition model
+        seg_thresh: dbnet pixel segmentation threshold
+        box_thresh: text bounding-box threshold
+        db_input_size: model input size
         """
         self.seg_thres = seg_thresh
         self.box_thresh = box_thresh
@@ -837,19 +797,18 @@ class PPOcrV5():
     def __call__(self, src_img):
         import time
         start = time.time()
-        dt_boxs,dt_rects = self.db_detector(src_img)
-        res_img = self.vis_boxes(dt_boxs,src_img)
-        batch_img_list = self.detection_roi_crop(src_img,dt_rects)
+        dt_boxs = self.db_detector(src_img)
+        batch_img_list = self.detection_roi_crop(src_img,dt_boxs)
         batch_outputs_pre ,batch_max_wh_ratio_pre = self.text_extractor(batch_img_list)
         batch_text_list, batch_label_list = self.ctc_decoder(batch_outputs_pre,return_word_box=False,wh_ratio_list = batch_max_wh_ratio_pre)
         end = time.time()
         batch_text_out = []
         batch_boxes_out = []
         for b in range(len(dt_boxs)):
             text_out = []
             boxex_out = []
-            print("text box num:",len(dt_boxs[b]))
             for box, rec_result in zip(dt_boxs[b], batch_text_list[b]):
                 text, score = rec_result[0], rec_result[1]
                 if score >= 0.5:
@@ -859,37 +818,31 @@ class PPOcrV5():
             batch_text_out.append(text_out)
             batch_boxes_out.append(boxex_out)
         for b in range(len(batch_text_out)):
             for text, score in batch_text_out[b]:
                 print("{}, {:.3f}".format(text, score))
-        # res_img = self.vis_oct_text(batch_text_out,dt_rects,res_img)
-        print(f"[Time info] elapsed:{end-start:.4f}")
+        res_img = self.vis_boxes(batch_boxes_out,src_img)
+        res_img = self.vis_oct_text(batch_text_out,batch_boxes_out,res_img)
+        print(f"[Time info] elapsed:{(end-start)*1000:.4f} ms")
         return res_img
-    def detection_roi_crop(self,src_img,rects):
+    def detection_roi_crop(self,src_img,boxes):
         batch_cut_imgs = list()
-        for b in range(len(rects)):
+        for b in range(len(boxes)):
             crop_imgs = list()
-            for rect in rects[b]:
-                x_min,y_min,x_max,y_max = rect
-                rect_w ,rect_h = x_max-x_min,y_max-y_min
-                # if rect_w<3 or rect_h<3:
-                #     continue
-                # print(x_min,y_min,x_max,y_max)
-                crop_img = src_img[y_min:y_max, x_min:x_max,:]
+            for tl,tr,br,bl in boxes[b]:
+                box = [int(tl[0]),int(tl[1]),int(br[0]),int(br[1])]
+                crop_img = src_img[box[1]:box[3], box[0]:box[2],:]
                 crop_imgs.append(crop_img)
             batch_cut_imgs.append(crop_imgs)
         return batch_cut_imgs
-    def vis_oct_text(self,batch_text,batch_rect,src_img,fornt_path="../Resource/fonts/simfang.ttf"):
+    def vis_oct_text(self,batch_text,batch_boxes,src_img,fornt_path="../Resource/fonts/simfang.ttf"):
         from PIL import Image, ImageDraw, ImageFont
         img = np.zeros(src_img.shape, dtype=np.uint8)
-        img.fill(114)
+        img.fill(255)
         pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
         draw = ImageDraw.Draw(pil_img)
@@ -897,33 +850,31 @@ class PPOcrV5():
         for b in range(len(batch_text)):
             for id,text in enumerate(batch_text[b]):
                 text,conf = text
-                f_start = batch_rect[b][id][0:2]
-                w,h = np.array(batch_rect[b][id][2:]) - np.array(batch_rect[b][id][0:2])
+                f_start = (batch_boxes[b][id][0][0],batch_boxes[b][id][0][1])
+                f_end = (batch_boxes[b][id][2][0],batch_boxes[b][id][2][1])
+                w,h = np.array(f_end) - np.array(f_start)
                 font_size = int(h*0.9)
                 font = ImageFont.truetype(fornt_path, font_size,encoding="utf-8")
                 draw.text(f_start, text, font=font, fill=(0, 255, 0))
         res_img = np.concatenate([src_img, np.array(pil_img)], axis=1)
         return res_img
-    def vis_boxes(self,boxes, img, colors=(255,0,0), thickness=2):
+    def vis_boxes(self,boxes, img, colors=(0,255,0), thickness=2):
         for b in range(len(boxes)):
             for tl,tr,br,bl in boxes[b]:
                 box = [int(tl[0]),int(tl[1]),int(br[0]),int(br[1])]
                 cv2.rectangle(img, (box[0],box[1]), (box[2],box[3]), colors, thickness)
         return img
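vis_oct_text renders the recognized strings onto a white canvas with Pillow (OpenCV alone cannot draw CJK glyphs) and concatenates it beside the source image. The same idea as a standalone sketch; the helper name, font path, and box format are illustrative:
```python
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont

def overlay_text(src_img, texts_with_boxes, font_path="../Resource/fonts/simfang.ttf"):
    """texts_with_boxes: list of (text, (x0, y0, x1, y1)) in source coordinates."""
    canvas = np.full(src_img.shape, 255, dtype=np.uint8)  # white canvas
    pil_img = Image.fromarray(cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(pil_img)
    for text, (x0, y0, x1, y1) in texts_with_boxes:
        font_size = max(int((y1 - y0) * 0.9), 1)  # scale font to the box height
        font = ImageFont.truetype(font_path, font_size, encoding="utf-8")
        draw.text((x0, y0), text, font=font, fill=(0, 255, 0))
    # show the source image and the rendered text side by side
    return np.concatenate([src_img, cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)], axis=1)
```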
 if __name__ == '__main__':
     det_onnx_path = "../Resource/Models/ppocrv5_server_det_infer.onnx"
     rec_onnx_path = "../Resource/Models/ppocrv5_server_rec_infer.onnx"
-    image_path = "../Resource/Images/lite_demo.png"
+    image_path = "../Resource/Images/demo.png"
     img = cv2.imread(image_path)
-    ppocrv5 = PPOcrV5(det_onnx_path,rec_onnx_path,offload_copy=True,precision_mode="fp32")
+    ppocrv5 = PPOcrV5(det_onnx_path,rec_onnx_path,offload_copy=True,precision_mode="fp16")
     res_img = ppocrv5(img)
     cv2.imwrite("res.jpg",res_img)
\ No newline at end of file
@@ -81,6 +81,7 @@ pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
 ```
 python inference.py
 ```
+The result image is saved in the current directory as res.jpg
 For the offload_copy and precision_mode settings, see [Tutorial_Python.md](Doc/Tutorial_Python.md); an example is given in main.
 ### C++ inference
@@ -104,7 +105,6 @@ cd <path_to_ppocrv5_migraphx>
 sh ./3rdParty/InstallOpenCVDependences.sh
 ```
 #### Install OpenCV and build the project
 ```
@@ -116,30 +116,9 @@ rbuild build -d depend
 - Enter the opencv-3.4.11_mini directory, create a build directory, and cd build
 - Run the following command:
 ```
 cmake -D CMAKE_BUILD_TYPE=RELEASE -D CMAKE_INSTALL_PREFIX=./opencv_dep -D INSTALL_C_EXAMPLES=ON -D INSTALL_PYTHON_EXAMPLES=ON -D OPENCV_GENERATE_PKGCONFIG=ON -D BUILD_EXAMPLES=ON -D OPENCV_EXTRA_MODULES_PATH=../modules/ ..
 ```
 - Run make -j8 && make install; the built headers and libraries are placed in opencv_dep. Copy the opencv_dep directory into 3rdParty and rename it opencv
-#### Set environment variables
-Add the dependency libraries to LD_LIBRARY_PATH by appending the following to ~/.bashrc:
-On Ubuntu:
-```
-export LD_LIBRARY_PATH=<path_to_ppocrv5_migraphx>/depend/lib/:$LD_LIBRARY_PATH
-```
-On CentOS:
-```
-export LD_LIBRARY_PATH=<path_to_ppocrv5_migraphx>/depend/lib64/:$LD_LIBRARY_PATH
-```
-Then run:
-```
-source ~/.bashrc
-```
 #### Run the example
@@ -155,58 +134,112 @@ cmake .. && make
 #run
 ./ppOcrV5
 ```
+The result image is saved in the current directory as res.jpg
 ## result
 ### Python version
-In the output, each value is the actual probability of the corresponding label
+The output lists the recognized text; each entry is followed by a confidence score, and a higher score means a more reliable result
 ```
-产品信息/参数, 0.954
-发足够的滋养, 1.000
-纯臻宫乔护发素, 0.883
-花费了'0'.'4''5''7''3''3''5'秒, 0.993
-【净含量】:'2''2''0'ml, 0.993
-每瓶'2''2'元,'1''0''0''0'瓶起订), 0.998
-【品名】:纯臻营养护发素, 0.998
-【品牌】:代加工方式/'0'EMODM, 0.968
-糖、椰油酰胺丙基甜菜碱、泛醒, 0.997
-【适用人群】:适合所有肤质, 0.998
-【产品编号】:YM'-'X'-''3''0''1''1''0'.'9''6''8''9''9', 0.993
-('4''5'元/每公斤,'1''0''0'公斤起订), 0.972
-【主要成分】:鲸蜡硬脂醇、燕麦B'-'葡聚, 0.966
-【主要功能】:可紧致头发磷层,从而达到, 0.994
-即时持久改善头发光泽的效果,给干燥的头, 0.997
-The detectionvisualizedimagsavedin./vis.jpg, 0.940
-[Time info] elapsed:3.5736
+'0', 0.991
+纯臻营养护发素, 1.000
+'0'.'9''9''3''6''0''4', 0.999
+'1', 0.998
+产品信息/参数, 0.934
+'0'.'9''9''2''7''2''8', 0.999
+'2', 0.999
+('4''5'元/每公斤,'1''0''0'公斤起订), 0.970
+'0'.'9''7''4''1''7', 0.999
+'3', 0.999
+每瓶'2''2'元,'1''0''0''0'瓶起订), 0.998
+'0'.'9''9''3''9''7''6', 0.999
+'4', 0.998
+【品牌】:代加工方式/'0'EMODM, 0.959
+'0'.'9''8''5''1''3''3', 0.998
+'5', 0.998
+【品名】:纯臻营养护发素, 0.997
+'0'.'9''9''5''0''0''7', 0.999
+'6', 0.995
+【产品编号】:YM'-'X'-''3''0''1''1''0'.'9''6''8''9''9', 0.973
+'7', 0.999
+【净含量】:'2''2''0'ml, 0.994
+'0'.'9''9''6''5''7''7', 0.999
+'8', 0.998
+【适用人群】:适合所有肤质, 0.997
+'0'.'9''9''5''8''4''2', 0.999
+'9', 0.997
+【主要成分】:鲸蜡硬脂醇、燕麦B'-'葡聚, 0.976
+'0'.'9''6''1''9''2''8', 0.999
+'1''0', 1.000
+糖、椰油酰胺丙基甜菜碱、泛醒, 0.996
+'0'.'9''2''5''8''9''8', 0.999
+'1''1', 0.999
+(成品包材), 0.998
+'0'.'9''7''2''5''7''3', 0.999
+'1''2', 1.000
+【主要功能】:可紧致头发磷层,从而达到, 0.992
+'0'.'9''9''4''4''4''8', 0.999
+'1''3', 0.999
+即时持久改善头发光泽的效果,给干燥的头, 0.989
+'0'.'9''9''0''1''9''8', 0.999
+'1''4', 0.999
+发足够的滋养, 0.999
+'0'.'9''9''7''6''6''8', 0.999
+花费了'0'.'4''5''7''3''3''5'秒, 0.993
+[Time info] elapsed:578.6152 ms
 ```
### C++ version
```
ocr res :花费了'0'.'4''5''7''3''3''5'秒 0.984009
ocr res :'0'.'9''9''7' 0.773633
ocr res :发足够的滋养 0.96818
ocr res :'1' 0.697754
ocr res :'0''0'.'9''9''0''1''9' 0.656647
ocr res :即时持久改善头发光泽的效果,给干燥的头 0.996608
ocr res : 0
ocr res :【主要功能】:可紧致头发磷层,从而达到 0.993421
ocr res :'0'.'9''9''4''4' 0.677327
ocr res : 0
ocr res :'0'.'9''7''2' 0.637158
ocr res :(成品包材) 0.901937
ocr res :'1' 0.32251
ocr res :糖、椰油酰胺丙基甜菜碱、泛醒 0.993478
ocr res :'0'.'9''2''5' 0.586279
ocr res :'1''0' 0.547241
ocr res :【主要成分】:鲸蜡硬脂醇、燕麦B'-'葡聚 0.975303
ocr res :'0'.'9''1''9' 0.568408
ocr res : 0
ocr res :'0'.'9''9''5''2' 0.613647
ocr res :【适用人群】:适合所有肤质 0.996882
ocr res :'8' 0.378906
ocr res :'0'.'9''9' 0.595581
ocr res :【净含量】:'2''2''0'ml 0.835671
ocr res :'7' 0.356689
ocr res :【产品编号】:YM'-'X'-''3''0''1''1''0'.'9''6''8''9''9' 0.993695
ocr res :'6' 0.214355
ocr res :'0'.'9''9''5' 0.478052
ocr res :【品名】:纯臻营养护发素 0.996175
ocr res :'5' 0.594727
ocr res : 0
ocr res :'0'.'9''8''5' 0.55166
ocr res :【品牌】:代加工方式/'0'EMODM 0.917768
ocr res :每瓶'2''2'元,'1''0''0''0'瓶起订) 0.974644
ocr res :'0'.'9''9''3''9''7''6' 0.736755
ocr res :'3' 0.486572
ocr res :('4''5'元/每公斤,'1''0''0'公斤起订) 0.940028
ocr res :'0'.'9'm'7' 0.534668
ocr res :'2' 0.961426
ocr res : 0
ocr res :'0'.'9''9''2' 0.524121
ocr res :产品信息/参数 0.913853
ocr res :纯臻营养护发素'0'.'9''9''3''6''0''4' 0.964128
ocr res :'0' 0.380127
ocr res :The detection visualized imagesavedin./vis.jpg 0.94302
[Time info] elapsed: 389 ms
```
### Accuracy
......
#include "cv_put_Text.hpp"
PutText::PutText(const char* font_path) {
// 初始化 FreeType
if (FT_Init_FreeType(&ft)) {
std::cerr << "Error: Could not init FreeType !" << std::endl;
return;
}
// 加载字体文件( 这里使用 SimHei.ttf 字体文件)
if (FT_New_Face(ft, font_path, 0, &face)) {
std::cerr << "Error: Load front failed!" << std::endl;
exit(-1);
}
}
PutText::~PutText() {
    // Release FreeType resources
    FT_Done_Face(face);
    FT_Done_FreeType(ft);
}
void PutText::putText(cv::Mat& img, const std::string& text, int x, int y, int fontSize, cv::Scalar color) {
    if(img.empty())
    {
        std::cerr << "Empty image!";
        return ;
    }
    // Set the glyph size in pixels
    FT_Set_Pixel_Sizes(face, 0, fontSize);
    int start_point_x = x;
    int start_point_y = y + fontSize; // shift down to the text baseline
    // Process the string one UTF-8 character at a time
    for (size_t i = 0; i < text.size(); ) {
        // Decode the next UTF-8 sequence into a Unicode code point
        unsigned long unicode = 0;
        if ((text[i] & 0x80) == 0) {            // 1-byte sequence (ASCII)
            unicode = text[i];
            i += 1;
        } else if ((text[i] & 0xE0) == 0xC0) {  // 2-byte sequence
            unicode = ((text[i] & 0x1F) << 6) | (text[i + 1] & 0x3F);
            i += 2;
        } else if ((text[i] & 0xF0) == 0xE0) {  // 3-byte sequence (covers CJK)
            unicode = ((text[i] & 0x0F) << 12) | ((text[i + 1] & 0x3F) << 6) | (text[i + 2] & 0x3F);
            i += 3;
        } else {
            i++; // invalid or unsupported (e.g. 4-byte) UTF-8; skip this byte
            continue;
        }
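        // Worked example: "中" is the bytes 0xE4 0xB8 0xAD, so the 3-byte branch
        // above yields unicode = (0x04 << 12) | (0x38 << 6) | 0x2D = 0x4E2D (U+4E2D).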
        // Render the glyph for this code point
        if (FT_Load_Char(face, unicode, FT_LOAD_RENDER)) {
            std::cerr << "Error: Could not load glyph" << std::endl;
            continue;
        }
        // Alpha-blend the glyph bitmap onto the OpenCV image
        FT_Bitmap& bitmap = face->glyph->bitmap;
        for (int row = 0; row < bitmap.rows; ++row) {
            for (int col = 0; col < bitmap.width; ++col) {
                // index with pitch (bytes per bitmap row), which may differ from width
                unsigned char intensity = bitmap.buffer[row * bitmap.pitch + col];
                if (intensity > 0) {
                    int px = start_point_x + face->glyph->bitmap_left + col;
                    int py = start_point_y - face->glyph->bitmap_top + row;
                    if (px < 0 || py < 0 || px >= img.cols || py >= img.rows)
                        continue; // clip glyph pixels that fall outside the image
                    cv::Vec3b& pixel = img.at<cv::Vec3b>(py, px);
                    pixel[0] = color[0] * (intensity / 255.0) + pixel[0] * (1 - intensity / 255.0);
                    pixel[1] = color[1] * (intensity / 255.0) + pixel[1] * (1 - intensity / 255.0);
                    pixel[2] = color[2] * (intensity / 255.0) + pixel[2] * (1 - intensity / 255.0);
                }
            }
        }
        start_point_x += face->glyph->advance.x >> 6; // advance is in 1/64-pixel units
    }
}
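A minimal usage sketch for the class above; the font and image paths here are placeholders for illustration:
```
#include <opencv2/opencv.hpp>
#include "cv_put_Text.hpp"

int main() {
    cv::Mat img = cv::imread("demo.png");        // any 8-bit BGR image
    PutText ft2("../Resource/fonts/SimHei.ttf"); // a TTF containing CJK glyphs
    // Draw mixed Chinese/ASCII text at (20, 40) with 24 px glyphs, in red
    ft2.putText(img, "纯臻营养护发素 OCR demo", 20, 40, 24, cv::Scalar(0, 0, 255));
    cv::imwrite("text_overlay.jpg", img);
    return 0;
}
```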
#pragma once
#include <ft2build.h>
#include FT_FREETYPE_H
#include <opencv2/opencv.hpp>
class PutText {
private:
FT_Library ft;
FT_Face face;
public:
PutText(const char* font_path);
~PutText();
    /**
     * @brief Draw text onto an image (supports Chinese)
     * @param img image to draw on
     * @param text text to overlay
     * @param x x coordinate (pixels) of the text origin
     * @param y y coordinate (pixels) of the text origin
     * @param fontSize glyph size in pixels
     * @param color text color (BGR), green by default
     *
     * @return none
     */
void putText(cv::Mat& img, const std::string& text, int x, int y, int fontSize=2, cv::Scalar color=cv::Scalar(0, 255, 0));
};
#include "ocr_engine.hpp" #include "ocr_engine.hpp"
using namespace ppocr; using namespace ppocr;
int main(int argc, char** argv) int main(int argc, char** argv){
{
std::string det_model_onnx = "../Resource/Models/ppocrv5_server_det_infer.onnx"; std::string det_model_onnx = "../Resource/Models/ppocrv5_server_det_infer.onnx";
std::string rec_model_onnx = "../Resource/Models/ppocrv5_server_rec_infer.onnx"; std::string rec_model_onnx = "../Resource/Models/ppocrv5_server_rec_infer.onnx";
std::string img_path = "../Resource/Images/20250703205038.png"; std::string img_path = "../Resource/Images/demo.png";
std::string character_dict_path = "../Resource/ppocr_keys_v5.txt"; std::string character_dict_path = "../Resource/ppocr_keys_v5.txt";
std::string front = "../Resource/fonts/SimHei.ttf";
float segm_thres=0.3; float segm_thres=0.3;
float box_thresh=0.3; float box_thresh=0.3;
ppOcrEngine ocr_engine(det_model_onnx, ppOcrEngine ocr_engine(det_model_onnx,
rec_model_onnx, rec_model_onnx,
character_dict_path, character_dict_path,
front,
segm_thres, segm_thres,
box_thresh, box_thresh,
true, true,
"fp32"); "fp16");
cv::Mat img=cv::imread(img_path); cv::Mat img=cv::imread(img_path);
ocr_engine.forward(img); ocr_engine.forward(img);
return 0; return 0;
} }
\ No newline at end of file
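The last two constructor arguments trade convenience for control: with offload_copy=true, MIGraphX copies inputs and outputs between host and device on every eval, while false uses the engine's pre-allocated HIP buffers with explicit hipMemcpy calls. A sketch of the alternative configuration, reusing the variables defined in main above:
```
    // Explicit-copy, float32 variant of the construction above (illustrative).
    ppOcrEngine ocr_engine_fp32(det_model_onnx,
                                rec_model_onnx,
                                character_dict_path,
                                font_path,
                                segm_thres,
                                box_thresh,
                                false,   // offload_copy: engine manages device buffers itself
                                "fp32"); // precision_mode: no fp16 quantization
```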
@@ -53,15 +53,12 @@ bool XsortFp32(std::vector<float> a, std::vector<float> b) {
        return a[0] < b[0];
    return false;
}

namespace ppocr{
    OcrDet::OcrDet(const std::string det_model_path,
                   std::string precision_mode,
                   bool offload_copy,
                   float segm_thres,
                   float box_thresh ){
        if(!Exists(det_model_path))
        {
            LOG_ERROR(stdout, "onnx file not exists!\n");
@@ -119,6 +116,9 @@ namespace ppocr
        options.offload_copy = offload_copy;
        migraphx::target gpuTarget = migraphx::gpu::target{};
        net.compile(gpuTarget, options);
        // Warm-up input filled with 1.0f (memset writes bytes, not float values)
        float *warm_data = (float*)malloc(this->input_shape.bytes());
        for (size_t k = 0; k < this->input_shape.elements(); ++k) warm_data[k] = 1.0f;
        if( this->offload_copy == false )
        {
            hipMalloc(&input_buffer_device, this->input_shape.bytes());
@@ -127,14 +127,23 @@ namespace ppocr
            dev_argument[input_name] = migraphx::argument{input_shape, input_buffer_device};
            dev_argument[output_name] = migraphx::argument{output_shape, output_buffer_device};
            hipMemcpy(input_buffer_device,
                      (void*)warm_data,
                      this->input_shape.bytes(),
                      hipMemcpyHostToDevice);
            // Warm up: one dummy inference so later runs exclude one-time initialization cost
            std::vector<migraphx::argument> results = net.eval(dev_argument);
        }else{
            std::unordered_map<std::string, migraphx::argument> inputData;
            inputData[input_name] = migraphx::argument{input_shape, (float *)warm_data};
            // Warm up
            std::vector<migraphx::argument> results = net.eval(inputData);
        }
        free(warm_data);
    }
    OcrDet::~OcrDet(){
        if(data)
        {
            free(data);
@@ -142,6 +151,7 @@ namespace ppocr
        }
        if( offload_copy == false )
        {
            // Release device buffers
            if(input_buffer_device)
            {
                hipFree(input_buffer_device);
@@ -158,8 +168,7 @@ namespace ppocr
        }
    }
    cv::Size OcrDet::preproc(cv::Mat img,float* data){
        float scale = 1.0/255.0;
        std::vector<float> s_mean={0.485, 0.456, 0.406};
        std::vector<float> s_stdv={0.229, 0.224, 0.225};
@@ -189,8 +198,7 @@ namespace ppocr
        return scale_r;
    }
    std::vector<std::vector<float>> OcrDet::get_mini_boxes(cv::RotatedRect box,float &ssid) {
        ssid = max(box.size.width, box.size.height);
        cv::Mat points;
        cv::boxPoints(box, points);
@@ -252,7 +260,6 @@ namespace ppocr
            auto array = get_mini_boxes(box, ssid);
            auto box_for_unclip = array;

            if (ssid < min_size) {
                continue;
@@ -260,20 +267,19 @@ namespace ppocr
            float score;
            if (use_polygon_score)
                // use the mean score over the polygon region as the box score
                score = polygon_score_acc(contours[_i], pred);
            else
                score = box_score_fast(array, pred);

            if (score < box_thresh)
                continue;

            // unclip the shrunk polygon to recover the full text boundary
            cv::RotatedRect points = unClip(box_for_unclip, det_db_unclip_ratio);
            if (points.size.height < 1.001 && points.size.width < 1.001) {
                continue;
            }

            cv::RotatedRect clipbox = points;
            auto cliparray = get_mini_boxes(clipbox, ssid);
@@ -286,22 +292,21 @@ namespace ppocr
            std::vector<std::vector<int>> intcliparray;
            for (int num_pt = 0; num_pt < 4; num_pt++) {
                std::vector<int> a{int(clampf(roundf(cliparray[num_pt][0] / float(width) *
                                                     float(dest_width)),
                                              0, float(dest_width))),
                                   int(clampf(roundf(cliparray[num_pt][1] /
                                                     float(height) * float(dest_height)),
                                              0, float(dest_height)))};
                intcliparray.push_back(a);
            }
            boxes.push_back(intcliparray);
        }
        return boxes;
    }
    std::vector<std::vector<float>> OcrDet::Mat2Vector(cv::Mat mat){
        std::vector<std::vector<float>> img_vec;
        std::vector<float> tmp;
@@ -316,8 +321,7 @@ namespace ppocr
    }

    float OcrDet::polygon_score_acc(std::vector<cv::Point> contour,
                                    cv::Mat pred){
        int width = pred.cols;
        int height = pred.rows;
        std::vector<float> box_x;
@@ -364,8 +368,7 @@ namespace ppocr
    }

    float OcrDet::box_score_fast(std::vector<std::vector<float>> box_array,
                                 cv::Mat pred) {
        auto array = box_array;
        int width = pred.cols;
        int height = pred.rows;
@@ -402,8 +405,7 @@ namespace ppocr
        return score;
    }
    cv::RotatedRect OcrDet::unClip(std::vector<std::vector<float>> box,
                                   const float &unclip_ratio){
        float distance = 1.0;
        get_contour_area(box, unclip_ratio, distance);
        ClipperLib::ClipperOffset offset;
@@ -433,8 +435,7 @@ namespace ppocr
    }

    void OcrDet::get_contour_area(const std::vector<std::vector<float>> &box,
                                  float unclip_ratio, float &distance) {
        int pts_num = 4;
        float area = 0.0f;
        float dist = 0.0f;
@@ -452,8 +453,7 @@ namespace ppocr
    std::vector<std::vector<std::vector<int>>>
    OcrDet::filter_det_res(std::vector<std::vector<std::vector<int>>> boxes,
                           float ratio_h, float ratio_w, cv::Mat srcimg){
        int oriimg_h = srcimg.rows;
        int oriimg_w = srcimg.cols;
@@ -482,8 +482,7 @@ namespace ppocr
        return root_points;
    }

    std::vector<std::vector<int>> OcrDet::order_points_clockwise(std::vector<std::vector<int>> pts){
        std::vector<std::vector<int>> box = pts;
        std::sort(box.begin(), box.end(), XsortInt);
        std::vector<std::vector<int>> leftmost = {box[0], box[1]};
@@ -500,31 +499,8 @@ namespace ppocr
        return rect;
    }
    bool OcrDet::text_recognition(const cv::Mat &srcimg,
                                  const std::vector<std::vector<std::vector<int>>> &boxes){
        if(boxes.size() == 0)
        {
            std::cout<<"Not found text roi !\n";
@@ -540,15 +516,11 @@ namespace ppocr
            rect.width = boxes[n][2][0] - boxes[n][0][0];
            rect.height = boxes[n][2][1] - boxes[n][0][1];
            text_mat = srcimg(rect).clone();
        }
        return true;
    }
    int OcrDet::postprocess(float* feature, std::vector<std::vector<std::vector<int>>> &boxes){
        int batch_s = 1;
        float conf_thres = 0.6;
        cv::Mat thres_mat = cv::Mat(cv::Size(output_height,output_width), CV_8UC1);
@@ -574,8 +546,7 @@ namespace ppocr
        return 0;
    }
    bool OcrDet::forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes){
        std::vector<std::vector<std::vector<int>>> boxes;
        cv::Size ratio = preproc(img,data);
@@ -608,8 +579,7 @@ namespace ppocr
        float ratio_h = float(net_input_height) / float(img.rows);
        text_roi_boxes = filter_det_res(boxes, ratio_h, ratio_w, img);
        // visualize_boxes(img,text_roi_boxes);
        return true;
    }
@@ -620,9 +590,7 @@ namespace ppocr
                         int channel,
                         int batch_size,
                         bool offload_copy,
                         std::string character_dict_path){
        if(!Exists(rec_model_path))
        {
            LOG_ERROR(stdout, "onnx file not exists!\n");
@@ -633,7 +601,6 @@ namespace ppocr
        this->net_input_height=image_height;
        this->net_input_channel=channel;
        this->precision_mode = precision_mode;
        migraphx::onnx_options onnx_options;
        onnx_options.map_input_dims["x"] = {1, 3, 48, 720};
@@ -663,8 +630,6 @@ namespace ppocr
        this->feature_size = output_shape.lens()[2];
        n_channel = this->output_shape.lens()[1];
        this->offload_copy = offload_copy;
        migraphx::compile_options options;
@@ -673,23 +638,37 @@ namespace ppocr
        migraphx::target gpuTarget = migraphx::gpu::target{};
        net.compile(gpuTarget, options);
        // Warm-up input filled with 1.0f (memset writes bytes, not float values)
        float *warm_data = (float*)malloc(this->input_shape.bytes());
        for (size_t k = 0; k < this->input_shape.elements(); ++k) warm_data[k] = 1.0f;
        if( this->offload_copy == false )
        {
            hipMalloc(&input_buffer_device, this->input_shape.bytes());
            hipMalloc(&output_buffer_device, this->output_shape.bytes());
            output_buffer_host = (void*)malloc(this->output_shape.bytes());
            dev_argument[input_name] = migraphx::argument{input_shape, input_buffer_device};
            dev_argument[output_name] = migraphx::argument{output_shape, output_buffer_device};
            hipMemcpy(input_buffer_device,
                      (void*)warm_data,
                      this->input_shape.bytes(),
                      hipMemcpyHostToDevice);
            // Warm up
            std::vector<migraphx::argument> results = net.eval(dev_argument);
        }else{
            std::unordered_map<std::string, migraphx::argument> inputData;
            inputData[input_name] = migraphx::argument{input_shape, (float *)warm_data};
            // Warm up
            std::vector<migraphx::argument> results = net.eval(inputData);
        }
        free(warm_data);
        std::ifstream infile;
        infile.open(character_dict_path,std::ios::in);
        assert(infile.is_open());
        std::string k_work="";
        k_words.clear();
        // Read the character dictionary, one entry per line
        while (std::getline(infile,k_work))
        {
            k_words.push_back(k_work);
@@ -697,8 +676,7 @@ namespace ppocr
        system("chcp 65001");
    }
    CTCDecode::~CTCDecode(){
        if(data)
        {
            free(data);
@@ -723,8 +701,7 @@ namespace ppocr
        }
    }

    bool CTCDecode::preproc(cv::Mat img,float* data,int img_w,int img_h){
        if (img.empty())
        {
            std::cout<<"WARNING image is empty!\n";
@@ -754,25 +731,21 @@ namespace ppocr
                    data[i*img_w+j] = (template_mat.at<cv::Vec3b>(i, j)[2]*scale-0.5)/0.5;
                    data[i*img_w+j+img_h*img_w] = (template_mat.at<cv::Vec3b>(i, j)[1]*scale-0.5)/0.5;
                    data[i*img_w+j+2*img_h*img_w] = (template_mat.at<cv::Vec3b>(i, j)[0]*scale-0.5)/0.5;
                }
            }
        }
        return true;
    }
    std::string CTCDecode::decode(std::vector<float>& probs,std::vector<int>& indexs,float& mean_prob){
        int ignored_tokens=0;
        std::string text="";
        std::vector<float> n_probs;
        std::vector<int> n_indexs;
        int eff_text_num=0;
        for (int i=0;i<n_channel;i++)
        {
            if(indexs[i]==ignored_tokens)
            {
                continue;
@@ -784,7 +757,6 @@ namespace ppocr
            mean_prob+=probs[i];
            text+=k_words[indexs[i]-1];
            eff_text_num++;
        }
@@ -801,38 +773,26 @@ namespace ppocr
    }

    std::string CTCDecode::postprocess(float* feature)
    {
        std::vector<float> probs;
        std::vector<int> indexs;
        float prob=0.;
        for (int i=0;i<n_channel;i++)
        {
            float* c_feat = feature+i*feature_size;
            int max_index = argmax<float*>(c_feat,c_feat+feature_size);
            float max_pro = c_feat[max_index];
            probs.push_back(max_pro);
            indexs.push_back(max_index);
        }
        std::string text = decode(probs,indexs,prob);
        std::cout<<"ocr res :"<<text<<" "<<prob<<"\n";
        return text;
    }
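The postprocess/decode pair above is a greedy CTC decoder: take the argmax index at each time step, drop the blank token (index 0), and merge what remains into a string while averaging the per-step probabilities into a line confidence. A minimal standalone sketch of the same idea (function and variable names here are illustrative, not the project's API):
```
#include <string>
#include <vector>

// Greedy CTC decode over per-step argmax indices. Index 0 is the blank token;
// consecutive repeats of the same index are collapsed, as in standard CTC.
std::string ctc_greedy_decode(const std::vector<int>& indices,
                              const std::vector<std::string>& dict) {
    const int blank = 0;
    std::string text;
    int prev = blank;
    for (int idx : indices) {
        if (idx != blank && idx != prev)
            text += dict[idx - 1]; // dict holds the non-blank tokens, hence the -1
        prev = idx;
    }
    return text;
}
```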
    std::string CTCDecode::forward(cv::Mat& img){
        preproc(img,data,net_input_width,net_input_height);
        if( this->offload_copy ==false )
        {
            hipMemcpy(input_buffer_device,
@@ -846,8 +806,6 @@ namespace ppocr
                      (void*)output_buffer_device,
                      output_shape.bytes(),
                      hipMemcpyDeviceToHost);
            std::string text = postprocess((float *)output_buffer_host);
            return text;
        }else{
@@ -856,46 +814,65 @@ namespace ppocr
            std::vector<migraphx::argument> results = net.eval(inputData);
            migraphx::argument result = results[0];
            std::string text = postprocess((float *)result.data());
            return text;
        }
    }
    ppOcrEngine::ppOcrEngine(const std::string &det_model_path,
                             const std::string &rec_model_path,
                             const std::string &character_dict_path,
                             const std::string font_path,
                             float segm_thres,
                             float box_thresh,
                             bool offload_copy,
                             std::string precision_mode
                             ){
        text_detector = std::make_shared<OcrDet>(det_model_path,precision_mode,offload_copy,segm_thres,box_thresh);
        text_recognizer = std::make_shared<CTCDecode>(rec_model_path,precision_mode,720,48,3,1,offload_copy,character_dict_path);
        ft2 = std::make_shared<PutText>(font_path.c_str());
    }

    ppOcrEngine::~ppOcrEngine(){
        ;
    }
    void ppOcrEngine::visualize_boxes(cv::Mat &srcimg,
                                      const std::vector<std::vector<std::vector<int>>> &boxes) {
        std::vector<std::vector<cv::Point>> contours;
        for (const auto& box : boxes) {
            std::vector<cv::Point> pts;
            for (const auto& point : box) {
                pts.emplace_back(point[0], point[1]);
            }
            contours.push_back(pts);
        }
        cv::polylines(
            srcimg,
            contours,
            true,                   // closed polylines
            cv::Scalar(0, 255, 0),  // green
            2,                      // line width
            cv::LINE_8              // 8-connected line type
        );
    }
    cv::Mat ppOcrEngine::visualize_text(std::vector<std::string> texts,std::vector<cv::Point> points, cv::Mat &img)
    {
        assert(texts.size() == points.size() && "texts size != points size");
        // White canvas, same size as the source; each text is drawn at its box origin
        cv::Mat draw_img = cv::Mat(img.size(), CV_8UC3, cv::Scalar(255,255,255));
        for(size_t i = 0; i < texts.size(); i++)
        {
            ft2->putText(draw_img, texts[i], points[i].x, points[i].y, 15);
        }
        // Source image on the left, recognized texts on the right
        cv::Mat concat_img;
        cv::hconcat(img, draw_img, concat_img);
        return concat_img;
    }
    std::vector<std::string> ppOcrEngine::forward(cv::Mat &srcimg){
        std::vector<std::vector<std::vector<int>>> text_roi_boxes;
        std::vector<std::string> text_vec;
        auto start = std::chrono::high_resolution_clock::now();
        text_detector->forward(srcimg,text_roi_boxes);
@@ -904,7 +881,8 @@ namespace ppocr
            std::cout<<"Not found text roi !\n";
            return std::vector<std::string>();
        }
        std::cout<<"text_roi_boxes.size(): "<<text_roi_boxes.size()<<"\n";
        std::vector<cv::Point> points;
        for (int n = 0; n < text_roi_boxes.size(); n++) {
            cv::Rect rect;
@@ -920,10 +898,14 @@ namespace ppocr
            text_roi_mat = srcimg(rect).clone();
            std::string text = text_recognizer->forward(text_roi_mat);
            text_vec.push_back(text);
            points.push_back(cv::Point(rect.x,rect.y));
        }
        auto end = std::chrono::high_resolution_clock::now();
        auto duration_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
        std::cout<<"[Time info] elapsed: "<< duration_ms.count() <<" ms\n";
        visualize_boxes(srcimg,text_roi_boxes);
        cv::Mat res_img = visualize_text(text_vec,points, srcimg);
        cv::imwrite("res.jpg",res_img);
        return text_vec;
    }
......
@@ -10,15 +10,9 @@
#include "Filesystem.h"
#include "SimpleLog.h"
#include "clipper.h"
#include "cv_put_Text.hpp"

namespace ppocr{
    class CTCDecode
    {
    private:
@@ -38,7 +32,6 @@ namespace ppocr{
        void* output_buffer_device;
        void* output_buffer_host;
        migraphx::shape input_shape;
        migraphx::shape output_shape;
        std::string input_name;
@@ -51,7 +44,7 @@ namespace ppocr{
    public:
        CTCDecode(std::string rec_model_path,
                  std::string precision_mode="fp16",
                  int image_width=480,
                  int image_height=48,
                  int channel=3,
@@ -61,7 +54,9 @@ namespace ppocr{
        ~CTCDecode();

        /**
         * @brief Text recognition and decoding API; predicts up to 90 characters per line over an 18385-character dictionary
         * @param img input image
         * @return the decoded string
         */
        std::string forward(cv::Mat& img);
@@ -93,7 +88,6 @@ namespace ppocr{
         * @return the decoded text on success, "" on failure
         */
        std::string decode(std::vector<float>& probs,std::vector<int>& indexs,float& mean_prob);
    };

    class OcrDet
@@ -117,25 +111,33 @@ namespace ppocr{
        float* data;
        // Device/host buffers, allocated only when offload_copy is false
        std::unordered_map<std::string, migraphx::argument> dev_argument;
        void* input_buffer_device;
        void* output_buffer_device;
        void* output_buffer_host;
        // Postprocess
        int n_channel;
        int feature_size;   // single-channel feature-map size, e.g. for output [1,3,32,32] feature_size = 32*32
        int output_width;
        int output_height;
        int max_candidates; // maximum number of candidate contours
    public:
        OcrDet(std::string det_model_path,
               std::string precision_mode="fp16",
               bool offload_copy = true,
               float segm_thres = 0.3,
               float box_thresh = 0.7);
        ~OcrDet();
        /**
         * @brief Text-detection inference API
         * @param img source image
         * @param text_roi_boxes detected text regions; each box is
         *        [[tl.x, tl.y], [tr.x, tr.y], [br.x, br.y], [bl.x, bl.y]]
         *        (top-left, top-right, bottom-right, bottom-left corners)
         * @return true on success, false on failure
         */
        bool forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes);
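        // Illustrative use (variable names hypothetical): crop the first detected box
        //   std::vector<std::vector<std::vector<int>>> boxes;
        //   det.forward(img, boxes);
        //   cv::Rect r(boxes[0][0][0], boxes[0][0][1],
        //              boxes[0][2][0] - boxes[0][0][0],
        //              boxes[0][2][1] - boxes[0][0][1]);
        //   cv::Mat roi = img(r).clone();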
    private:
@@ -159,17 +161,18 @@ namespace ppocr{
         */
        int postprocess(float* feature, std::vector<std::vector<std::vector<int>>> &boxes);

        /**
         * @brief Postprocess step: extract text regions
         * @param pred probability map (DBNet segments the text regions; high responses mark text)
         * @param bitmap binary map derived from pred by morphology; combined with pred to compute the mean box score
         * @return the extracted text boxes
         */
        std::vector<std::vector<std::vector<int>>> boxes_from_bitmap(
            const cv::Mat pred, const cv::Mat bitmap, const float &box_thresh,
            const float &det_db_unclip_ratio, const bool &use_polygon_score);

        std::vector<std::vector<float>> Mat2Vector(cv::Mat mat);

        /**
         * @brief Mean score over a polygonal region
         * @param contour contour points of the text region
@@ -238,9 +241,6 @@ namespace ppocr{
         */
        float box_score_fast(std::vector<std::vector<float>> box_array,cv::Mat pred);

        bool text_recognition(const cv::Mat &srcimg,
                              const std::vector<std::vector<std::vector<int>>> &boxes);
@@ -250,16 +250,21 @@ namespace ppocr{
    private:
        std::shared_ptr<OcrDet> text_detector;
        std::shared_ptr<CTCDecode> text_recognizer;
        std::shared_ptr<PutText> ft2;
    public:
        ppOcrEngine(const std::string &det_model_path,
                    const std::string &rec_model_path,
                    const std::string &character_dict_path,
                    const std::string font_path,
                    const float segm_thres=0.3,
                    const float box_thresh=0.7,
                    bool offload_copy =true,
                    std::string precision_mode = "fp16");
        ~ppOcrEngine();
        std::vector<std::string> forward(cv::Mat &srcimg);
        cv::Mat visualize_text(std::vector<std::string> texts,std::vector<cv::Point> points, cv::Mat &img);
        void visualize_boxes(cv::Mat &srcimg,
                             const std::vector<std::vector<std::vector<int>>> &boxes);
    };
}