Commit 22b7c574 authored by liuhy's avatar liuhy
Browse files

paddleOcr v5

parents
#! /bin/sh
############### Ubuntu ###############
# Reference: https://docs.opencv.org/3.4.11/d7/d9f/tutorial_linux_install.html
apt-get install build-essential -y
apt-get install cmake git libgtk2.0-dev pkg-config libavcodec-dev libavformat-dev libswscale-dev -y
apt-get install python-dev python-numpy libtbb2 libtbb-dev libjpeg-dev libpng-dev libtiff-dev libjasper-dev libdc1394-22-dev -y # packages needed for image processing, optional
############### CentOS ###############
# yum install gcc gcc-c++ gtk2-devel gimp-devel gimp-devel-tools gimp-help-browser zlib-devel libtiff-devel libjpeg-devel libpng-devel gstreamer-devel libavc1394-devel libraw1394-devel libdc1394-devel jasper-devel jasper-utils swig python libtool nasm -y
############################ Install dependencies online ###############################
#cd ./3rdParty
#pip install rbuild-master.tar.gz
############################ Install dependencies offline ###############################
# Install build dependencies
cd ./3rdParty/rbuild_depend
# pip install click-6.6-py2.py3-none-any.whl
# pip install six-1.15.0-py2.py3-none-any.whl
pip install subprocess32-3.5.4.tar.gz
pip install cget-0.1.9.tar.gz
# pip install shapely-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
# pip install pyclipper-1.3.0.post6-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
# Install rbuild
cd ../
pip install rbuild-master.tar.gz
52 comment=e8d4259f9ab787b512b9aa1203fc816fb9f19231
# Minimum required CMake version
cmake_minimum_required(VERSION 3.5)
# Project name
project(ppOcrV5)
# Compiler: hipcc is required to build the HIP/MIGraphX inference code
set(CMAKE_CXX_COMPILER hipcc)
# BUGFIX: append to the flags as a single quoted string. The original
# unquoted form turns CMAKE_CXX_FLAGS into a ;-separated list, which ends
# up as one bogus "flag1;-std=c++17" token on the compiler command line
# whenever CMAKE_CXX_FLAGS is non-empty.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
# Use the conventional capitalized build-type name (Release/Debug).
set(CMAKE_BUILD_TYPE Release)
# Header search paths
set(INCLUDE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/Src/
    $ENV{DTKROOT}/include/
    ${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility
    ${CMAKE_CURRENT_SOURCE_DIR}/3rdParty/opencv/include)
include_directories(${INCLUDE_PATH})
# Library search paths
set(LIBRARY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/3rdParty/opencv/lib
    $ENV{DTKROOT}/lib/)
link_directories(${LIBRARY_PATH})
# Libraries to link against
set(LIBRARY opencv_core
    opencv_imgproc
    opencv_imgcodecs
    opencv_dnn
    migraphx
    migraphx_gpu
    migraphx_onnx)
link_libraries(${LIBRARY})
# Translation units of the sample
set(SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/Src/main.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility/Filesystem.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility/clipper.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/Src/ocr_engine.cpp)
# Executable target
add_executable(ppOcrV5 ${SOURCE_FILES})
# 概述
PP-OCRv5 是PP-OCR新一代文字识别解决方案,该方案聚焦于多场景、多文字类型的文字识别。在文字类型方面,PP-OCRv5支持简体中文、中文拼音、繁体中文、英文、日文5大主流文字类型,在场景方面,PP-OCRv5升级了中英复杂手写体、竖排文本、生僻字等多种挑战性场景的识别能力。在内部多场景复杂评估集上,PP-OCRv5较PP-OCRv4端到端提升13个百分点,本sample适配了PPOcrV5字符检测和识别模型,并使用MIGraphX 的C++接口实现推理。
## 模型简介
### 文本检测
文本检测使用了dbnet(论文地址:https://arxiv.org/pdf/1911.08947),网络结构:![alt text](Images/DBNet.png),模型输出概率图,并用Vatti Clipping算法对字符区域多边形简化处理,sample中借助Clipping 库。 sample中使用动态shape(N,3,H,C),最大输入shape是[1,3,640,640],模型地址:Resource/Models/ppocrv5_server_det_infer.onnx
### 文本识别
文本识别使用了CRNN+CTCDecode(https://arxiv.org/pdf/2009.09941),网络结构:![alt text](Images/CRNN.png),sample中使用了动态shape (N,3,48,W),最大输入shape是[1,3,48,720],模型地址:Resource/Models/ppocrv5_server_rec_infer.onnx
## 预处理
### 检测模型预处理
检测模型输入数据预处理:
- 图片等比缩放,填充(沿着右、下填充)
- 图片归一化,减均值除方差
- transpose ,MigraphX的输入数据排布顺序为[N,C,H,W]
```c++
/**
 * @brief Detection-model input preprocessing:
 * pixel = (src_pixel*scale - mean) / std with scale = 1/255,
 * mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225];
 * the image is resized to the fixed network input size and written CHW.
 * @param img  source image
 * @param data output buffer (3*net_input_height*net_input_width floats)
 * @return the per-axis scale between network input and source image
 * NOTE(review): cv::Size stores int members, so the fractional ratios
 * assigned below are truncated to 0/1 — confirm whether any caller relies
 * on this value (OcrDet::forward recomputes the ratios itself).
 */
cv::Size OcrDet::preproc(cv::Mat img,float* data)
{
float scale = 1.0/255.0;
std::vector<float> s_mean={0.485, 0.456, 0.406};
std::vector<float> s_stdv={0.229, 0.224, 0.225};
if(img.empty())
{
std::cout<<"Source image is empty!\n";
return cv::Size(1.0,1.0);
}
cv::Mat res_img;
cv::Size scale_r;
// NOTE(review): float ratios truncated to int by cv::Size (see above).
scale_r.width = float(net_input_width)/float(img.cols);
scale_r.height = float(net_input_height)/float(img.rows);
// Resize to the fixed network input size.
// NOTE(review): this stretches width and height independently — it is NOT
// the aspect-preserving resize the original comment claimed; confirm intent.
cv::resize(img,res_img,cv::Size(net_input_width,net_input_height));
int iw = res_img.cols;
int ih = res_img.rows;
memset(data,0.0,3*iw*ih*sizeof(float));
// HWC -> CHW: Vec3b[0] (B) goes to plane 0, [1] (G) to plane 1, [2] (R) to
// plane 2. NOTE(review): mean[0]=0.485 (usually the R-channel mean) is thus
// applied to the B plane — verify the channel convention against training.
for(int i=0;i<net_input_height;i++)
{
for(int j=0;j<net_input_width;j++)
{
data[i*net_input_width+j+2*net_input_height*net_input_width] = (float(res_img.at<cv::Vec3b>(i, j)[2])*scale-s_mean[2])/s_stdv[2];
data[i*net_input_width+j+net_input_height*net_input_width] = (float(res_img.at<cv::Vec3b>(i, j)[1])*scale-s_mean[1])/s_stdv[1];
data[i*net_input_width+j] = (float(res_img.at<cv::Vec3b>(i, j)[0])*scale-s_mean[0])/s_stdv[0];
}
}
return scale_r ;
}
```
### 字符识别模型预处理
字符识别模型输入数据预处理:
- 等比缩放,保留H维度的原始比例,填充(沿着右、下)
- 图片归一化,均值方差默认为0.5
- transpose ,MigraphX的输入数据排布顺序为[N,C,H,W]
```c++
/**
 * @brief Recognition-model input preprocessing.
 * pixel = (src_pixel*scale - 0.5)/0.5 with scale = 1/255, i.e. values are
 * normalized to [-1, 1]. The crop is scaled so its height matches img_h
 * (aspect preserved), padded on the right with black up to img_w, and
 * written CHW with the channel order reversed (BGR -> RGB).
 * @param img   character crop to recognize
 * @param data  output buffer of batch_size*3*img_w*img_h floats
 * @param img_w model input width
 * @param img_h model input height
 * @return success: true; failure (empty input): false
 */
bool CTCDecode::preproc(cv::Mat img,float* data,int img_w,int img_h)
{
    if (img.empty())
    {
        std::cout<<"WARNING image is empty!\n";
        return false;
    }
    const float scale = 1.0f/255.0f;
    const int iw = img.cols;
    const int ih = img.rows;
    // Scale so the height matches img_h; the width keeps the aspect ratio
    // and the remainder up to img_w is padded below.
    const float ratio = std::min(img_h*1.0f/ih, img_w*1.0f/iw);
    const int nw = static_cast<int>(iw*ratio);
    const int nh = img_h;
    cv::Mat res_mat;
    cv::resize(img, res_mat, cv::Size(nw,nh));
    // Pad along the right/bottom with black so the plane is (img_h, img_w).
    // (copyMakeBorder allocates dst itself; no need to pre-fill it.)
    cv::Mat template_mat;
    cv::copyMakeBorder(res_mat, template_mat, 0, img_h-nh, 0, img_w-nw,
                       cv::BORDER_CONSTANT, cv::Scalar(0,0,0));
    memset(data, 0, this->batch_size*3*img_w*img_h*sizeof(float));
    const int plane = img_w*img_h;
    for(int b = 0; b < this->batch_size; b++)
    {
        // BUGFIX: the original wrote every batch into offset 0, leaving
        // batch slots 1..N-1 zeroed; each slot now gets its own 3*H*W block.
        // (Identical behavior for the default batch_size == 1.)
        float* dst = data + b*3*plane;
        for(int i = 0; i < img_h; i++)
        {
            for(int j = 0; j < img_w; j++)
            {
                const cv::Vec3b px = template_mat.at<cv::Vec3b>(i, j);
                // HWC(BGR) -> CHW(RGB), normalized to [-1, 1].
                dst[i*img_w+j]         = (px[2]*scale-0.5f)/0.5f;
                dst[i*img_w+j+plane]   = (px[1]*scale-0.5f)/0.5f;
                dst[i*img_w+j+2*plane] = (px[0]*scale-0.5f)/0.5f;
            }
        }
    }
    return true ;
}
```
## 类介绍
ppOcrEngine 封装了对外提供的API,OcrDet为文本检测类,CTCDecode为文本识别类。文本检测和文本识别在ppOcrEngine中是两个智能指针变量,在forward,首先调用text_detector检测到图片中的所有字符区域,然后分别将检测到的区域传入到text_recognizer中识别字符区域的内容。
```c++
// Facade for the OCR pipeline: owns the text detector (DBNet) and the text
// recognizer (CRNN+CTC); forward() runs detection and then recognition.
class ppOcrEngine {
private:
std::shared_ptr<OcrDet> text_detector;      // text-region detection
std::shared_ptr<CTCDecode> text_recognizer; // text recognition
public:
/**
 * @brief OCR engine initialization
 * @param det_model_path path of the text-detection model
 * @param rec_model_path path of the text-recognition model
 * @param character_dict_path path of the character dictionary
 * @param segm_thres pixel segmentation threshold
 * @param box_thresh text-region box threshold
 * @param offload_copy memory-copy mode; two modes are supported:
 *        offload_copy=true / offload_copy=false. When true, no explicit
 *        memory copies are needed. When false, device input/output buffers
 *        must be pre-allocated; preprocessed data is copied to device memory
 *        before inference, and the model output is copied back from device
 *        memory after inference.
 * @param precision_mode precision mode, supported: fp32, fp16
 *
 * @return NONE
 */
ppOcrEngine(const std::string &det_model_path,
const std::string &rec_model_path,
const std::string &character_dict_path,
const float segm_thres=0.3,
const float box_thresh=0.7,
bool offload_copy =true,
std::string precision_mode = "fp32") ;
~ppOcrEngine();
// Detects text regions in srcimg and recognizes each one; returns the
// recognized strings.
std::vector<std::string> forward(cv::Mat &srcimg);
};
// Text recognition (CRNN + CTC decoding) wrapper around a MIGraphX program.
class CTCDecode
{
private:
// Preprocessed inference image buffer.
float* data;
std::unordered_map<std::string, migraphx::argument> device_data;
migraphx::program net;
int batch_size;
int net_input_width;
int net_input_height;
int net_input_channel;
bool offload_copy;
std::string precision_mode;
// Pre-bound device arguments, used when offload_copy == false.
std::unordered_map<std::string, migraphx::argument> dev_argument;
void* input_buffer_device;
void* output_buffer_device;
void* output_buffer_host;
migraphx::shape input_shape;
migraphx::shape output_shape;
std::string input_name;
std::string output_name;
// Postprocess: n_channel -> model output channel count,
// feature_size -> feature size of one channel.
int n_channel;
int feature_size;
std::vector<std::string> k_words; // character dictionary
public:
CTCDecode(std::string rec_model_path,
std::string precision_mode="fp32",
int image_width=480,
int image_height=48,
int channel=3,
int batch_size = 1,
bool offload_copy = true,
std::string character_dict_path="./ppocr_keys_v5.txt");
~CTCDecode();
/**
 * @brief Character recognition; predicts at most 90 characters drawn from
 * a dictionary of 18385 characters.
 */
std::string forward(cv::Mat& img);
private:
/**
 * @brief Preprocessing
 * pixel = (src_img*scale-0.5)/0.5;
 * scale = 1.0/255
 * @param img character image
 * @param data preprocessed output
 * @param img_w model input width
 * @param img_h model input height
 * @return success: true, failure: false
 */
bool preproc(cv::Mat img,float* data,int img_w=480,int img_h=48);
/**
 * @brief Model postprocessing: take the most probable character of each row
 * and join them into a sentence of at most 90 characters; the model output
 * shape is [1,90,18385].
 * @param feature model output
 * @return success: text, failure: ""
 */
std::string postprocess(float* feature);
/**
 * @brief Decoding: maps model predictions onto the character set.
 * @param probs maximum probabilities predicted by the model
 * @param indexs indices of the maximum probabilities
 * @param mean_prob average probability of the predicted sentence
 * @return success: text, failure: ""
 */
std::string decode(std::vector<float>& probs,std::vector<int>& indexs,float& mean_prob);
};
// Text-region detection (DBNet) wrapper around a MIGraphX program.
class OcrDet
{
private:
std::string precision_mode;
bool offload_copy;
migraphx::program net;
migraphx::shape input_shape;
migraphx::shape output_shape;
std::string input_name;
std::string output_name;
int det_batch_size;
int data_size ;
float segm_thres;
float box_thres;
int net_input_width;
int net_input_height;
int net_input_channel;
float* data; // preprocessed input buffer
//Allocate device buffer and host buffer,if offload_copy is false
std::unordered_map<std::string, migraphx::argument> dev_argument;
void* input_buffer_device;
void* output_buffer_device;
void* output_buffer_host;
//postprocess
int n_channel;
int feature_size; //single channel feature map size.
int output_width;
int output_height;
int max_candidates;//maximun number of candidates contours.
public:
OcrDet(std::string det_model_path,
std::string precision_mode="float32",
bool offload_copy = true,
float segm_thres = 0.3,
float box_thresh = 0.7);
~OcrDet();
// Detects text regions in img; the filtered coordinates are returned
// through text_roi_boxes.
bool forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes);
private:
/**
 * @brief Preprocessing
 * pixel = (scale*src_img*mean/std);
 * scale = 1.0/255
 * mean = [0.485, 0.456, 0.406]
 * std = [0.229, 0.224, 0.225]
 * @param img character image
 * @param data preprocessed output
 * @return success: scaling ratios for the w and h dimensions
 */
cv::Size preproc(cv::Mat img,float* data);
/**
 * @brief Postprocessing: extracts text regions from the binary map
 * predicted by the model.
 * @param feature model prediction tensor (DBNet is used for detection here)
 * @param boxes text-region coordinates
 * @return success: 0, failure: -1
 */
int postprocess(float* feature, std::vector<std::vector<std::vector<int>>> &boxes);
int boxes_from_bitmap(cv::Mat& bit_map,std::vector<T_BOX>& box);
std::vector<std::vector<std::vector<int>>>boxes_from_bitmap(
const cv::Mat pred, const cv::Mat bitmap, const float &box_thresh,
const float &det_db_unclip_ratio, const bool &use_polygon_score);
std::vector<std::vector<float>> Mat2Vector(cv::Mat mat);
/**
 * @brief Computes the average score over a polygonal region.
 * @param contour contour points of the text region
 * @param pred binary map predicted by the model
 * @return score
 */
float polygon_score_acc(std::vector<cv::Point> contour,cv::Mat pred);
/**
 * @brief Expands/shrinks the predicted region by unclip_ratio to find a
 * better-fitting text region.
 * @param box text-region coordinates
 * @param pred binary map predicted by the model
 * @return the processed text region
 */
cv::RotatedRect unClip(std::vector<std::vector<float>> box,
const float &unclip_ratio);
/**
 * @brief Computes the offset distance
 * distance = area * unclip_ratio / dist;
 * area = ∑(x_i*y_{i+1} - x_{i+1}*y_i)
 * dist = sqrtf(dx * dx + dy * dy)
 *
 * @param box text-region coordinates
 * @param unclip_ratio scaling ratio
 * @param distance offset distance
 * @return NONE
 */
void get_contour_area(const std::vector<std::vector<float>> &box,
float unclip_ratio, float &distance) ;
/**
 * @brief Filters invalid text regions: first maps boxes back onto the
 * original image, then drops invalid regions.
 * @param boxes text-region coordinates
 * @param ratio_h vertical scaling ratio
 * @param ratio_w horizontal scaling ratio
 * @param srcimg original image
 *
 * @return valid text-region coordinates
 */
std::vector<std::vector<std::vector<int>>> filter_det_res(std::vector<std::vector<std::vector<int>>> boxes,
float ratio_h, float ratio_w, cv::Mat srcimg);
/**
 * @brief Sorts text regions top-to-bottom, left-to-right.
 * @param pts text-region coordinates
 *
 * @return valid text-region coordinates
 */
std::vector<std::vector<int>> order_points_clockwise(std::vector<std::vector<int>> pts);
/**
 * @brief Gets the minimal bounding-rectangle coordinates.
 * @param box coordinates of the region's minimal enclosing rectangle
 * @param ssid longest side of the box
 * @return valid text-region coordinates
 */
std::vector<std::vector<float>> get_mini_boxes(cv::RotatedRect box,float &ssid) ;
/**
 * @brief Computes the average score of the box region on the bitmap.
 * @param box_array text region predicted by the model
 * @param pred binary map predicted by the model
 * @return score
 */
float box_score_fast(std::vector<std::vector<float>> box_array,cv::Mat pred) ;
void visualize_boxes(const cv::Mat &srcimg,
const std::vector<std::vector<std::vector<int>>> &boxes) ;
bool text_recognition(const cv::Mat &srcimg,
const std::vector<std::vector<std::vector<int>>> &boxes);
};
```
## 推理
### 字符检测模型推理
```c++
// Detects text regions in img; returns them (filtered and mapped back to
// the source image) through text_roi_boxes.
bool OcrDet::forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes)
{
std::vector<std::vector<std::vector<int>>> boxes;
// Input preprocessing (resize + normalize + CHW layout).
// NOTE(review): the returned `ratio` is never used; the ratios are
// recomputed below (cv::Size holds ints and truncates them anyway).
cv::Size ratio = preproc(img,data);
/*
Two data-copy modes are supported: offload_copy=true and offload_copy=false.
When offload_copy is true, no explicit memory copies are needed. When it is
false, device input/output buffers must be pre-allocated; preprocessed data
is copied to device memory before inference, and the model output is copied
back from device memory afterwards for postprocessing.
*/
if( this->offload_copy ==false )
{
hipMemcpy(input_buffer_device,
(void*)data,
this->input_shape.bytes(),
hipMemcpyHostToDevice);
// eval() runs inference into the pre-bound device output argument.
std::vector<migraphx::argument> results = net.eval(dev_argument);
hipMemcpy(output_buffer_host,
(void*)output_buffer_device,
output_shape.bytes(),
hipMemcpyDeviceToHost);
postprocess((float *)output_buffer_host,boxes);
std::cout<<"copy mode ..."<<std::endl;
}else{
std::unordered_map<std::string, migraphx::argument> inputData;
inputData[input_name] = migraphx::argument{input_shape, (float *)data};
std::vector<migraphx::argument> results = net.eval(inputData);
migraphx::argument result = results[0] ; //get output data
postprocess((float *)result.data(),boxes);
std::cout<<"offload copy mode ..."<<std::endl;
}
// Scaling ratios between network input and source image.
float ratio_w = float(net_input_width) / float(img.cols);
float ratio_h = float(net_input_height) / float(img.rows);
// Filter invalid boxes and map them back to the source image.
text_roi_boxes = filter_det_res(boxes, ratio_h, ratio_w, img);
// Visualize the detection result.
visualize_boxes(img,text_roi_boxes);
// TextRecognition(img,boxes);
return true;
}
```
### 字符识别推理
```c++
/**
 * @brief Runs text recognition on one character-region crop and returns the
 * decoded string.
 * @param img character-region crop
 * @return recognized text ("" on decode failure)
 */
std::string CTCDecode::forward(cv::Mat& img)
{
    // Input preprocessing (resize + normalize + CHW layout).
    preproc(img, data, net_input_width, net_input_height);
    /*
    Two data-copy modes are supported: offload_copy=true and
    offload_copy=false. When true, no explicit memory copies are needed.
    When false, device input/output buffers are pre-allocated; preprocessed
    data is copied to device memory before inference and the model output is
    copied back afterwards for postprocessing.
    */
    if( this->offload_copy == false )
    {
        hipMemcpy(input_buffer_device,
                  (void*)data,
                  this->input_shape.bytes(),
                  hipMemcpyHostToDevice);
        // eval() runs inference into the pre-bound device output argument.
        std::vector<migraphx::argument> results = net.eval(dev_argument);
        hipMemcpy(output_buffer_host,
                  (void*)output_buffer_device,
                  output_shape.bytes(),
                  hipMemcpyDeviceToHost);
        // BUGFIX: postprocess must read the HOST copy of the output. The
        // original passed output_buffer_device, a device pointer that is not
        // dereferenceable on the CPU (OcrDet::forward correctly uses the
        // host buffer here).
        std::string text = postprocess((float *)output_buffer_host);
        return text;
    }else{
        std::unordered_map<std::string, migraphx::argument> inputData;
        inputData[input_name] = migraphx::argument{input_shape, (float *)data};
        std::vector<migraphx::argument> results = net.eval(inputData);
        migraphx::argument result = results[0] ;
        std::string text = postprocess((float *)result.data());
        // std::cout<<"ctc: offload copy mode ..."<<std::endl;
        return text;
    }
}
```
# Ocrv5 API调用说明
API调用步骤如下:
- 类实例化
- 识别接口调用
例:
```c++
/**
 * @brief Sample entry point: builds the OCR engine and runs detection plus
 * recognition on a single image.
 */
int main(int argc, char** argv)
{
    std::string det_model_onnx = "../Resource/Models/ppocrv5_server_det_infer.onnx";
    std::string rec_model_onnx = "../Resource/Models/ppocrv5_server_rec_infer.onnx";
    std::string img_path = "../Resource/Images/20250703205038.png";
    std::string character_dict_path = "../Resource/ppocr_keys_v5.txt";
    float segm_thres = 0.3;  // pixel segmentation threshold
    float box_thresh = 0.3;  // text-region box threshold
    ppOcrEngine ocr_engine(det_model_onnx,
                           rec_model_onnx,
                           character_dict_path,
                           segm_thres,
                           box_thresh,
                           true,     // offload_copy
                           "fp32");  // precision mode
    cv::Mat img = cv::imread(img_path);
    // BUGFIX: fail fast if the image cannot be read instead of feeding an
    // empty cv::Mat into the engine.
    if (img.empty())
    {
        std::cout << "Failed to read image: " << img_path << std::endl;
        return -1;
    }
    ocr_engine.forward(img);
    return 0;
}
```
sample支持两种精度推理(fp32和fp16,默认是fp32),精度和内存拷贝方式分别通过ocr_engine的构造函数传入参数来设置。
\ No newline at end of file
# 概述
PP-OCRv5 是PP-OCR新一代文字识别解决方案,该方案聚焦于多场景、多文字类型的文字识别。在文字类型方面,PP-OCRv5支持简体中文、中文拼音、繁体中文、英文、日文5大主流文字类型,在场景方面,PP-OCRv5升级了中英复杂手写体、竖排文本、生僻字等多种挑战性场景的识别能力。在内部多场景复杂评估集上,PP-OCRv5较PP-OCRv4端到端提升13个百分点,本sample适配了PPOcrV5字符检测和识别模型,并使用MIGraphX 5.0 的python接口实现推理。
## 模型简介
### 文本检测
文本检测使用了dbnet(论文地址:https://arxiv.org/pdf/1911.08947),网络结构:![alt text](Images/DBNet.png),模型输出概率图,并用Vatti Clipping算法对字符区域多边形简化处理。 sample中使用动态shape(N,3,H,C),最大输入shape是[1,3,640,640],模型地址:Resource/Models/ppocrv5_server_det_infer.onnx
### 文本识别
文本识别使用了CRNN+CTCDecode(https://arxiv.org/pdf/2009.09941),网络结构:![alt text](Images/CRNN.png),sample中使用了动态shape (N,3,48,W),最大输入shape是[1,3,48,720],模型地址:Resource/Models/ppocrv5_server_rec_infer.onnx
## 预处理
### 检测模型预处理
检测模型输入数据预处理:
- 图片等比缩放,填充(沿着右、下填充)
- 图片归一化,减均值除方差
- transpose ,MigraphX的输入数据排布顺序为[N,C,H,W]
本示例代码主要采用了OpenCV实现了预处理操作:
```python
def preprocess(self, src_img,
               mean: tuple = (0.485, 0.456, 0.406),
               std: tuple = (0.229, 0.224, 0.225),
               scale: float = 1.0/255):
    """Preprocess a source image for the detection model.

    The image is resized via self.resize_image (aspect-preserving with
    right/bottom padding), normalized as (x*scale - mean)/std per channel,
    and transposed to NCHW.

    Args:
        src_img: HWC source image (numpy array).
        mean: per-channel mean. NOTE: defaults are immutable tuples now —
            the original used mutable list defaults (a Python anti-pattern);
            numpy broadcasting accepts either.
        std: per-channel standard deviation.
        scale: pixel scale factor, defaults to 1/255.
    Returns:
        dict with "image" (float32 NCHW tensor) and "shape"
        (np.array([src_h, src_w, ratio_h, ratio_w])).
    """
    data = dict()
    img = src_img.copy()
    src_h, src_w, _ = img.shape
    # Aspect-preserving resize so text regions are not distorted.
    res_img, [ratio_h, ratio_w] = self.resize_image(img)
    norm_img = (res_img * scale - mean) / std
    # HWC -> CHW
    image_data = norm_img.transpose(2, 0, 1)
    # CHW -> NCHW
    image_data = np.expand_dims(image_data, axis=0).astype(np.float32)
    image_data = np.ascontiguousarray(image_data)
    data["image"] = image_data
    data["shape"] = np.array([src_h, src_w, ratio_h, ratio_w])
    return data
def resize_image(self, img):
    """Resize img so its longer side fits self.db_input_size, snap the
    result to a multiple of 32 (DBNet requirement), and pad the remainder
    with zeros along the right/bottom.

    Args:
        img: HWC source image.
    Returns:
        (padded float32 image of self.db_input_size, [ratio_h, ratio_w]);
        (None, (None, None)) when the computed size degenerates to zero.
    Raises:
        ValueError: if cv2.resize fails.
    """
    h, w, _ = img.shape
    # Scale by the longer side so the image fits inside db_input_size.
    if h > w:
        ratio = float(self.db_input_size[1]) / h
    else:
        ratio = float(self.db_input_size[0]) / w
    resize_h = int(h * ratio)
    resize_w = int(w * ratio)
    # Snap to a multiple of 32 (minimum 32), as required by the network.
    resize_h = max(int(round(resize_h / 32) * 32), 32)
    resize_w = max(int(round(resize_w / 32) * 32), 32)
    try:
        if int(resize_w) <= 0 or int(resize_h) <= 0:
            return None, (None, None)
        img = cv2.resize(img, (int(resize_w), int(resize_h)))
    # BUGFIX: bare `except:` also swallowed KeyboardInterrupt/SystemExit.
    except Exception:
        print(img.shape, resize_w, resize_h)
        raise ValueError("resize error")
    ratio_h = resize_h / float(h)
    ratio_w = resize_w / float(w)
    # Pad with zeros along the right/bottom up to the fixed input size.
    im_pad = np.zeros((self.db_input_size[1], self.db_input_size[0], 3), np.float32)
    im_pad[:resize_h, :resize_w, :] = img
    return im_pad, [ratio_h, ratio_w]
```
### 字符识别模型预处理
字符识别模型输入数据预处理:
- 等比缩放,保留H维度的原始比例,填充(沿着右)
- 图片归一化,均值方差默认为0.5
- transpose ,MigraphX的输入数据排布顺序为[N,C,H,W]
```python
def preprocess(self, img, max_wh_ratio):
    """Preprocess one text crop for the recognition model.

    The crop is scaled so its height matches the model input height
    (aspect preserved, width capped at the model input width), normalized
    to [-1, 1], transposed to CHW and zero-padded on the right.

    Args:
        img: HWC text crop.
        max_wh_ratio: list; the height scale factor used for this crop is
            appended to it. NOTE(review): despite the name this is the
            height-scale ratio, not a width/height ratio — confirm callers.
    Returns:
        float32 CHW tensor of shape (3, imgH, imgW).
    Raises:
        TypeError: if max_wh_ratio is not a list.
    """
    if not isinstance(max_wh_ratio, list):
        raise TypeError("max_wh_ratio must be list")
    imgH, imgW = self.rec_input_size
    max_h, max_w = self.rec_input_size
    h, w = img.shape[:2]
    # Scale so the height becomes max_h, capping the width at max_w.
    # (The original if/else branches computed identical values — the `else`
    # path also set ratio = max_h/h and h = max_h — so they are collapsed.)
    ratio = max_h / h
    w = int(w * ratio)
    re_size = (min(w, max_w), max_h)
    max_wh_ratio.append(ratio)
    resized_image = cv2.resize(img, re_size)
    resized_image = resized_image.astype("float32")
    # Normalize: x/255 -> [0,1], then (x-0.5)/0.5 -> [-1,1]; HWC -> CHW.
    resized_image = resized_image.transpose((2, 0, 1)) / 255
    resized_image -= 0.5
    resized_image /= 0.5
    # Zero-pad along the right up to the model input width.
    padding_im = np.zeros((3, imgH, imgW), dtype=np.float32)
    padding_im[:, :, 0:re_size[0]] = resized_image
    return padding_im
```
## 类介绍
PPOcrV5 封装了对外提供的API,TextDetector为文本检测类,TextRecgnizer为文本识别类,BaseRecLabelDecode,实现模型输出的索引序列和实际文本标签之间进行转换,CTCLabelDecode继承BaseRecLabelDecode,实现文本识别模型的输出解码将模型输出的概率值转换为字符并连接成句子。
```python
class PPOcrV5():
def __init__(self,
det_model_path:str,
rec_model_path:str,
char_dict_path:str = "../Resource/ppocr_keys_v5.txt",
db_input_size :list = (640,640),
rec_input_size :list = (48,720),
seg_thresh:float=0.3,
box_thresh:float=0.7,
precision_mode:str='fp32',
offload_copy:bool=True,
**kwargs
)
"""Ocr检测识别推理初始化
字符检测、字符编码、识别。
Args:
det_model_path :字符检测模型路径
rec_model_path : 字符分割模型路径。
char_dict_path :字符集路径
db_input_size :检测模型输入size
rec_input_size :识别模型输入size
seg_thresh :像素分割阈值
box_thresh :字符区域box阈值
precision_mode :精度模式。可选 fp32、fp16
offload_copy : 数据拷贝模式 ,支持两种数据拷贝方式:offload_copy=true、offload_copy=false。当offload_copy为true时,不需要进行内存拷贝,如果为false,需要先预分配输入输出的设备内存,并在推理前,将预处理数据拷贝到设备内存,推理后将模型输出从设备内存中拷贝出来
**kwargs :设置字符检测模型后处理相关参数
Returns:
return_type: NONE。
Examples:
det_onnx_path = "PATH/TO/det_onnx_model.onnx"
rec_onnx_path = "PATH/TO/rec_onnx_model.onnx"
image_path = "PATH/TO/test.png"
ppocrv5 = PPOcrV5(det_onnx_path,rec_onnx_path,offload_copy=True)
"""
class TextDetector(object):
def __init__(
self,
det_model_path,
db_input_size=(640,640),
thresh=0.3,
box_thresh=0.7,
max_candidates=1000,
unclip_ratio=2.0,
use_dilation=False,
score_mode="fast",
box_type="quad",
precision_mode="float32",
**kwargs,
)
"""字符检测模型初始化
字符检测(dbnet)。
Args:
det_model_path :字符检测模型路径。
db_input_size :检测模型输入size
thresh :像素分割阈值
box_thresh :字符区域box阈值
max_candidates : 字符最大候选数
unclip_ratio :polygon 扩散比例
precision_mode :精度模式。可选 "fp16","int8","float32"
use_dilation : 是否对二值图进行膨胀处理
score_mode :评分模式。
box_type :box类型,可选矩形和多边形,这里默认为矩形
offload_copy : 数据拷贝模式 ,支持两种数据拷贝方式:offload_copy=true、offload_copy=false。当offload_copy为true时,不需要进行内存拷贝,如果为false,需要先预分配输入输出的设备内存,并在推理前,将预处理数据拷贝到设备内存,推理后将模型输出从设备内存中拷贝出来
**kwargs :设置字符检测模型后处理相关参数
Returns:
return_type: NONE。
Examples:
self.db_detector = TextDetector(
det_model_path,
db_input_size,
thresh=self.seg_thres,
box_thresh=self.box_thresh,
max_candidates=self.max_candidates,
unclip_ratio=self.unclip_ratio,
box_type=self.box_type,
use_dilation=self.use_dilation,
score_mode=self.score_mode,
precision_mode=precision_mode,
offload_copy=offload_copy
"""
class TextRecgnizer(object):
"""Support SVTR_LCNet """
def __init__(
self,
rec_model_path,
rec_batch_num=2,
rec_input_size=(48, 480),#hw
rec_algorithm="SVTR_LCNet",
precision_mode = "fp32",
**kwargs
)
"""字符识别模型初始化
字符识别(crnn+ctc)。
Args:
rec_model_path :字符识别模型路径。
rec_batch_num :模型推理batch size
rec_input_size :模型推理的最大size
rec_algorithm : 后处理算法类型
unclip_ratio :polygon 扩散比例
precision_mode :精度模式。可选 "fp16","float32"
**kwargs :设置字符识别模型后处理相关参数
Returns:
return_type: NONE。
Examples:
self.text_extractor = TextRecgnizer(rec_model_path=rec_model_path,
rec_input_size=rec_input_size,
precision_mode=precision_mode,
offload_copy=offload_copy)
"""
class BaseRecLabelDecode(object):
def __init__(self, character_dict_path=None,
use_space_char=False)
"""Convert between text-label and text-index
字符识别(crnn+ctc)。
Args:
character_dict_path :字符集文件路径。
use_space_char :字符集中是否包含空格。
Returns:
return_type: NONE。
Examples:
"""
class CTCLabelDecode(BaseRecLabelDecode):
def __init__(self, character_dict_path=None, use_space_char=False, **kwargs):
super(CTCLabelDecode, self).__init__(character_dict_path, use_space_char)
"""Convert between text-label and text-index
字符识别(crnn+ctc)。
Args:
character_dict_path :字符集文件路径。
use_space_char :字符集中是否包含空格。
Returns:
return_type: NONE。
Examples:
"""
```
## 推理
### 字符检测模型推理
```python
def __call__(self, src_img):
    # Run detection: preprocess, infer with MIGraphX, then extract boxes
    # from the predicted probability map.
    data = self.preprocess(src_img)
    """支持两种数据拷贝方式:offload_copy=true、offload_copy=false。当offload_copy为true时,不需要进行内存拷贝,如果为false,需要先预分配输入输出的设备内存,并在推理前,将预处理数据拷贝到设备内存,推理后将模型输出从设备内存中拷贝出来,在做后处理。"""
    if self.offload_copy==False:
        # Explicit-copy mode: stage the input tensor into GPU memory first.
        self.d_mem[self.det_input_name] =migraphx.to_gpu(migraphx.argument(data["image"]))
        results = self.db_model.run(self.d_mem)
    else:
        results = self.db_model.run({self.det_input_name:data["image"]})
    if self.offload_copy==False :
        # Copy the inference result back from GPU to CPU.
        result=migraphx.from_gpu(results[0])
        print("offload copy model")
        result = np.array(result)
    else:
        result = results[0]
    shape_list = np.expand_dims(data["shape"], axis=0)
    pred = np.array(result)
    pred = pred[:, 0, :, :]
    # Keep only probabilities above the segmentation threshold.
    segmentation = pred > self.thresh
    boxes_batch = []
    for batch_index in range(pred.shape[0]):
        src_h, src_w, ratio_h, ratio_w = shape_list[batch_index]
        if self.dilation_kernel is not None:
            mask = cv2.dilate(
                np.array(segmentation[batch_index]).astype(np.uint8),
                self.dilation_kernel,
            )
        else:
            mask = segmentation[batch_index]
        # Extract text regions from the predicted bitmap.
        if self.box_type == "poly":
            boxes, scores = self.polygons_from_bitmap(
                pred[batch_index], mask, ratio_w,ratio_h, src_w, src_h
            )
        elif self.box_type == "quad":
            boxes, scores = self.boxes_from_bitmap(
                pred[batch_index], mask, ratio_w,ratio_h, src_w, src_h
            )
        else:
            raise ValueError("box_type can only be one of ['quad', 'poly']")
        boxes_batch.append(boxes)
    # Sort text regions top-to-bottom, left-to-right.
    det_box_batch = self.sorted_boxes(boxes_batch)
    # Map region coordinates back onto the original image.
    dt_boxes,det_rects = self.box_standardization(det_box_batch,shape_list)
    return dt_boxes,det_rects
```
### 字符识别推理
```python
def __call__(self, batch_img_list):
    # Runs recognition over a list of lists of text crops, batching
    # rec_batch_num crops per inference call.
    if len(batch_img_list) == 0:
        return []
    width_list = []
    # Collect aspect ratios of all crops so they can be processed in
    # ascending-width order; crops of each batch are concatenated below via
    # np.concatenate(batch_norm_imgs) to support multi-batch inference.
    # NOTE(review): `indices` is computed over ALL lists flattened but is
    # indexed per-list below — verify this is correct when
    # len(batch_img_list) > 1.
    for b in range(len(batch_img_list)):
        for img in batch_img_list[b]:
            width_list.append(img.shape[1] / float(img.shape[0]))
    indices = np.argsort(np.array(width_list))
    input_batch = self.rec_batch_num
    batch_outputs_pre = []
    batch_max_wh_ratio_pre = []
    for b in range(len(batch_img_list)):
        im_count = len(batch_img_list[b])
        batch_outputs = []
        batch_max_wh_ratio = []
        for beg_img_no in range(0, im_count, input_batch):
            end_img_no = min(im_count, beg_img_no + input_batch)
            # for ino in range(beg_img_no, end_img_no):
            #     h, w = batch_img_list[b][indices[ino]].shape[0:2]
            #     wh_ratio = w * 1.0 / h
            #     max_wh_ratio = max(max_wh_ratio, wh_ratio)
            batch_norm_imgs = []
            max_wh_ratio = list()
            # Preprocess each crop of this batch.
            for ino in range(beg_img_no, end_img_no):
                # Single-image preprocessing (resize/normalize/pad).
                norm_img = self.preprocess(batch_img_list[b][indices[ino]], max_wh_ratio)
                norm_img = norm_img[np.newaxis, :].astype(np.float32)
                batch_norm_imgs.append(norm_img)
            batch_max_wh_ratio.append(max_wh_ratio)
            # Concatenate the batch_size preprocessed crops into one tensor.
            # NOTE(review): the rec_batch_num == 1 branch wraps the list of
            # (1,3,H,W) arrays in another np.array, yielding a 5-D tensor —
            # np.concatenate would give the expected (1,3,H,W); confirm.
            if self.rec_batch_num >1:
                norm_img_batch = np.concatenate(batch_norm_imgs)
                norm_img_batch = norm_img_batch.copy()
            else:
                norm_img_batch = np.array([batch_norm_imgs.copy()])
            if self.offload_copy==False:
                print("offload copy model")
                self.d_mem[self.rec_input_name] =migraphx.to_gpu(migraphx.argument(norm_img_batch))
                results = self.rec_model.run(self.d_mem)
                output = np.array(results[0])
            else:
                results = self.rec_model.run({self.rec_input_name:norm_img_batch})
                output = results[0]
            # batch_outputs.append(np.array(output))
            # Append every per-image output so postprocessing sees a flat list.
            [batch_outputs.append(out) for out in np.array(output)]
        batch_outputs_pre.append(np.array(batch_outputs))
        batch_max_wh_ratio_pre.append(batch_max_wh_ratio)
    return batch_outputs_pre ,batch_max_wh_ratio_pre
```
# Ocrv5 API调用说明
API调用步骤如下:
- 类实例化
- 识别接口调用
例:
```python
if __name__ == '__main__':
    # Model and resource paths used by the sample run.
    det_onnx_path = "../Resource/Models/ppocrv5_server_det_infer.onnx"
    rec_onnx_path = "../Resource/Models/ppocrv5_server_rec_infer.onnx"
    image_path = "../Resource/Images/lite_demo.png"
    img = cv2.imread(image_path)
    # Build the engine (offload-copy mode, fp32 precision), run the full
    # detect + recognize pipeline, and save the visualized result.
    ppocrv5 = PPOcrV5(det_onnx_path,rec_onnx_path,offload_copy=True,precision_mode="fp32")
    res_img = ppocrv5(img)
    cv2.imwrite("res.jpg",res_img)
```
sample支持两种精度推理(fp32和fp16,默认是fp32),精度和内存拷贝方式分别通过precision_mode和offload_copy参数控制。
\ No newline at end of file
This diff is collapsed.
opencv-python
numpy
shapely
pyclipper
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment