Commit 22b7c574 authored by liuhy

paddleOcr v5
#! /bin/sh
############### Ubuntu ###############
# Reference: https://docs.opencv.org/3.4.11/d7/d9f/tutorial_linux_install.html
apt-get install build-essential -y
apt-get install cmake git libgtk2.0-dev pkg-config libavcodec-dev libavformat-dev libswscale-dev -y
apt-get install python-dev python-numpy libtbb2 libtbb-dev libjpeg-dev libpng-dev libtiff-dev libjasper-dev libdc1394-22-dev -y # packages needed for image processing (optional)
############### CentOS ###############
# yum install gcc gcc-c++ gtk2-devel gimp-devel gimp-devel-tools gimp-help-browser zlib-devel libtiff-devel libjpeg-devel libpng-devel gstreamer-devel libavc1394-devel libraw1394-devel libdc1394-devel jasper-devel jasper-utils swig python libtool nasm -y
############################ Online dependency install ###############################
#cd ./3rdParty
#pip install rbuild-master.tar.gz
############################ Offline dependency install ###############################
# Install dependencies
cd ./3rdParty/rbuild_depend
# pip install click-6.6-py2.py3-none-any.whl
# pip install six-1.15.0-py2.py3-none-any.whl
pip install subprocess32-3.5.4.tar.gz
pip install cget-0.1.9.tar.gz
# pip install shapely-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
# pip install pyclipper-1.3.0.post6-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
# Install rbuild
cd ../
pip install rbuild-master.tar.gz
comment=e8d4259f9ab787b512b9aa1203fc816fb9f19231
# Minimum required CMake version
cmake_minimum_required(VERSION 3.5)
# Project name
project(ppOcrV5)
# Compiler settings
set(CMAKE_CXX_COMPILER hipcc)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
set(CMAKE_BUILD_TYPE Release)
set(INCLUDE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/Src/
$ENV{DTKROOT}/include/
${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility
${CMAKE_CURRENT_SOURCE_DIR}/3rdParty/opencv/include)
include_directories(${INCLUDE_PATH})
# Library search paths
set(LIBRARY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/3rdParty/opencv/lib
$ENV{DTKROOT}/lib/)
link_directories(${LIBRARY_PATH})
# Libraries to link
set(LIBRARY opencv_core
opencv_imgproc
opencv_imgcodecs
opencv_dnn
migraphx
migraphx_gpu
migraphx_onnx)
link_libraries(${LIBRARY})
set(SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/Src/main.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility/Filesystem.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility/clipper.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/ocr_engine.cpp)
# Executable target
add_executable(ppOcrV5 ${SOURCE_FILES})
# Overview
PP-OCRv5 is the latest generation of the PP-OCR text recognition solution, focused on multi-scenario, multi-script text recognition. It supports five major script types: Simplified Chinese, Chinese pinyin, Traditional Chinese, English, and Japanese, and improves recognition in challenging scenarios such as complex Chinese/English handwriting, vertical text, and rare characters. On an internal multi-scenario evaluation set, PP-OCRv5 improves end-to-end accuracy over PP-OCRv4 by 13 percentage points. This sample adapts the PP-OCRv5 text detection and recognition models and runs inference through the MIGraphX C++ API.
## Model overview
### Text detection
Text detection uses DBNet (paper: https://arxiv.org/pdf/1911.08947), network structure: ![alt text](Images/DBNet.png). The model outputs a probability map, and the Vatti clipping algorithm is used to simplify and expand the character-region polygons; the sample relies on the Clipper library for this. The sample uses a dynamic shape (N,3,H,W) with a maximum input shape of [1,3,640,640]. Model path: Resource/Models/ppocrv5_server_det_infer.onnx
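To make the box-expansion step concrete, here is a minimal Python sketch of the unclip operation performed in DBNet postprocessing, using the shapely and pyclipper packages listed in this sample's Python requirements; the rule `distance = area * unclip_ratio / perimeter` mirrors the sample's unclip logic, while the box values are purely illustrative:
```python
import numpy as np
import pyclipper
from shapely.geometry import Polygon

def unclip(box, unclip_ratio=2.0):
    # Expand a text polygon outward with the Vatti clipping algorithm;
    # offset distance = area * unclip_ratio / perimeter, as in DBNet.
    poly = Polygon(box)
    distance = poly.area * unclip_ratio / poly.length
    offset = pyclipper.PyclipperOffset()
    offset.AddPath(box.tolist(), pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
    return offset.Execute(distance)  # list of expanded polygons

# A 10x4 text box grows outward on every side:
box = np.array([[0, 0], [10, 0], [10, 4], [0, 4]])
print(unclip(box))
```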
### Text recognition
Text recognition uses CRNN + CTC decoding (https://arxiv.org/pdf/2009.09941), network structure: ![alt text](Images/CRNN.png). The sample uses a dynamic shape (N,3,48,W) with a maximum input shape of [1,3,48,720]. Model path: Resource/Models/ppocrv5_server_rec_infer.onnx
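For reference, greedy CTC decoding (take the argmax per timestep, merge consecutive duplicates, drop the blank token) can be sketched in a few lines of numpy; the blank index 0 matches the sample's `get_ignored_tokens`, everything else here is illustrative:
```python
import numpy as np

def ctc_greedy_decode(logits, charset, blank=0):
    # logits: (T, C) per-timestep class scores; charset[i] maps index i to a character.
    ids = logits.argmax(axis=1)
    keep = np.ones(len(ids), dtype=bool)
    keep[1:] = ids[1:] != ids[:-1]   # collapse consecutive repeated predictions
    keep &= ids != blank             # drop the CTC blank token
    return "".join(charset[i] for i in ids[keep])

charset = ["<blank>", "a", "b", "c"]
logits = np.eye(4)[[1, 1, 0, 2, 2, 0, 3]]   # "aa_bb_c"
print(ctc_greedy_decode(logits, charset))   # prints "abc"
```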
## Preprocessing
### Detection model preprocessing
Input preprocessing for the detection model:
- Aspect-preserving resize, with padding along the right and bottom edges
- Normalization: subtract the per-channel mean and divide by the standard deviation
- Transpose: MIGraphX expects input data in [N,C,H,W] order
```c++
cv::Size OcrDet::preproc(cv::Mat img, float* data)
{
    float scale = 1.0 / 255.0;
    std::vector<float> s_mean = {0.485, 0.456, 0.406};
    std::vector<float> s_stdv = {0.229, 0.224, 0.225};
    if (img.empty())
    {
        std::cout << "Source image is empty!\n";
        return cv::Size(1.0, 1.0);
    }
    cv::Mat res_img;
    cv::Size scale_r;
    scale_r.width  = float(net_input_width)  / float(img.cols);
    scale_r.height = float(net_input_height) / float(img.rows);
    // Resize to the network input size
    cv::resize(img, res_img, cv::Size(net_input_width, net_input_height));
    int iw = res_img.cols;
    int ih = res_img.rows;
    memset(data, 0, 3 * iw * ih * sizeof(float));
    // HWC -> CHW, with per-channel mean/std normalization
    for (int i = 0; i < net_input_height; i++)
    {
        for (int j = 0; j < net_input_width; j++)
        {
            data[i*net_input_width + j + 2*net_input_height*net_input_width] = (float(res_img.at<cv::Vec3b>(i, j)[2])*scale - s_mean[2]) / s_stdv[2];
            data[i*net_input_width + j +   net_input_height*net_input_width] = (float(res_img.at<cv::Vec3b>(i, j)[1])*scale - s_mean[1]) / s_stdv[1];
            data[i*net_input_width + j]                                      = (float(res_img.at<cv::Vec3b>(i, j)[0])*scale - s_mean[0]) / s_stdv[0];
        }
    }
    return scale_r;
}
```
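The same normalization arithmetic, as a hedged numpy equivalent (assuming an (H, W, 3) uint8 image in the channel order produced by cv::imread, with no channel swap, as in the sample):
```python
import numpy as np

mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
std = np.array([0.229, 0.224, 0.225], dtype=np.float32)

def normalize_chw(img):
    # img: (H, W, 3) uint8, channel order kept as loaded.
    x = img.astype(np.float32) / 255.0       # scale to [0, 1]
    x = (x - mean) / std                     # per-channel mean/std normalization
    return x.transpose(2, 0, 1)[np.newaxis]  # HWC -> NCHW, add batch dim

x = normalize_chw(np.zeros((640, 640, 3), np.uint8))
print(x.shape)  # (1, 3, 640, 640)
```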
### Recognition model preprocessing
Input preprocessing for the recognition model:
- Aspect-preserving resize that keeps the original H proportion, with padding along the right and bottom edges
- Normalization, with mean and standard deviation both defaulting to 0.5
- Transpose: MIGraphX expects input data in [N,C,H,W] order
```c++
bool CTCDecode::preproc(cv::Mat img, float* data, int img_w, int img_h)
{
    if (img.empty())
    {
        std::cout << "WARNING image is empty!\n";
        return false;
    }
    float scale = 1.0 / 255.;
    int iw = img.cols;
    int ih = img.rows;
    // Aspect-preserving resize that keeps the original H proportion
    float ratio = std::min(img_h * 1.0 / ih, img_w * 1.0 / iw);
    int nw = static_cast<int>(iw * ratio);
    int nh = img_h;
    cv::Mat res_mat;
    cv::resize(img, res_mat, cv::Size(nw, nh));
    cv::Mat template_mat = cv::Mat(img_h, img_w, CV_8UC3, cv::Scalar(0, 0, 0));
    // Pad along the right and bottom edges
    int xdet = img_w - nw;
    int ydet = img_h - nh;
    cv::copyMakeBorder(res_mat, template_mat, 0, ydet, 0, xdet, cv::BORDER_CONSTANT);
    memset(data, 0, this->batch_size * 3 * img_w * img_h * sizeof(float));
    for (int b = 0; b < this->batch_size; b++)
    {
        float* dst = data + b * 3 * img_h * img_w; // per-image offset within the batch
        for (int i = 0; i < img_h; i++)
        {
            for (int j = 0; j < img_w; j++)
            {
                dst[i*img_w + j]                 = (template_mat.at<cv::Vec3b>(i, j)[2]*scale - 0.5) / 0.5;
                dst[i*img_w + j +   img_h*img_w] = (template_mat.at<cv::Vec3b>(i, j)[1]*scale - 0.5) / 0.5;
                dst[i*img_w + j + 2*img_h*img_w] = (template_mat.at<cv::Vec3b>(i, j)[0]*scale - 0.5) / 0.5;
            }
        }
    }
    return true;
}
```
## Class overview
ppOcrEngine wraps the public API; OcrDet is the text detection class and CTCDecode is the text recognition class. Detection and recognition live in ppOcrEngine as two smart-pointer members. In forward, text_detector first locates all character regions in the image, and each detected region is then passed to text_recognizer to read its content.
```c++
class ppOcrEngine {
private:
std::shared_ptr<OcrDet> text_detector;
std::shared_ptr<CTCDecode> text_recognizer;
public:
ppOcrEngine(const std::string &det_model_path,
const std::string &rec_model_path,
const std::string &character_dict_path,
const float segm_thres=0.3,
const float box_thresh=0.7,
bool offload_copy =true,
std::string precision_mode = "fp32") ;
    /**
     * @brief OCR engine initialization
     * @param det_model_path text detection model path
     * @param rec_model_path recognition model path
     * @param character_dict_path character dictionary path
     * @param segm_thres pixel segmentation threshold
     * @param box_thresh character-region box threshold
     * @param offload_copy memory copy mode. Two modes are supported: offload_copy=true and
     * offload_copy=false. With true, no explicit memory copies are needed; with false, device
     * memory for inputs and outputs must be pre-allocated, the preprocessed data copied to the
     * device before inference, and the model output copied back from device memory afterwards
     * (both modes are illustrated in the sketch after this code block)
     * @param precision_mode precision mode, supports fp32 and fp16
     *
     * @return NONE
     */
~ppOcrEngine();
std::vector<std::string> forward(cv::Mat &srcimg);
};
class CTCDecode
{
private:
//inference image
float* data;
std::unordered_map<std::string, migraphx::argument> device_data;
migraphx::program net;
int batch_size;
int net_input_width;
int net_input_height;
int net_input_channel;
bool offload_copy;
std::string precision_mode;
std::unordered_map<std::string, migraphx::argument> dev_argument;
void* input_buffer_device;
void* output_buffer_device;
void* output_buffer_host;
migraphx::shape input_shape;
migraphx::shape output_shape;
std::string input_name;
std::string output_name;
    //postprocess: n_channel = number of model output channels, feature_size = feature map size per channel
int n_channel;
int feature_size;
std::vector<std::string> k_words;
public:
CTCDecode(std::string rec_model_path,
std::string precision_mode="fp32",
int image_width=480,
int image_height=48,
int channel=3,
int batch_size = 1,
bool offload_copy = true,
std::string character_dict_path="./ppocr_keys_v5.txt");
~CTCDecode();
    /**
     * @brief Character recognition: predicts at most 90 characters per line over an 18385-character dictionary
     */
std::string forward(cv::Mat& img);
private:
    /**
     * @brief Preprocessing
     * pixel = (src_img*scale-0.5)/0.5;
     * scale = 1.0/255
     * @param img character image
     * @param data preprocessing output
     * @param img_w model input width
     * @param img_h model input height
     * @return true on success, false on failure
     */
bool preproc(cv::Mat img,float* data,int img_w=480,int img_h=48);
    /**
     * @brief Model postprocessing: take the most probable character per row and assemble a sentence of at most 90 characters; the model output shape is [1,90,18385]
     * @param feature model output
     * @return text on success, "" on failure
     */
std::string postprocess(float* feature);
    /**
     * @brief Decoding: map the model predictions onto the character set
     * @param probs maximum predicted probabilities
     * @param indexs indices of the maximum predicted probabilities
     * @param mean_prob mean probability of the predicted sentence
     * @return text on success, "" on failure
     */
std::string decode(std::vector<float>& probs,std::vector<int>& indexs,float& mean_prob);
};
class OcrDet
{
private:
std::string precision_mode;
bool offload_copy;
migraphx::program net;
migraphx::shape input_shape;
migraphx::shape output_shape;
std::string input_name;
std::string output_name;
int det_batch_size;
int data_size ;
float segm_thres;
float box_thres;
int net_input_width;
int net_input_height;
int net_input_channel;
float* data;
    //Device and host buffers, pre-allocated when offload_copy is false
std::unordered_map<std::string, migraphx::argument> dev_argument;
void* input_buffer_device;
void* output_buffer_device;
void* output_buffer_host;
//postprocess
int n_channel;
int feature_size; //single channel feature map size.
int output_width;
int output_height;
    int max_candidates; //maximum number of candidate contours
public:
OcrDet(std::string det_model_path,
std::string precision_mode="float32",
bool offload_copy = true,
float segm_thres = 0.3,
float box_thresh = 0.7);
~OcrDet();
bool forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes);
private:
    /**
     * @brief Preprocessing
     * pixel = (scale*src_img - mean)/std;
     * scale = 1.0/255
     * mean = [0.485, 0.456, 0.406]
     * std = [0.229, 0.224, 0.225]
     * @param img character image
     * @param data preprocessing output
     * @return the w and h scaling ratios on success
     */
cv::Size preproc(cv::Mat img,float* data);
    /**
     * @brief Postprocessing: extract text regions from the predicted binary map
     * @param feature model output tensor (detection uses DBNet here)
     * @param boxes character-region coordinates
     * @return 0 on success, -1 on failure
     */
int postprocess(float* feature, std::vector<std::vector<std::vector<int>>> &boxes);
int boxes_from_bitmap(cv::Mat& bit_map,std::vector<T_BOX>& box);
std::vector<std::vector<std::vector<int>>>boxes_from_bitmap(
const cv::Mat pred, const cv::Mat bitmap, const float &box_thresh,
const float &det_db_unclip_ratio, const bool &use_polygon_score);
std::vector<std::vector<float>> Mat2Vector(cv::Mat mat);
    /**
     * @brief Compute the mean score over a polygonal region
     * @param contour contour points of the character region
     * @param pred predicted binary map
     * @return score
     */
float polygon_score_acc(std::vector<cv::Point> contour,cv::Mat pred);
    /**
     * @brief Expand (or shrink) the predicted region by unclip_ratio to find a better-fitting character region
     * @param box character-region coordinates
     * @param unclip_ratio expansion ratio
     * @return the processed character region
     */
cv::RotatedRect unClip(std::vector<std::vector<float>> box,
const float &unclip_ratio);
    /**
     * @brief Compute the offset distance
     * distance = area * unclip_ratio / dist;
     * area = ∑(x_i*y_{i+1} - x_{i+1}*y_i)
     * dist = sqrtf(dx * dx + dy * dy)
     *
     * @param box character-region coordinates
     * @param unclip_ratio expansion ratio
     * @param distance offset distance
     * @return NONE
     */
void get_contour_area(const std::vector<std::vector<float>> &box,
float unclip_ratio, float &distance) ;
    /**
     * @brief Filter out invalid character regions: first map the boxes back to the original image, then drop invalid regions
     * @param boxes character-region coordinates
     * @param ratio_h vertical scaling ratio
     * @param ratio_w horizontal scaling ratio
     * @param srcimg original image
     *
     * @return valid character-region coordinates
     */
std::vector<std::vector<std::vector<int>>> filter_det_res(std::vector<std::vector<std::vector<int>>> boxes,
float ratio_h, float ratio_w, cv::Mat srcimg);
    /**
     * @brief Sort character regions top-to-bottom, left-to-right
     * @param pts character-region coordinates
     *
     * @return valid character-region coordinates
     */
std::vector<std::vector<int>> order_points_clockwise(std::vector<std::vector<int>> pts);
    /**
     * @brief Get the minimum bounding rectangle coordinates
     * @param box coordinates of the region's minimum bounding rectangle
     * @param ssid the longest side of the box
     * @return valid character-region coordinates
     */
std::vector<std::vector<float>> get_mini_boxes(cv::RotatedRect box,float &ssid) ;
    /**
     * @brief Compute the mean score of the t_rect region on the bitmap
     * @param box_array predicted character region
     * @param pred predicted binary map
     * @return score
     */
float box_score_fast(std::vector<std::vector<float>> box_array,cv::Mat pred) ;
void visualize_boxes(const cv::Mat &srcimg,
const std::vector<std::vector<std::vector<int>>> &boxes) ;
bool text_recognition(const cv::Mat &srcimg,
const std::vector<std::vector<std::vector<int>>> &boxes);
};
```
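The offload_copy parameter documented above is easiest to see end to end with the MIGraphX Python API, which the Python version of this sample uses; a hedged sketch of both modes, where the model path and the input name "x" are illustrative (they match this sample's models):
```python
import numpy as np
import migraphx

offload_copy = True  # or False
model = migraphx.parse_onnx("det.onnx", map_input_dims={"x": [1, 3, 640, 640]})
model.compile(t=migraphx.get_target("gpu"), offload_copy=offload_copy)
x = np.zeros((1, 3, 640, 640), dtype=np.float32)

if offload_copy:
    # MIGraphX moves data between host and device automatically.
    out = np.array(model.run({"x": x})[0])
else:
    # Pre-allocate device output buffers, copy the input in and the result out.
    bufs = {k: migraphx.allocate_gpu(s=v) for k, v in model.get_outputs().items()}
    bufs["x"] = migraphx.to_gpu(migraphx.argument(x))
    out = np.array(migraphx.from_gpu(model.run(bufs)[0]))
```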
## Inference
### Detection model inference
```c++
bool OcrDet::forward(cv::Mat& img,std::vector<std::vector<std::vector<int>>>& text_roi_boxes)
{
std::vector<std::vector<std::vector<int>>> boxes;
    //Input preprocessing
    cv::Size ratio = preproc(img,data);
    /*
    Two copy modes are supported: offload_copy=true and offload_copy=false. With true, no explicit
    memory copies are needed; with false, device memory for inputs and outputs must be pre-allocated,
    the preprocessed data copied to the device before inference, and the model output copied back
    from device memory afterwards for postprocessing.
    */
if( this->offload_copy ==false )
{
hipMemcpy(input_buffer_device,
(void*)data,
this->input_shape.bytes(),
hipMemcpyHostToDevice);
std::vector<migraphx::argument> results = net.eval(dev_argument);
hipMemcpy(output_buffer_host,
(void*)output_buffer_device,
output_shape.bytes(),
hipMemcpyDeviceToHost);
postprocess((float *)output_buffer_host,boxes);
std::cout<<"copy mode ..."<<std::endl;
}else{
std::unordered_map<std::string, migraphx::argument> inputData;
inputData[input_name] = migraphx::argument{input_shape, (float *)data};
std::vector<migraphx::argument> results = net.eval(inputData);
migraphx::argument result = results[0] ; //get output data
postprocess((float *)result.data(),boxes);
std::cout<<"offload copy mode ..."<<std::endl;
}
    //Compute the width/height scaling ratios
    float ratio_w = float(net_input_width) / float(img.cols);
    float ratio_h = float(net_input_height) / float(img.rows);
    //Filter out invalid boxes
    text_roi_boxes = filter_det_res(boxes, ratio_h, ratio_w, img);
    //Visualize the detection results
visualize_boxes(img,text_roi_boxes);
// TextRecognition(img,boxes);
return true;
}
```
### Recognition model inference
```c++
std::string CTCDecode::forward(cv::Mat& img)
{
    //Preprocessing
    preproc(img,data,net_input_width,net_input_height);
    /*
    Two copy modes are supported: offload_copy=true and offload_copy=false. With true, no explicit
    memory copies are needed; with false, device memory for inputs and outputs must be pre-allocated,
    the preprocessed data copied to the device before inference, and the model output copied back
    from device memory afterwards for postprocessing.
    */
if( this->offload_copy ==false )
{
hipMemcpy(input_buffer_device,
(void*)data,
this->input_shape.bytes(),
hipMemcpyHostToDevice);
std::vector<migraphx::argument> results = net.eval(dev_argument);
hipMemcpy(output_buffer_host,
(void*)output_buffer_device,
output_shape.bytes(),
hipMemcpyDeviceToHost);
        //Postprocess: take the max probability and index per step, look up the characters in the dictionary, and assemble the sentence
        std::string text = postprocess((float *)output_buffer_host);
return text;
}else{
std::unordered_map<std::string, migraphx::argument> inputData;
inputData[input_name] = migraphx::argument{input_shape, (float *)data};
std::vector<migraphx::argument> results = net.eval(inputData);
migraphx::argument result = results[0] ;
std::string text = postprocess((float *)result.data());
// std::cout<<"ctc: offload copy mode ..."<<std::endl;
return text;
}
}
```
# OCRv5 API usage
The API is used in two steps:
- Instantiate the class
- Call the recognition interface
Example:
```c++
int main(int argc, char** argv)
{
std::string det_model_onnx = "../Resource/Models/ppocrv5_server_det_infer.onnx";
std::string rec_model_onnx = "../Resource/Models/ppocrv5_server_rec_infer.onnx";
std::string img_path = "../Resource/Images/20250703205038.png";
std::string character_dict_path = "../Resource/ppocr_keys_v5.txt";
float segm_thres=0.3;
float box_thresh=0.3;
ppOcrEngine ocr_engine(det_model_onnx,
rec_model_onnx,
character_dict_path,
segm_thres,
box_thresh,
true,
"fp32");
cv::Mat img=cv::imread(img_path);
ocr_engine.forward(img);
return 0;
}
```
The sample supports two inference precisions (fp32 and fp16, defaulting to fp32); precision and the memory copy mode are set through the ocr_engine constructor parameters.
# Overview
PP-OCRv5 is the latest generation of the PP-OCR text recognition solution, focused on multi-scenario, multi-script text recognition. It supports five major script types: Simplified Chinese, Chinese pinyin, Traditional Chinese, English, and Japanese, and improves recognition in challenging scenarios such as complex Chinese/English handwriting, vertical text, and rare characters. On an internal multi-scenario evaluation set, PP-OCRv5 improves end-to-end accuracy over PP-OCRv4 by 13 percentage points. This sample adapts the PP-OCRv5 text detection and recognition models and runs inference through the MIGraphX 5.0 Python API.
## Model overview
### Text detection
Text detection uses DBNet (paper: https://arxiv.org/pdf/1911.08947), network structure: ![alt text](Images/DBNet.png). The model outputs a probability map, and the Vatti clipping algorithm is used to simplify and expand the character-region polygons. The sample uses a dynamic shape (N,3,H,W) with a maximum input shape of [1,3,640,640]. Model path: Resource/Models/ppocrv5_server_det_infer.onnx
### Text recognition
Text recognition uses CRNN + CTC decoding (https://arxiv.org/pdf/2009.09941), network structure: ![alt text](Images/CRNN.png). The sample uses a dynamic shape (N,3,48,W) with a maximum input shape of [1,3,48,720]. Model path: Resource/Models/ppocrv5_server_rec_infer.onnx
## Preprocessing
### Detection model preprocessing
Input preprocessing for the detection model:
- Aspect-preserving resize, with padding along the right and bottom edges
- Normalization: subtract the per-channel mean and divide by the standard deviation
- Transpose: MIGraphX expects input data in [N,C,H,W] order
The preprocessing in this sample is implemented mainly with OpenCV:
```python
def preprocess(self, src_img,
mean: list = [0.485, 0.456, 0.406],
std: list = [0.229, 0.224, 0.225],
scale: float = 1.0/255):
data = dict()
img = src_img.copy()
src_h, src_w, _ = img.shape
    #Aspect-preserving resize so character regions are not distorted
    res_img, [ratio_h, ratio_w] = self.resize_image(img)
    norm_img = (res_img * scale - mean) / std
    #HWC->CHW
    image_data = norm_img.transpose(2, 0, 1)
    #CHW->NCHW (add the batch dimension)
image_data = np.expand_dims(image_data, axis=0).astype(np.float32)
image_data = np.ascontiguousarray(image_data)
data["image"] = image_data
data["shape"] = np.array([src_h, src_w, ratio_h, ratio_w])
return data
def resize_image(self, img):
h, w, _ = img.shape
if h > w:
ratio = float(self.db_input_size[1]) / h
else:
ratio = float(self.db_input_size[0]) / w
resize_h = int(h * ratio)
resize_w = int(w * ratio)
resize_h = max(int(round(resize_h / 32) * 32), 32)
resize_w = max(int(round(resize_w / 32) * 32), 32)
try:
if int(resize_w) <= 0 or int(resize_h) <= 0:
return None, (None, None)
img = cv2.resize(img, (int(resize_w), int(resize_h)))
except:
print(img.shape, resize_w, resize_h)
raise ValueError("resize error")
ratio_h = resize_h / float(h)
ratio_w = resize_w / float(w)
im_pad = np.zeros((self.db_input_size[1], self.db_input_size[0], 3), np.float32)
im_pad[:resize_h, :resize_w, :] = img
return im_pad, [ratio_h, ratio_w]
```
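Because resize_image snaps both sides to multiples of 32, the effective detector input depends on the source aspect ratio; a hedged standalone rendering of the same sizing arithmetic:
```python
def db_resize_shape(h, w, target=640):
    # Mirror resize_image's sizing rule: scale the long side toward `target`,
    # then round each side to the nearest multiple of 32 (at least 32).
    ratio = target / max(h, w)
    resize_h = max(int(round(h * ratio / 32) * 32), 32)
    resize_w = max(int(round(w * ratio / 32) * 32), 32)
    return resize_h, resize_w

print(db_resize_shape(500, 700))  # (448, 640): 500 * 640/700 = 457, which rounds to 448
```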
### Recognition model preprocessing
Input preprocessing for the recognition model:
- Aspect-preserving resize that keeps the original H proportion, with padding along the right edge
- Normalization, with mean and standard deviation both defaulting to 0.5
- Transpose: MIGraphX expects input data in [N,C,H,W] order
```python
def preprocess(self, img, max_wh_ratio):
if isinstance(max_wh_ratio,list) ==False:
raise TypeError("max_wh_ratio must be list")
imgH, imgW = self.rec_input_size
max_h,max_w = self.rec_input_size
h, w = img.shape[:2]
# re_size = (max_w,max_h)
    #Keep the original proportion along H (scale height to the model input height)
if h <= max_h:
ratio = max_h / h
w = int(w*ratio)
if w <= max_w:
re_size =(w,max_h)
else:
re_size = (max_w,max_h)
else:
ratio = max_h/h
w,h = int(w*ratio),max_h
if w <= max_w:
re_size = (w,h)
else:
re_size = (max_w,h)
max_wh_ratio.append(ratio)
resized_image = cv2.resize(img, re_size)
resized_image = resized_image.astype("float32")
    #Normalization to [-1, 1]
    resized_image = resized_image.transpose((2, 0, 1)) / 255
    resized_image -= 0.5
    resized_image /= 0.5
    #Zero-pad along the right edge up to the model input width
padding_im = np.zeros((3, imgH, imgW), dtype=np.float32)
padding_im[:, :, 0:re_size[0]] = resized_image
return padding_im
```
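As a quick sanity check of the sizing rule above: a 32x100 crop is scaled by 48/32 = 1.5 to 48x150 and then zero-padded on the right to 48x720; a hedged sketch of the same arithmetic:
```python
def rec_resize(h, w, max_h=48, max_w=720):
    # Mirror preprocess's sizing rule: scale the height to max_h and
    # cap the width at max_w; the rest of max_w is zero-padded on the right.
    ratio = max_h / h
    new_w = min(int(w * ratio), max_w)
    return max_h, new_w

print(rec_resize(32, 100))   # (48, 150), then padded to (48, 720)
print(rec_resize(32, 3000))  # (48, 720): very wide crops are squeezed
```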
## Class overview
PPOcrV5 wraps the public API. TextDetector is the text detection class and TextRecgnizer is the text recognition class. BaseRecLabelDecode converts between the model's output index sequences and actual text labels; CTCLabelDecode inherits from BaseRecLabelDecode and decodes the recognition model's output, turning predicted probabilities into characters joined into sentences.
```python
class PPOcrV5():
def __init__(self,
det_model_path:str,
rec_model_path:str,
char_dict_path:str = "../Resource/ppocr_keys_v5.txt",
db_input_size :list = (640,640),
rec_input_size :list = (48,720),
seg_thresh:float=0.3,
box_thresh:float=0.7,
precision_mode:str='fp32',
offload_copy:bool=True,
**kwargs
)
"""Ocr检测识别推理初始化
字符检测、字符编码、识别。
Args:
det_model_path :字符检测模型路径
rec_model_path : 字符分割模型路径。
char_dict_path :字符集路径
db_input_size :检测模型输入size
rec_input_size :是被模型输入size
seg_thresh :像素分割阈值
box_thresh :字符区域box阈值
precision_mode :精度模式。可选 fp32、fp16
offload_copy : 数据拷贝模式 ,支持两种数据拷贝方式:offload_copy=true、offload_copy=false。当offload_copy为true时,不需要进行内存拷贝,如果为false,需要先预分配输入输出的设备内存,并在推理前,将预处理数据拷贝到设备内存,推理后将模型输出从设备内存中拷贝出来
**kwargs :设置字符检测模型后处理相关参数
Returns:
return_type: NONE。
Examples:
det_onnx_path = "PATH/TO/det_onnx_model.onnx"
rec_onnx_path = "PATH/TO/rec_onnx_model.onnx"
image_path = "PATH/TO/test.png"
ppocrv5 = PPOcrV5(det_onnx_path,rec_onnx_path,offload_copy=True)
"""
class TextDetector(object):
def __init__(
self,
det_model_path,
db_input_size=(640,640),
thresh=0.3,
box_thresh=0.7,
max_candidates=1000,
unclip_ratio=2.0,
use_dilation=False,
score_mode="fast",
box_type="quad",
precision_mode="float32",
**kwargs,
)
"""字符检测模型初始化
字符检测(dbnet)。
Args:
det_model_path :字符检测模型路径。
db_input_size :检测模型输入size
thresh :像素分割阈值
box_thresh :字符区域box阈值
max_candidates : 字符最大候选数
unclip_ratio :polygon 扩散比例
precision_mode :精度模式。可选 "fp16","int8","float32"
use_dilation : 是否对二值图进行膨胀处理
score_mode :评分模式。
box_type :box类型,可选矩形和多边形,这里默认为矩形
offload_copy : 数据拷贝模式 ,支持两种数据拷贝方式:offload_copy=true、offload_copy=false。当offload_copy为true时,不需要进行内存拷贝,如果为false,需要先预分配输入输出的设备内存,并在推理前,将预处理数据拷贝到设备内存,推理后将模型输出从设备内存中拷贝出来
**kwargs :设置字符检测模型后处理相关参数
Returns:
return_type: NONE。
Examples:
self.db_detector = TextDetector(
det_model_path,
db_input_size,
thresh=self.seg_thres,
box_thresh=self.box_thresh,
max_candidates=self.max_candidates,
unclip_ratio=self.unclip_ratio,
box_type=self.box_type,
use_dilation=self.use_dilation,
score_mode=self.score_mode,
precision_mode=precision_mode,
offload_copy=offload_copy
"""
class TextRecgnizer(object):
"""Support SVTR_LCNet """
def __init__(
self,
rec_model_path,
rec_batch_num=2,
rec_input_size=(48, 480),#hw
rec_algorithm="SVTR_LCNet",
precision_mode = "fp32",
**kwargs
)
"""字符识别模型初始化
字符识别(crnn+ctc)。
Args:
rec_model_path :字符识别模型路径。
rec_batch_num :模型推理batch size
rec_input_size :模型推理的最大size
rec_algorithm : 后处理算法类型
unclip_ratio :polygon 扩散比例
precision_mode :精度模式。可选 "fp16","float32"
**kwargs :设置字符识别模型后处理相关参数
Returns:
return_type: NONE。
Examples:
self.text_extractor = TextRecgnizer(rec_model_path=rec_model_path,
rec_input_size=rec_input_size,
precision_mode=precision_mode,
offload_copy=offload_copy)
"""
class BaseRecLabelDecode(object):
def __init__(self, character_dict_path=None,
use_space_char=False)
"""Convert between text-label and text-index
字符识别(crnn+ctc)。
Args:
character_dict_path :字符集文件路径。
use_space_char :字符集中是否包含空格。
Returns:
return_type: NONE。
Examples:
"""
class CTCLabelDecode(BaseRecLabelDecode):
def __init__(self, character_dict_path=None, use_space_char=False, **kwargs):
super(CTCLabelDecode, self).__init__(character_dict_path, use_space_char)
"""Convert between text-label and text-index
字符识别(crnn+ctc)。
Args:
character_dict_path :字符集文件路径。
use_space_char :字符集中是否包含空格。
Returns:
return_type: NONE。
Examples:
"""
```
## Inference
### Detection model inference
```python
def __call__(self, src_img):
data = self.preprocess(src_img)
"""支持两种数据拷贝方式:offload_copy=true、offload_copy=false。当offload_copy为true时,不需要进行内存拷贝,如果为false,需要先预分配输入输出的设备内存,并在推理前,将预处理数据拷贝到设备内存,推理后将模型输出从设备内存中拷贝出来,在做后处理。"""
if self.offload_copy==False:
self.d_mem[self.det_input_name] =migraphx.to_gpu(migraphx.argument(data["image"]))
results = self.db_model.run(self.d_mem)
else:
results = self.db_model.run({self.det_input_name:data["image"]})
    if self.offload_copy==False :
        #Copy the inference result from GPU back to the host
        result = migraphx.from_gpu(results[0])
        print("explicit copy mode")
result = np.array(result)
else:
result = results[0]
shape_list = np.expand_dims(data["shape"], axis=0)
pred = np.array(result)
pred = pred[:, 0, :, :]
    #Keep pixels whose probability exceeds the threshold
segmentation = pred > self.thresh
boxes_batch = []
for batch_index in range(pred.shape[0]):
src_h, src_w, ratio_h, ratio_w = shape_list[batch_index]
if self.dilation_kernel is not None:
mask = cv2.dilate(
np.array(segmentation[batch_index]).astype(np.uint8),
self.dilation_kernel,
)
else:
mask = segmentation[batch_index]
        #Extract text regions from the predicted bitmap
if self.box_type == "poly":
boxes, scores = self.polygons_from_bitmap(
pred[batch_index], mask, ratio_w,ratio_h, src_w, src_h
)
elif self.box_type == "quad":
boxes, scores = self.boxes_from_bitmap(
pred[batch_index], mask, ratio_w,ratio_h, src_w, src_h
)
else:
raise ValueError("box_type can only be one of ['quad', 'poly']")
boxes_batch.append(boxes)
    #Sort text regions top-to-bottom, left-to-right
    det_box_batch = self.sorted_boxes(boxes_batch)
    #Map text-region coordinates back to the original image
dt_boxes,det_rects = self.box_standardization(det_box_batch,shape_list)
return dt_boxes,det_rects
```
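The corner ordering performed later by box_standardization relies on the classic sum/difference trick: the smallest x+y picks the top-left corner, the largest picks the bottom-right, and the sign of y-x separates top-right from bottom-left. A hedged standalone illustration of the same logic:
```python
import numpy as np

def order_corners(box):
    # Order 4 points as top-left, top-right, bottom-right, bottom-left.
    box = np.asarray(box, dtype=np.float32)
    rect = np.zeros((4, 2), np.float32)
    s = box.sum(axis=1)
    rect[0], rect[2] = box[np.argmin(s)], box[np.argmax(s)]   # TL, BR
    rest = np.delete(box, (np.argmin(s), np.argmax(s)), axis=0)
    d = np.diff(rest, axis=1).ravel()                         # y - x per point
    rect[1], rect[3] = rest[np.argmin(d)], rest[np.argmax(d)] # TR, BL
    return rect

print(order_corners([[10, 0], [0, 0], [0, 5], [10, 5]]))
# [[ 0. 0.] [10. 0.] [10. 5.] [ 0. 5.]]
```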
### Recognition model inference
```python
def __call__(self, batch_img_list):
if len(batch_img_list) == 0:
return []
width_list = []
    #Iterate over the image list (character ROIs); to support multi-batch inference, batch_size images are later concatenated via np.concatenate(batch_norm_imgs)
for b in range(len(batch_img_list)):
for img in batch_img_list[b]:
width_list.append(img.shape[1] / float(img.shape[0]))
indices = np.argsort(np.array(width_list))
input_batch = self.rec_batch_num
batch_outputs_pre = []
batch_max_wh_ratio_pre = []
for b in range(len(batch_img_list)):
im_count = len(batch_img_list[b])
batch_outputs = []
batch_max_wh_ratio = []
for beg_img_no in range(0, im_count, input_batch):
end_img_no = min(im_count, beg_img_no + input_batch)
# for ino in range(beg_img_no, end_img_no):
# h, w = batch_img_list[b][indices[ino]].shape[0:2]
# wh_ratio = w * 1.0 / h
# max_wh_ratio = max(max_wh_ratio, wh_ratio)
batch_norm_imgs = []
max_wh_ratio = list()
# N batch
for ino in range(beg_img_no, end_img_no):
                #Per-image preprocessing
norm_img = self.preprocess(batch_img_list[b][indices[ino]], max_wh_ratio)
norm_img = norm_img[np.newaxis, :].astype(np.float32)
batch_norm_imgs.append(norm_img)
batch_max_wh_ratio.append(max_wh_ratio)
            #Concatenate batch_size images into one input tensor
            norm_img_batch = np.concatenate(batch_norm_imgs)
            norm_img_batch = norm_img_batch.copy()
            if self.offload_copy==False:
                print("explicit copy mode")
self.d_mem[self.rec_input_name] =migraphx.to_gpu(migraphx.argument(norm_img_batch))
results = self.rec_model.run(self.d_mem)
output = np.array(results[0])
else:
results = self.rec_model.run({self.rec_input_name:norm_img_batch})
output = results[0]
# batch_outputs.append(np.array(output))
            #Append each output in the batch to batch_outputs for later postprocessing
[batch_outputs.append(out) for out in np.array(output)]
batch_outputs_pre.append(np.array(batch_outputs))
batch_max_wh_ratio_pre.append(batch_max_wh_ratio)
return batch_outputs_pre ,batch_max_wh_ratio_pre
```
# OCRv5 API usage
The API is used in two steps:
- Instantiate the class
- Call the recognition interface
Example:
```python
if __name__ == '__main__':
det_onnx_path = "../Resource/Models/ppocrv5_server_det_infer.onnx"
rec_onnx_path = "../Resource/Models/ppocrv5_server_rec_infer.onnx"
image_path = "../Resource/Images/lite_demo.png"
img = cv2.imread(image_path)
ppocrv5 = PPOcrV5(det_onnx_path,rec_onnx_path,offload_copy=True,precision_mode="fp32")
res_img = ppocrv5(img)
cv2.imwrite("res.jpg",res_img)
```
The sample supports two inference precisions (fp32 and fp16, defaulting to fp32); precision and the memory copy mode are controlled through the precision_mode and offload_copy parameters.
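Concretely, fp16 selection maps to a one-line MIGraphX quantization pass before compilation; quantize_fp16 is the call the sample itself uses, and the paths below mirror the example above:
```python
import migraphx

model = migraphx.parse_onnx("../Resource/Models/ppocrv5_server_det_infer.onnx",
                            map_input_dims={"x": [1, 3, 640, 640]})
migraphx.quantize_fp16(model)  # rewrite the program to fp16 before compiling
model.compile(t=migraphx.get_target("gpu"), offload_copy=True, device_id=0)
```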
# -*- coding: utf-8 -*-
import cv2
import numpy as np
from shapely.geometry import Polygon
import pyclipper
import migraphx
import os
import re
from PIL import Image
def AllocateOutputMemory(model):
    # Pre-allocate one GPU buffer per model output; used when offload_copy=False
    outputData = {}
for key in model.get_outputs().keys():
outputData[key] = migraphx.allocate_gpu(s=model.get_outputs()[key])
return outputData
class BaseRecLabelDecode(object):
"""Convert between text-label and text-index"""
def __init__(self, character_dict_path=None, use_space_char=False):
self.beg_str = "sos"
self.end_str = "eos"
self.reverse = False
self.character_str = []
if character_dict_path is None:
self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"
dict_character = list(self.character_str)
else:
with open(character_dict_path, "rb") as fin:
lines = fin.readlines()
for line in lines:
line = line.decode("utf-8").strip("\n").strip("\r\n")
self.character_str.append(line)
if use_space_char:
self.character_str.append(" ")
dict_character = list(self.character_str)
if "arabic" in character_dict_path:
self.reverse = True
dict_character = self.add_special_char(dict_character)
self.dict = {}
for i, char in enumerate(dict_character):
self.dict[char] = i
self.character = dict_character
def pred_reverse(self, pred):
pred_re = []
c_current = ""
for c in pred:
if not bool(re.search("[a-zA-Z0-9 :*./%+-]", c)):
if c_current != "":
pred_re.append(c_current)
pred_re.append(c)
c_current = ""
else:
c_current += c
if c_current != "":
pred_re.append(c_current)
return "".join(pred_re[::-1])
def add_special_char(self, dict_character):
return dict_character
def get_word_info(self, text, selection):
state = None
word_content = []
word_col_content = []
word_list = []
word_col_list = []
state_list = []
valid_col = np.where(selection == True)[0]
for c_i, char in enumerate(text):
if "\u4e00" <= char <= "\u9fff":
c_state = "cn"
elif bool(re.search("[a-zA-Z0-9]", char)):
c_state = "en&num"
else:
c_state = "splitter"
if (
char == "."
and state == "en&num"
and c_i + 1 < len(text)
and bool(re.search("[0-9]", text[c_i + 1]))
): # grouping floating number
c_state = "en&num"
if (
char == "-" and state == "en&num"
): # grouping word with '-', such as 'state-of-the-art'
c_state = "en&num"
if state == None:
state = c_state
if state != c_state:
if len(word_content) != 0:
word_list.append(word_content)
word_col_list.append(word_col_content)
state_list.append(state)
word_content = []
word_col_content = []
state = c_state
if state != "splitter":
word_content.append(char)
word_col_content.append(valid_col[c_i])
if len(word_content) != 0:
word_list.append(word_content)
word_col_list.append(word_col_content)
state_list.append(state)
return word_list, word_col_list, state_list
def decode(
self,
text_index,
text_prob=None,
is_remove_duplicate=False,
return_word_box=False,
):
"""convert text-index into text-label."""
result_list = []
ignored_tokens = self.get_ignored_tokens()
batch_size = len(text_index)
print(f"Info:{text_index.shape},{text_prob.shape}")
for batch_idx in range(batch_size):
selection = np.ones(len(text_index[batch_idx]), dtype=bool)
if is_remove_duplicate:
selection[1:] = text_index[batch_idx][1:] != text_index[batch_idx][:-1]
for ignored_token in ignored_tokens:
selection &= text_index[batch_idx] != ignored_token
# print(f"[debug] {len(text_index)},{batch_idx},{selection},{text_index[batch_idx][selection]},{len(self.character)}")
char_list = [
self.character[text_id] for text_id in text_index[batch_idx][selection]
]
if text_prob is not None:
conf_list = text_prob[batch_idx][selection]
else:
conf_list = [1] * len(selection)
if len(conf_list) == 0:
conf_list = [0]
text = "".join(char_list)
if self.reverse: # for arabic rec
text = self.pred_reverse(text)
if return_word_box:
word_list, word_col_list, state_list = self.get_word_info(
text, selection
)
result_list.append(
(
text,
np.mean(conf_list).tolist(),
[
len(text_index[batch_idx]),
word_list,
word_col_list,
state_list,
],
)
)
else:
result_list.append((text, np.mean(conf_list).tolist()))
return result_list
def get_ignored_tokens(self):
return [0] # for ctc blank
class CTCLabelDecode(BaseRecLabelDecode):
"""Convert between text-label and text-index"""
def __init__(self, character_dict_path=None, use_space_char=False, **kwargs):
super(CTCLabelDecode, self).__init__(character_dict_path, use_space_char)
def __call__(self, preds, label=None, return_word_box=False, *args, **kwargs):
batch_text_list = []
batch_label_list = []
for b in range(len(preds)):
print(preds[b].shape)
preds_idx = preds[b].argmax(axis=2)
preds_prob = preds[b].max(axis=2)
text = self.decode(
preds_idx,
preds_prob,
is_remove_duplicate=True,
return_word_box=return_word_box,
)
if return_word_box:
for rec_idx, rec in enumerate(text):
wh_ratio = kwargs["wh_ratio_list"][b][id][rec_idx]
rec[2][0] = rec[2][0] /wh_ratio
if label is None:
batch_text_list.append(text)
continue
label = self.decode(label)
batch_text_list.append(text)
batch_label_list.append(label)
return batch_text_list, batch_label_list
def add_special_char(self, dict_character):
dict_character = ["blank"] + dict_character
return dict_character
class TextRecgnizer(object):
"""Support SVTR_LCNet """
def __init__(
self,
rec_model_path,
rec_batch_num=1,
rec_input_size=(48, 480),#hw
rec_algorithm="SVTR_LCNet",
precision_mode = "fp32",
**kwargs
):
self.rec_algorithm = rec_algorithm
self.rec_input_size = rec_input_size
self.precision_mode = precision_mode
self.rec_batch_num = rec_batch_num
self.offload_copy = kwargs.get("offload_copy", True)
if os.path.exists(rec_model_path) and rec_model_path.endswith(".onnx"):
self.rec_input_name = "x"
maxInput={self.rec_input_name:[rec_batch_num,3,self.rec_input_size[0],self.rec_input_size[1]]}
self.rec_model = migraphx.parse_onnx(rec_model_path,map_input_dims=maxInput)
if self.precision_mode == "fp16":
migraphx.quantize_fp16(self.rec_model)
self.rec_model.compile(t=migraphx.get_target("gpu"),offload_copy=self.offload_copy,device_id=0)
inputs = self.rec_model.get_inputs()
outputs = self.rec_model.get_outputs()
if self.offload_copy==False:
self.d_mem = AllocateOutputMemory(self.rec_model)
print("Text recognizition model info:")
print(f" inputs info:{inputs}")
print(f" outputs info:{outputs}")
def __call__(self, batch_img_list):
if len(batch_img_list) == 0:
return []
width_list = []
for b in range(len(batch_img_list)):
for img in batch_img_list[b]:
width_list.append(img.shape[1] / float(img.shape[0]))
indices = np.argsort(np.array(width_list))
input_batch = self.rec_batch_num
batch_outputs_pre = []
batch_max_wh_ratio_pre = []
# print(f"Batch size :{input_batch}")
for b in range(len(batch_img_list)):
im_count = len(batch_img_list[b])
batch_outputs = []
batch_max_wh_ratio = []
for beg_img_no in range(0, im_count, input_batch):
end_img_no = min(im_count, beg_img_no + input_batch)
# for ino in range(beg_img_no, end_img_no):
# h, w = batch_img_list[b][indices[ino]].shape[0:2]
# wh_ratio = w * 1.0 / h
# max_wh_ratio = max(max_wh_ratio, wh_ratio)
batch_norm_imgs = []
max_wh_ratio = list()
# N batch
for ino in range(beg_img_no, end_img_no):
norm_img = self.preprocess(batch_img_list[b][indices[ino]], max_wh_ratio)
norm_img = norm_img[np.newaxis, :].astype(np.float32)
batch_norm_imgs.append(norm_img)
if len(batch_norm_imgs)==0:
continue
batch_max_wh_ratio.append(max_wh_ratio)
# if self.rec_batch_num >1:
# norm_img_batch = np.concatenate(batch_norm_imgs)
# norm_img_batch = norm_img_batch.copy()
# else:
# norm_img_batch = np.concatenate(batch_norm_imgs)
# norm_img_batch = norm_img_batch.copy()
norm_img_batch = np.concatenate(batch_norm_imgs)
norm_img_batch = norm_img_batch.copy()
# print(f"batch shape:{norm_img_batch.shape}")
if self.offload_copy==False:
print("offload copy model")
self.d_mem[self.rec_input_name] =migraphx.to_gpu(migraphx.argument(norm_img_batch))
results = self.rec_model.run(self.d_mem)
output = np.array(results[0])
else:
results = self.rec_model.run({self.rec_input_name:norm_img_batch})
output = results[0]
# batch_outputs.append(np.array(output))
[batch_outputs.append(out) for out in np.array(output)]
batch_outputs_pre.append(np.array(batch_outputs))
batch_max_wh_ratio_pre.append(batch_max_wh_ratio)
return batch_outputs_pre ,batch_max_wh_ratio_pre
def preprocess(self, img, max_wh_ratio):
if isinstance(max_wh_ratio,list) ==False:
raise TypeError("max_wh_ratio must be list")
imgH, imgW = self.rec_input_size
max_h,max_w = self.rec_input_size
h, w = img.shape[:2]
# re_size = (max_w,max_h)
        #Resize along the H axis (scale height to the model input height)
if h <= max_h:
ratio = max_h / h
w = int(w*ratio)
if w <= max_w:
re_size =(w,max_h)
else:
re_size = (max_w,max_h)
else:
ratio = max_h/h
w,h = int(w*ratio),max_h
if w <= max_w:
re_size = (w,h)
else:
re_size = (max_w,h)
max_wh_ratio.append(ratio)
resized_image = cv2.resize(img, re_size)
resized_image = resized_image.astype("float32")
resized_image = resized_image.transpose((2, 0, 1)) / 255
resized_image -= 0.5
resized_image /= 0.5
padding_im = np.zeros((3, imgH, imgW), dtype=np.float32)
padding_im[:, :, 0:re_size[0]] = resized_image
return padding_im
class TextDetector(object):
def __init__(
self,
det_model_path,
db_input_size=(640,640),
thresh=0.3,
box_thresh=0.7,
max_candidates=1000,
unclip_ratio=2.0,
use_dilation=False,
score_mode="fast",
box_type="quad",
precision_mode="float32",
**kwargs,
):
self.thresh = thresh
self.db_input_size = db_input_size
self.box_thresh = box_thresh
self.max_candidates = max_candidates
self.unclip_ratio = unclip_ratio
self.min_size = 3
self.score_mode = score_mode
self.box_type = box_type
self.precision_mode = precision_mode
assert score_mode in [
"slow",
"fast",
], "Score mode must be in [slow, fast] but got: {}".format(score_mode)
self.dilation_kernel = None if not use_dilation else np.array([[1, 1], [1, 1]])
self.offload_copy = kwargs.get("offload_copy", True)
if os.path.exists(det_model_path) and det_model_path.endswith(".onnx"):
self.det_input_name = "x"
maxInput={self.det_input_name:[1,3,db_input_size[0],db_input_size[1]]}
self.db_model = migraphx.parse_onnx(det_model_path,map_input_dims=maxInput)
inputs = self.db_model.get_inputs()
outputs = self.db_model.get_outputs()
# if self.precision_mode == "int8":
# print("int8 quantization")
# dic = dict()
# image_path = "../Resource/Images/lite_demo.png"
# img = cv2.imread(image_path)
# data = self.preprocess(img)
# print(data["image"].shape)
# print(data["image"].dtype)
# dic[self.det_input_name] = migraphx.argument(data["image"].copy())
# calibration = [dic]
# migraphx.quantize_int8(self.db_model, migraphx.get_target("gpu"), calibration)
if self.precision_mode == "fp16":
migraphx.quantize_fp16(self.db_model)
self.db_model.compile(t=migraphx.get_target("gpu"),offload_copy=self.offload_copy,device_id=0)
if self.offload_copy==False:
self.d_mem = AllocateOutputMemory(self.db_model)
print("Detection model info:")
print(f" inputs info:{inputs}")
print(f" outputs info:{outputs}")
def polygons_from_bitmap(self, pred, _bitmap, ratio_w,ratio_h,dest_width, dest_height):
"""
_bitmap: single map with shape (1, H, W),
whose values are binarized as {0, 1}
"""
bitmap = _bitmap
height, width = bitmap.shape
boxes = []
scores = []
contours, _ = cv2.findContours(
(bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE
)
for contour in contours[: self.max_candidates]:
epsilon = 0.002 * cv2.arcLength(contour, True)
approx = cv2.approxPolyDP(contour, epsilon, True)
points = approx.reshape((-1, 2))
if points.shape[0] < 4:
continue
score = self.box_score_fast(pred, points.reshape(-1, 2))
if self.box_thresh > score:
continue
if points.shape[0] > 2:
box = self.unclip(points, self.unclip_ratio)
if len(box) > 1:
continue
else:
continue
box = np.array(box).reshape(-1, 2)
if len(box) == 0:
continue
_, sside = self.get_mini_boxes(box.reshape((-1, 1, 2)))
if sside < self.min_size + 2:
continue
box = np.array(box)
box[:, 0] = np.clip(np.round(box[:, 0] /ratio_w), 0, dest_width)
box[:, 1] = np.clip(
np.round(box[:, 1] / ratio_h), 0, dest_height
)
boxes.append(box.tolist())
scores.append(score)
return boxes, scores
def boxes_from_bitmap(self, pred, _bitmap, ratio_w,ratio_h, dest_width, dest_height):
"""
_bitmap: single map with shape (1, H, W),
whose values are binarized as {0, 1}
"""
bitmap = _bitmap
height, width = bitmap.shape
outs = cv2.findContours(
(bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE
)
if len(outs) == 3:
img, contours, _ = outs[0], outs[1], outs[2]
elif len(outs) == 2:
contours, _ = outs[0], outs[1]
num_contours = min(len(contours), self.max_candidates)
boxes = []
scores = []
for index in range(num_contours):
contour = contours[index]
points, sside = self.get_mini_boxes(contour)
if sside < self.min_size:
continue
points = np.array(points)
if self.score_mode == "fast":
score = self.box_score_fast(pred, points.reshape(-1, 2))
else:
score = self.box_score_slow(pred, contour)
if self.box_thresh > score:
continue
box = self.unclip(points, self.unclip_ratio)
if len(box) > 1:
continue
box = np.array(box).reshape(-1, 1, 2)
box, sside = self.get_mini_boxes(box)
if sside < self.min_size + 2:
continue
box = np.array(box)
box[:, 0] = np.clip(np.round(box[:, 0] / ratio_w), 0, dest_width)
box[:, 1] = np.clip(
np.round(box[:, 1] / ratio_h), 0, dest_height
)
boxes.append(box.astype("int32"))
scores.append(score)
return np.array(boxes, dtype="int32"), scores
def unclip(self, box, unclip_ratio):
poly = Polygon(box)
distance = poly.area * unclip_ratio / poly.length
offset = pyclipper.PyclipperOffset()
offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
expanded = offset.Execute(distance)
return expanded
def get_mini_boxes(self, contour):
bounding_box = cv2.minAreaRect(contour)
points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])
index_1, index_2, index_3, index_4 = 0, 1, 2, 3
if points[1][1] > points[0][1]:
index_1 = 0
index_4 = 1
else:
index_1 = 1
index_4 = 0
if points[3][1] > points[2][1]:
index_2 = 2
index_3 = 3
else:
index_2 = 3
index_3 = 2
box = [points[index_1], points[index_2], points[index_3], points[index_4]]
return box, min(bounding_box[1])
def box_score_fast(self, bitmap, _box):
"""
box_score_fast: use bbox mean score as the mean score
"""
h, w = bitmap.shape[:2]
box = _box.copy()
xmin = np.clip(np.floor(box[:, 0].min()).astype("int32"), 0, w - 1)
xmax = np.clip(np.ceil(box[:, 0].max()).astype("int32"), 0, w - 1)
ymin = np.clip(np.floor(box[:, 1].min()).astype("int32"), 0, h - 1)
ymax = np.clip(np.ceil(box[:, 1].max()).astype("int32"), 0, h - 1)
mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
box[:, 0] = box[:, 0] - xmin
box[:, 1] = box[:, 1] - ymin
cv2.fillPoly(mask, box.reshape(1, -1, 2).astype("int32"), 1)
return cv2.mean(bitmap[ymin : ymax + 1, xmin : xmax + 1], mask)[0]
def box_score_slow(self, bitmap, contour):
"""
box_score_slow: use polyon mean score as the mean score
"""
h, w = bitmap.shape[:2]
contour = contour.copy()
contour = np.reshape(contour, (-1, 2))
xmin = np.clip(np.min(contour[:, 0]), 0, w - 1)
xmax = np.clip(np.max(contour[:, 0]), 0, w - 1)
ymin = np.clip(np.min(contour[:, 1]), 0, h - 1)
ymax = np.clip(np.max(contour[:, 1]), 0, h - 1)
mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
contour[:, 0] = contour[:, 0] - xmin
contour[:, 1] = contour[:, 1] - ymin
cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype("int32"), 1)
return cv2.mean(bitmap[ymin : ymax + 1, xmin : xmax + 1], mask)[0]
def box_standardization(self,boxes_batch,shape_list):
dt_batch_boxs = []
dt_batch_rects = []
for b in range(len(boxes_batch)):
src_h, src_w, _, _ = shape_list[b]
det_boxs = []
det_rects = []
for box in boxes_batch[b]:
if isinstance(box,list):
box = np.array(box)
rect = np.zeros((4, 2), dtype="float32")
s = box.sum(axis=1)
rect[0] = box[np.argmin(s)]
rect[2] = box[np.argmax(s)]
tmp = np.delete(box, (np.argmin(s), np.argmax(s)), axis=0)
                #diff = y - x: smallest diff -> top-right, largest -> bottom-left
diff = np.diff(np.array(tmp), axis=1)
rect[1] = tmp[np.argmin(diff)]
rect[3] = tmp[np.argmax(diff)]
for i in range(rect.shape[0]):
rect[i, 0] = int(min(max(rect[i, 0], 0), src_w - 1))
rect[i, 1] = int(min(max(rect[i, 1], 0), src_h - 1))
b_w = int(np.linalg.norm(box[0] - box[1]))
b_h = int(np.linalg.norm(box[0] - box[3]))
if b_w <= 3 or b_h <= 3:
continue
_rect = [int(rect[0][0]),int(rect[0][1]),int(rect[2][0]),int(rect[2][1])]
det_boxs.append(rect)
det_rects.append(_rect)
dt_batch_boxs.append(det_boxs)
dt_batch_rects.append(det_rects)
return dt_batch_boxs,dt_batch_rects
def __call__(self, src_img):
data = self.preprocess(src_img)
        if self.offload_copy==False:
            self.d_mem[self.det_input_name] = migraphx.to_gpu(migraphx.argument(data["image"]))
results = self.db_model.run(self.d_mem)
else:
results = self.db_model.run({self.det_input_name:data["image"]})
if self.offload_copy==False :
            result = migraphx.from_gpu(results[0])  # copy the result from GPU back to the host
            print("explicit copy mode")
result = np.array(result)
else:
result = results[0]
shape_list = np.expand_dims(data["shape"], axis=0)
pred = np.array(result)
pred = pred[:, 0, :, :]
segmentation = pred > self.thresh
boxes_batch = []
for batch_index in range(pred.shape[0]):
src_h, src_w, ratio_h, ratio_w = shape_list[batch_index]
if self.dilation_kernel is not None:
mask = cv2.dilate(
np.array(segmentation[batch_index]).astype(np.uint8),
self.dilation_kernel,
)
else:
mask = segmentation[batch_index]
if self.box_type == "poly":
boxes, scores = self.polygons_from_bitmap(
pred[batch_index], mask, ratio_w,ratio_h, src_w, src_h
)
elif self.box_type == "quad":
boxes, scores = self.boxes_from_bitmap(
pred[batch_index], mask, ratio_w,ratio_h, src_w, src_h
)
else:
raise ValueError("box_type can only be one of ['quad', 'poly']")
boxes_batch.append(boxes)
det_box_batch = self.sorted_boxes(boxes_batch)
dt_boxes,det_rects = self.box_standardization(det_box_batch,shape_list)
return dt_boxes,det_rects
def preprocess(self, src_img,
mean: list = [0.485, 0.456, 0.406],
std: list = [0.229, 0.224, 0.225],
scale: float = 1.0/255):
data = dict()
img = src_img.copy()
src_h, src_w, _ = img.shape
res_img, [ratio_h, ratio_w] = self.resize_image(img)
norm_img = (res_img* scale - mean) / std
image_data = norm_img.transpose(2, 0, 1)
image_data = np.expand_dims(image_data, axis=0).astype(np.float32)
image_data = np.ascontiguousarray(image_data)
data["image"] = image_data
data["shape"] = np.array([src_h, src_w, ratio_h, ratio_w])
return data
def resize_image(self, img):
h, w, _ = img.shape
if h > w:
ratio = float(self.db_input_size[1]) / h
else:
ratio = float(self.db_input_size[0]) / w
resize_h = int(h * ratio)
resize_w = int(w * ratio)
resize_h = max(int(round(resize_h / 32) * 32), 32)
resize_w = max(int(round(resize_w / 32) * 32), 32)
try:
if int(resize_w) <= 0 or int(resize_h) <= 0:
return None, (None, None)
img = cv2.resize(img, (int(resize_w), int(resize_h)))
except:
print(img.shape, resize_w, resize_h)
raise ValueError("resize error")
ratio_h = resize_h / float(h)
ratio_w = resize_w / float(w)
im_pad = np.zeros((self.db_input_size[1], self.db_input_size[0], 3), np.float32)
im_pad[:resize_h, :resize_w, :] = img
return im_pad, [ratio_h, ratio_w]
def sorted_boxes(self,dt_boxes):
"""
Sort text boxes in order from top to bottom, left to right
args:
dt_boxes(array):detected text boxes with shape [4, 2]
return:
sorted boxes(array) with shape [4, 2]
"""
batch_boxes = list()
# print(dt_boxes)
for b in range(len(dt_boxes)):
num_boxes = dt_boxes[b].shape[0]
batch_sorted_boxes = sorted(dt_boxes[b], key=lambda x: (x[0][1], x[0][0]))
_boxes = list(batch_sorted_boxes)
for i in range(num_boxes - 1):
for j in range(i, -1, -1):
if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and (
_boxes[j + 1][0][0] < _boxes[j][0][0]
):
tmp = _boxes[j]
_boxes[j] = _boxes[j + 1]
_boxes[j + 1] = tmp
else:
break
batch_boxes.append(_boxes)
# print("----------------------------------------")
# print(batch_boxes)
return batch_boxes
class PPOcrV5():
def __init__(self,
det_model_path:str,
rec_model_path:str,
char_dict_path:str = "../Resource/ppocr_keys_v5.txt",
db_input_size :list = (640,640),
rec_input_size :list = (48,720),
seg_thresh:float=0.3,
box_thresh:float=0.7,
precision_mode:str='fp32',
offload_copy:bool=True,
**kwargs
):
"""
det_model_path: detection model path
rec_model_path: recognition model path
seg_thresh: dbnet segmentation threshold
box_thresh: box threshold
db_input_size: dbnet input size
"""
self.seg_thres = seg_thresh
self.box_thresh = box_thresh
self.db_input_size = db_input_size
self.offload_copy = offload_copy
if hasattr(kwargs,"max_candidates"):
self.max_candidates = kwargs["max_candidates"]
else:
self.max_candidates = 1000
if hasattr(kwargs,"unclip_ratio"):
self.unclip_ratio = kwargs["unclip_ratio"]
else:
self.unclip_ratio = 2.0
if hasattr(kwargs,"use_dilation"):
self.use_dilation = kwargs["use_dilation"]
else:
self.use_dilation = False
if hasattr(kwargs,"score_mode"):
self.score_mode = kwargs["score_mode"]
else:
self.score_mode = "fast"
if hasattr(kwargs,"box_type"):
self.box_type = kwargs["box_type"]
else:
self.box_type = "quad"
self.db_detector = TextDetector(
det_model_path,
db_input_size,
thresh=self.seg_thres,
box_thresh=self.box_thresh,
max_candidates=self.max_candidates,
unclip_ratio=self.unclip_ratio,
box_type=self.box_type,
use_dilation=self.use_dilation,
score_mode=self.score_mode,
precision_mode=precision_mode,
offload_copy=offload_copy
)
self.text_extractor = TextRecgnizer(rec_model_path=rec_model_path,
rec_input_size=rec_input_size,
precision_mode=precision_mode,
offload_copy=offload_copy)
self.ctc_decoder = CTCLabelDecode(character_dict_path=char_dict_path,
use_space_char=True)
def __call__(self, src_img):
import time
start = time.time()
dt_boxs,dt_rects = self.db_detector(src_img)
res_img = self.vis_boxes(dt_boxs,src_img)
batch_img_list = self.detection_roi_crop(src_img,dt_rects)
batch_outputs_pre ,batch_max_wh_ratio_pre = self.text_extractor(batch_img_list)
batch_text_list, batch_label_list = self.ctc_decoder(batch_outputs_pre,return_word_box=False,wh_ratio_list = batch_max_wh_ratio_pre)
end = time.time()
batch_text_out = []
batch_boxes_out = []
for b in range(len(dt_boxs)):
text_out = []
boxex_out = []
print("text box num:",len(dt_boxs[b]))
for box, rec_result in zip(dt_boxs[b], batch_text_list[b]):
text, score = rec_result[0], rec_result[1]
if score >= 0.5:
text_out.append(rec_result)
boxex_out.append(box)
batch_text_out.append(text_out)
batch_boxes_out.append(boxex_out)
for b in range(len(batch_text_out)):
for text, score in batch_text_out[b]:
print("{}, {:.3f}".format(text, score))
# res_img = self.vis_oct_text(batch_text_out,dt_rects,res_img)
print(f"[Time info] elapsed:{end-start:.4f}")
return res_img
def detection_roi_crop(self,src_img,rects):
batch_cut_imgs = list()
for b in range(len(rects)):
crop_imgs = list()
for rect in rects[b]:
x_min,y_min,x_max,y_max = rect
rect_w ,rect_h = x_max-x_min,y_max-y_min
# if rect_w<3 or rect_h<3:
# continue
# print(x_min,y_min,x_max,y_max)
crop_img = src_img[y_min:y_max, x_min:x_max,:]
crop_imgs.append(crop_img)
batch_cut_imgs.append(crop_imgs)
return batch_cut_imgs
    def vis_oct_text(self,batch_text,batch_rect,src_img,font_path="../Resource/fonts/simfang.ttf"):
from PIL import Image, ImageDraw, ImageFont
img = np.zeros(src_img.shape, dtype=np.uint8)
img.fill(114)
pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
draw = ImageDraw.Draw(pil_img)
for b in range(len(batch_text)):
for id,text in enumerate(batch_text[b]):
text,conf = text
f_start = batch_rect[b][id][0:2]
w,h = np.array(batch_rect[b][id][2:]) - np.array(batch_rect[b][id][0:2])
font_size = int(h*0.9)
                font = ImageFont.truetype(font_path, font_size, encoding="utf-8")
draw.text(f_start, text, font=font, fill=(0, 255, 0))
res_img = np.concatenate([src_img, np.array(pil_img)], axis=1)
return res_img
def vis_boxes(self,boxes, img, colors=(255,0,0), thickness=2):
for b in range(len(boxes)):
for tl,tr,br,bl in boxes[b]:
box = [int(tl[0]),int(tl[1]),int(br[0]),int(br[1])]
cv2.rectangle(img, (box[0],box[1]), (box[2],box[3]), colors, thickness)
return img
if __name__ == '__main__':
det_onnx_path = "../Resource/Models/ppocrv5_server_det_infer.onnx"
rec_onnx_path = "../Resource/Models/ppocrv5_server_rec_infer.onnx"
image_path = "../Resource/Images/lite_demo.png"
img = cv2.imread(image_path)
ppocrv5 = PPOcrV5(det_onnx_path,rec_onnx_path,offload_copy=True,precision_mode="fp32")
res_img = ppocrv5(img)
cv2.imwrite("res.jpg",res_img)
opencv-python
numpy
shapely
pyclipper