Commit 5713e0ca authored by yangql

Initial commit
#! /bin/sh
############### Ubuntu ###############
# Reference: https://docs.opencv.org/3.4.11/d7/d9f/tutorial_linux_install.html
# apt-get install build-essential -y
# apt-get install cmake git libgtk2.0-dev pkg-config libavcodec-dev libavformat-dev libswscale-dev -y
# apt-get install python-dev python-numpy libtbb2 libtbb-dev libjpeg-dev libpng-dev libtiff-dev libjasper-dev libdc1394-22-dev -y # packages needed for image processing, optional
############### CentOS ###############
yum install gcc gcc-c++ gtk2-devel gimp-devel gimp-devel-tools gimp-help-browser zlib-devel libtiff-devel libjpeg-devel libpng-devel gstreamer-devel libavc1394-devel libraw1394-devel libdc1394-devel jasper-devel jasper-utils swig python libtool nasm -y
############################ Install dependencies online ###############################
#cd ./3rdParty
#pip install rbuild-master.tar.gz
############################ Install dependencies offline ###############################
# Install dependencies
cd ./3rdParty/rbuild_depend
pip install click-6.6-py2.py3-none-any.whl
pip install six-1.15.0-py2.py3-none-any.whl
pip install subprocess32-3.5.4.tar.gz
pip install cget-0.1.9.tar.gz
# Install rbuild
cd ../
pip install rbuild-master.tar.gz
# Minimum CMake version
cmake_minimum_required(VERSION 3.5)
# Project name
project(RapidOcrOnnx)
# Compiler settings
set(CMAKE_CXX_COMPILER g++)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") # versions 2.2 and above require C++17
set(CMAKE_BUILD_TYPE Release)
# Header search paths
set(INCLUDE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/include/
$ENV{DTKROOT}/include/
${CMAKE_CURRENT_SOURCE_DIR}/depend/include/)
include_directories(${INCLUDE_PATH})
# Library search paths
set(LIBRARY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/depend/lib64/
$ENV{DTKROOT}/lib/)
link_directories(${LIBRARY_PATH})
# Libraries to link
set(LIBRARY opencv_core
opencv_imgproc
opencv_imgcodecs
opencv_dnn
onnxruntime
)
link_libraries(${LIBRARY})
# Source files
set(SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/Src/main.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/AngleNet.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/clipper.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/CrnnNet.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/DbNet.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/getopt.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/OcrLite.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/OcrLiteCApi.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/OcrLiteJni.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/OcrResultUtils.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/OcrUtils.cpp
)
# Executable target
add_executable(RapidOcr ${SOURCE_FILES})
# RapidOcr
This example uses the RapidOcr model to show how to run inference for an image text-recognition model with the ONNXRuntime C++ API: preprocessing, running inference, and retrieving the results.
## Model overview
This example uses three models: ch_PP-OCRv3_det + ch_ppocr_mobile_v2.0_cls + ch_PP-OCRv3_rec. The onnx files live in the Resource/Models/ folder. The model structure, including each model's inputs and outputs, can be inspected with netron (https://netron.app/).
## Preprocessing
Before the data is fed to the models, the image is preprocessed as follows: it is read from disk, resized, padded, and scaled. The preprocessed image and the remaining parameters are then passed to the overloaded detect function, which performs the recognition and returns the result in an OcrResult object.
The example implements this preprocessing as shown below:
```c++
OcrResult OcrLite::detect(const char *path, const char *imgName,
                          const int padding, const int maxSideLen,
                          float boxScoreThresh, float boxThresh, float unClipRatio,
                          bool doAngle, bool mostAngle) {
    std::string imgFile = getSrcImgFilePath(path, imgName);
    cv::Mat originSrc = imread(imgFile, cv::IMREAD_COLOR); // default: BGR
    int originMaxSide = (std::max)(originSrc.cols, originSrc.rows);
    int resize;
    if (maxSideLen <= 0 || maxSideLen > originMaxSide) {
        resize = originMaxSide;
    } else {
        resize = maxSideLen;
    }
    resize += 2 * padding;
    cv::Rect paddingRect(padding, padding, originSrc.cols, originSrc.rows);
    cv::Mat paddingSrc = makePadding(originSrc, padding);
    ScaleParam scale = getScaleParam(paddingSrc, resize);
    OcrResult result;
    result = detect(path, imgName, paddingSrc, paddingRect, scale,
                    boxScoreThresh, boxThresh, unClipRatio, doAngle, mostAngle);
    return result;
}

OcrResult OcrLite::detect(const cv::Mat &mat, int padding, int maxSideLen,
                          float boxScoreThresh, float boxThresh,
                          float unClipRatio, bool doAngle, bool mostAngle) {
    cv::Mat originSrc = mat;
    int originMaxSide = (std::max)(originSrc.cols, originSrc.rows);
    int resize;
    if (maxSideLen <= 0 || maxSideLen > originMaxSide) {
        resize = originMaxSide;
    } else {
        resize = maxSideLen;
    }
    resize += 2 * padding;
    cv::Rect paddingRect(padding, padding, originSrc.cols, originSrc.rows);
    cv::Mat paddingSrc = makePadding(originSrc, padding);
    ScaleParam scale = getScaleParam(paddingSrc, resize);
    OcrResult result;
    result = detect(NULL, NULL, paddingSrc, paddingRect, scale,
                    boxScoreThresh, boxThresh, unClipRatio, doAngle, mostAngle);
    return result;
}
```
These two snippets are the two overloads of OcrLite's detect function.
The first overload takes a file path and an image name and reads the image from disk. Its flow is:
1. Read the image with OpenCV's imread; it is stored in BGR order in originSrc. Compute the longer side of originSrc with std::max over its width and height and store it in originMaxSide; the target size resize is derived from it (or from maxSideLen) plus twice the padding.
2. Build a cv::Rect named paddingRect describing the position and size of the original image inside the padded one: padding gives the offset, and originSrc.cols and originSrc.rows give the original width and height. Call makePadding with originSrc and padding and store the padded image in paddingSrc.
3. Call getScaleParam with paddingSrc and resize; it returns a ScaleParam object scale describing how the image will be scaled.
4. Call the overloaded detect with the file path and image name, the padded image paddingSrc, the padding region paddingRect, the scale parameters scale, and the remaining arguments; store the returned value in result and return it.
The second overload takes a cv::Mat directly. Its flow matches the first, except that the file-reading step is skipped and the given mat is used as the source image. Both overloads run text recognition on the input image and return an OcrResult; the processing covers resizing, padding, scaling, and recognition.
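For illustration, the same resize-and-pad logic as a minimal Python/OpenCV sketch; `make_padding` and `get_scale_param` are stand-ins for the C++ helpers `makePadding`/`getScaleParam`, and the constant white border is an assumption:
```python
import cv2
import numpy as np

def make_padding(src: np.ndarray, padding: int) -> np.ndarray:
    # Pad all four sides with a constant border so text near the edge
    # is not clipped (white padding is an assumption of this sketch).
    if padding <= 0:
        return src
    return cv2.copyMakeBorder(src, padding, padding, padding, padding,
                              cv2.BORDER_CONSTANT, value=(255, 255, 255))

def get_scale_param(src: np.ndarray, target_size: int) -> dict:
    # Scale the longer side to target_size, keeping the aspect ratio.
    src_h, src_w = src.shape[:2]
    ratio = target_size / float(max(src_h, src_w))
    dst_w, dst_h = int(src_w * ratio), int(src_h * ratio)
    return {"srcWidth": src_w, "srcHeight": src_h,
            "dstWidth": dst_w, "dstHeight": dst_h,
            "ratioWidth": dst_w / src_w, "ratioHeight": dst_h / src_h}
```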
## Inference
### Inference has three stages:
#### Stage 1:
DbNet::getTextBoxes() uses the ch_ppocr_v3_det_infer.onnx model, a pretrained text-detection model. It detects the text regions in an image and returns the positions and bounding boxes of the detected text.
```c++
std::vector<TextBox> DbNet::getTextBoxes(cv::Mat &src, ScaleParam &s,
                                         float boxScoreThresh, float boxThresh, float unClipRatio) {
    // build the input
    cv::Mat srcResize;
    resize(src, srcResize, cv::Size(s.dstWidth, s.dstHeight));
    std::vector<float> inputTensorValues = substractMeanNormalize(srcResize, meanValues, normValues);
    std::array<int64_t, 4> inputShape{1, srcResize.channels(), srcResize.rows, srcResize.cols};
    auto memoryInfo = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
    Ort::Value inputTensor = Ort::Value::CreateTensor<float>(memoryInfo, inputTensorValues.data(),
                                                             inputTensorValues.size(), inputShape.data(),
                                                             inputShape.size());
    assert(inputTensor.IsTensor());
    std::vector<const char *> inputNames = {inputNamesPtr.data()->get()};
    std::vector<const char *> outputNames = {outputNamesPtr.data()->get()};
    // run inference
    auto outputTensor = session->Run(Ort::RunOptions{nullptr}, inputNames.data(), &inputTensor,
                                     inputNames.size(), outputNames.data(), outputNames.size());
    assert(outputTensor.size() == 1 && outputTensor.front().IsTensor());
    std::vector<int64_t> outputShape = outputTensor[0].GetTensorTypeAndShapeInfo().GetShape();
    int64_t outputCount = std::accumulate(outputShape.begin(), outputShape.end(), 1,
                                          std::multiplies<int64_t>());
    float *floatArray = outputTensor.front().GetTensorMutableData<float>();
    std::vector<float> outputData(floatArray, floatArray + outputCount);
    ...
}
```
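The C++ calls above map almost one-to-one onto the ONNXRuntime Python API. A minimal sketch (the model path, image path, and mean/std constants are assumptions for illustration):
```python
import cv2
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("Resource/Models/ch_PP-OCRv3_det_infer.onnx",
                               providers=["CPUExecutionProvider"])
img = cv2.imread("test.jpg")  # BGR, HWC, uint8
# subtract the mean and divide by the std, as substractMeanNormalize does in C++
mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
blob = ((img.astype(np.float32) / 255.0 - mean) / std).transpose(2, 0, 1)
blob = blob[np.newaxis]  # NCHW, batch of 1 (H and W should be multiples of 32)
input_name = session.get_inputs()[0].name
probs = session.run(None, {input_name: blob})[0]  # 1 x 1 x H x W probability map
```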
#### Stage 2:
Angle AngleNet::getAngle() uses the ch_ppocr_v2_cls_infer.onnx model, a pretrained classifier for text-orientation classification. It predicts which orientation class a text line belongs to, along with the class probability.
```c++
Angle AngleNet::getAngle(cv::Mat &src) {
    // build the input
    std::vector<float> inputTensorValues = substractMeanNormalize(src, meanValues, normValues);
    std::array<int64_t, 4> inputShape{1, src.channels(), src.rows, src.cols};
    auto memoryInfo = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
    Ort::Value inputTensor = Ort::Value::CreateTensor<float>(memoryInfo, inputTensorValues.data(),
                                                             inputTensorValues.size(), inputShape.data(),
                                                             inputShape.size());
    assert(inputTensor.IsTensor());
    std::vector<const char *> inputNames = {inputNamesPtr.data()->get()};
    std::vector<const char *> outputNames = {outputNamesPtr.data()->get()};
    // run inference
    auto outputTensor = session->Run(Ort::RunOptions{nullptr}, inputNames.data(), &inputTensor,
                                     inputNames.size(), outputNames.data(), outputNames.size());
    assert(outputTensor.size() == 1 && outputTensor.front().IsTensor());
    std::vector<int64_t> outputShape = outputTensor[0].GetTensorTypeAndShapeInfo().GetShape();
    int64_t outputCount = std::accumulate(outputShape.begin(), outputShape.end(), 1,
                                          std::multiplies<int64_t>());
    float *floatArray = outputTensor.front().GetTensorMutableData<float>();
    std::vector<float> outputData(floatArray, floatArray + outputCount);
    return scoreToAngle(outputData);
}
```
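scoreToAngle itself is not shown above; conceptually it just takes the argmax over the orientation scores. A minimal sketch (names are illustrative; the two classes follow the classifier's label_list ['0', '180']):
```python
def score_to_angle(output_data):
    # output_data holds one score per orientation class;
    # index 0 means upright, index 1 means rotated by 180 degrees.
    best = max(range(len(output_data)), key=lambda i: output_data[i])
    return best, output_data[best]  # class index and its confidence
```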
#### Stage 3:
TextLine CrnnNet::getTextLine() uses ch_ppocr_v3_rec_infer.onnx, a pretrained text-recognition model. It takes the image region of one text box as input and returns the recognized text for that region.
```c++
TextLine CrnnNet::getTextLine(const cv::Mat &src) {
    // build the input: scale the crop to the fixed input height
    float scale = (float) dstHeight / (float) src.rows;
    int dstWidth = int((float) src.cols * scale);
    cv::Mat srcResize;
    resize(src, srcResize, cv::Size(dstWidth, dstHeight));
    std::vector<float> inputTensorValues = substractMeanNormalize(srcResize, meanValues, normValues);
    std::array<int64_t, 4> inputShape{1, srcResize.channels(), srcResize.rows, srcResize.cols};
    auto memoryInfo = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
    Ort::Value inputTensor = Ort::Value::CreateTensor<float>(memoryInfo, inputTensorValues.data(),
                                                             inputTensorValues.size(), inputShape.data(),
                                                             inputShape.size());
    assert(inputTensor.IsTensor());
    std::vector<const char *> inputNames = {inputNamesPtr.data()->get()};
    std::vector<const char *> outputNames = {outputNamesPtr.data()->get()};
    // run inference
    auto outputTensor = session->Run(Ort::RunOptions{nullptr}, inputNames.data(), &inputTensor,
                                     inputNames.size(), outputNames.data(), outputNames.size());
    assert(outputTensor.size() == 1 && outputTensor.front().IsTensor());
    std::vector<int64_t> outputShape = outputTensor[0].GetTensorTypeAndShapeInfo().GetShape();
    int64_t outputCount = std::accumulate(outputShape.begin(), outputShape.end(), 1,
                                          std::multiplies<int64_t>());
    float *floatArray = outputTensor.front().GetTensorMutableData<float>();
    std::vector<float> outputData(floatArray, floatArray + outputCount);
    return scoreToTextLine(outputData, outputShape[1], outputShape[2]);
}
```
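scoreToTextLine performs greedy CTC decoding on the recognizer output (shape 1 x T x C, matching the outputShape[1] and outputShape[2] arguments above): take the argmax per time step, then drop repeats and the blank class. A Python sketch under those assumptions (blank index 0 and the `keys` character table are assumptions):
```python
import numpy as np

def ctc_greedy_decode(probs: np.ndarray, keys: list):
    # probs: T x C per-time-step class scores; keys: the character table
    indices = probs.argmax(axis=1)
    text, scores, last = [], [], 0
    for t, idx in enumerate(indices):
        if idx != 0 and idx != last:  # skip blank (index 0) and repeated labels
            text.append(keys[idx - 1])
            scores.append(probs[t, idx])
        last = idx
    return "".join(text), float(np.mean(scores)) if scores else 0.0
```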
# RapidOcr
This example uses the RapidOcr model to show how to run inference for an image text-recognition model with the ONNXRuntime Python API: preprocessing, running inference, and retrieving the results.
## Model overview
This example uses three models: ch_PP-OCRv3_det + ch_ppocr_mobile_v2.0_cls + ch_PP-OCRv3_rec. The onnx files live in the Resource/Models/ folder. The model structure, including each model's inputs and outputs, can be inspected with netron (https://netron.app/).
## Preprocessing
Before the data is fed to the models, the image is preprocessed: it is read, resized, padded, and scaled.
This example implements the preprocessing with OpenCV:
### TextDetector preprocessing
```python
pre_process_list = {
    "DetResizeForTest": {
        "limit_side_len": config.get("limit_side_len", 736),
        "limit_type": config.get("limit_type", "min"),
    },
    "NormalizeImage": {
        "std": [0.229, 0.224, 0.225],
        "mean": [0.485, 0.456, 0.406],
        "scale": "1./255.",
        "order": "hwc",
    },
    "ToCHWImage": None,
    "KeepKeys": {"keep_keys": ["image", "shape"]},
}
self.preprocess_op = create_operators(pre_process_list)

post_process = {
    "thresh": config.get("thresh", 0.3),
    "box_thresh": config.get("box_thresh", 0.5),
    "max_candidates": config.get("max_candidates", 1000),
    "unclip_ratio": config.get("unclip_ratio", 1.6),
    "use_dilation": config.get("use_dilation", True),
    "score_mode": config.get("score_mode", "fast"),
}
```
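The semantics of `DetResizeForTest` with `limit_type: "min"` follow the PaddleOCR operator of the same name: scale the image so its shorter side is at least `limit_side_len`, then round both sides to multiples of 32. A sketch of the shape computation:
```python
def det_resize_shape(h: int, w: int, limit_side_len: int = 736):
    # Upscale only when the shorter side is below the limit.
    ratio = limit_side_len / float(min(h, w)) if min(h, w) < limit_side_len else 1.0
    # DB-style detectors expect H and W to be multiples of 32.
    resize_h = max(32, int(round(h * ratio / 32) * 32))
    resize_w = max(32, int(round(w * ratio / 32) * 32))
    return resize_h, resize_w
```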
### TextClassifier preprocessing
```python
def resize_norm_img(self, img):
    img_c, img_h, img_w = self.cls_image_shape
    h, w = img.shape[:2]
    ratio = w / float(h)
    if math.ceil(img_h * ratio) > img_w:
        resized_w = img_w
    else:
        resized_w = int(math.ceil(img_h * ratio))
    resized_image = cv2.resize(img, (resized_w, img_h))
    resized_image = resized_image.astype("float32")
    if img_c == 1:
        resized_image = resized_image / 255
        resized_image = resized_image[np.newaxis, :]
    else:
        resized_image = resized_image.transpose((2, 0, 1)) / 255
    resized_image -= 0.5
    resized_image /= 0.5
    padding_im = np.zeros((img_c, img_h, img_w), dtype=np.float32)
    padding_im[:, :, :resized_w] = resized_image
    return padding_im
```
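A quick check of the normalization above: dividing by 255, subtracting 0.5, and dividing by 0.5 maps uint8 pixels onto [-1, 1]:
```python
import numpy as np

px = np.array([0, 128, 255], dtype=np.float32)
print((px / 255 - 0.5) / 0.5)  # [-1.  0.00392157  1.]
```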
### TextRecognizer preprocessing
```python
def resize_norm_img(self, img, max_wh_ratio):
    img_channel, img_height, img_width = self.rec_image_shape
    assert img_channel == img.shape[2]
    img_width = int(img_height * max_wh_ratio)
    h, w = img.shape[:2]
    ratio = w / float(h)
    if math.ceil(img_height * ratio) > img_width:
        resized_w = img_width
    else:
        resized_w = int(math.ceil(img_height * ratio))
    resized_image = cv2.resize(img, (resized_w, img_height))
    resized_image = resized_image.astype("float32")
    resized_image = resized_image.transpose((2, 0, 1)) / 255
    resized_image -= 0.5
    resized_image /= 0.5
    padding_im = np.zeros((img_channel, img_height, img_width), dtype=np.float32)
    padding_im[:, :, 0:resized_w] = resized_image
    return padding_im
```
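The `max_wh_ratio` argument is typically the widest aspect ratio in the current batch, floored at the model's default W/H, so one padded width fits every crop. A sketch following the PaddleOCR convention (the `rec_image_shape` default is an assumption):
```python
def batch_max_wh_ratio(crops, rec_image_shape=(3, 48, 320)):
    _, img_h, img_w = rec_image_shape
    max_wh_ratio = img_w / float(img_h)
    for crop in crops:
        h, w = crop.shape[:2]
        max_wh_ratio = max(max_wh_ratio, w / float(h))
    return max_wh_ratio
```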
## Inference
### Inference has three stages:
#### Stage 1:
TextDetector uses the ch_ppocr_v3_det_infer.onnx model, a pretrained text-detection model. It detects the text regions in an image and returns the positions and bounding boxes of the detected text.
```python
class TextDetector:
    ...
    post_process = {
        "thresh": config.get("thresh", 0.3),
        "box_thresh": config.get("box_thresh", 0.5),
        "max_candidates": config.get("max_candidates", 1000),
        "unclip_ratio": config.get("unclip_ratio", 1.6),
        "use_dilation": config.get("use_dilation", True),
        "score_mode": config.get("score_mode", "fast"),
    }
    self.postprocess_op = DBPostProcess(**post_process)
    self.infer = OrtInferSession(config)
    ...
```
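DBPostProcess turns the detector's probability map into boxes roughly as follows: binarize at `thresh`, extract contours, score each candidate region, and keep those above `box_thresh`. A simplified sketch (the real implementation also unclips boxes by `unclip_ratio` and can dilate the binary map):
```python
import cv2
import numpy as np

def simple_db_postprocess(prob_map, thresh=0.3, box_thresh=0.5):
    # prob_map: H x W probability map produced by the detection model
    binary = (prob_map > thresh).astype(np.uint8)
    contours, _ = cv2.findContours(binary, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    boxes = []
    for cnt in contours:
        mask = np.zeros_like(binary)
        cv2.drawContours(mask, [cnt], -1, 1, -1)       # fill the candidate region
        if prob_map[mask == 1].mean() < box_thresh:    # score the candidate
            continue
        boxes.append(cv2.boxPoints(cv2.minAreaRect(cnt)))  # 4-point box
    return boxes
```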
#### Stage 2:
TextClassifier uses the ch_ppocr_v2_cls_infer.onnx model, a pretrained classifier for text-orientation classification. It predicts which orientation class a text line belongs to, along with the class probability.
```python
class TextClassifier:
    ...
    def __init__(self, config):
        self.cls_image_shape = config["cls_image_shape"]
        self.cls_batch_num = config["cls_batch_num"]
        self.cls_thresh = config["cls_thresh"]
        self.postprocess_op = ClsPostProcess(config["label_list"])
        self.infer = OrtInferSession(config)
    ...
```
#### Stage 3:
TextRecognizer uses ch_ppocr_v3_rec_infer.onnx, a pretrained text-recognition model. It takes the image region of one text box as input and returns the recognized text for that region.
```python
class TextRecognizer:
    ...
    def __init__(self, config):
        # The recognizer's __init__ mirrors the classifier's: it reads its
        # shape/batch settings, builds a CTC decoding post-process op, and
        # opens an ONNXRuntime session. The config key names here are
        # assumptions for illustration.
        self.rec_image_shape = config["rec_img_shape"]
        self.rec_batch_num = config["rec_batch_num"]
        self.postprocess_op = CTCLabelDecode(config["keys_path"])
        self.infer = OrtInferSession(config)
    ...
```
# import rapidocr_onnxruntime
from rapidocr_onnxruntime import RapidOCR

if __name__ == "__main__":
    rapid_ocr = RapidOCR()
    # set the image path
    image_path = "../Resource/Images/1.jpg"
    with open(image_path, "rb") as f:
        img = f.read()
    # run text recognition on the image
    result, elapse_list = rapid_ocr(img)
    print(result)
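    # `result` is typically a list of [box, text, score] entries; the exact
    # shape here is an assumption based on rapidocr_onnxruntime's README.
    if result:
        for box, text, score in result:
            print(f"{text} ({score}) at {box}")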
from .rapid_ocr_api import RapidOCR
from .utils import LoadImageError
# Model parameters
cls_image_shape: [3, 48, 192]
cls_batch_num: 6
cls_thresh: 0.9
label_list: ['0', '180']
import argparse
import copy
import math
import time
from typing import List
import cv2
import numpy as np
from rapidocr_onnxruntime.utils import OrtInferSession, read_yaml
from .utils import ClsPostProcess
class TextClassifier:
    def __init__(self, config):
        self.cls_image_shape = config["cls_image_shape"]
        self.cls_batch_num = config["cls_batch_num"]
        self.cls_thresh = config["cls_thresh"]
        self.postprocess_op = ClsPostProcess(config["label_list"])
        self.infer = OrtInferSession(config)

    def __call__(self, img_list: List[np.ndarray]):
        if isinstance(img_list, np.ndarray):
            img_list = [img_list]
        img_list = copy.deepcopy(img_list)

        # Calculate the aspect ratio of all text bars
        width_list = [img.shape[1] / float(img.shape[0]) for img in img_list]

        # Sorting can speed up the cls process
        indices = np.argsort(np.array(width_list))

        img_num = len(img_list)
        cls_res = [["", 0.0]] * img_num
        batch_num = self.cls_batch_num
        elapse = 0
        for beg_img_no in range(0, img_num, batch_num):
            end_img_no = min(img_num, beg_img_no + batch_num)

            norm_img_batch = []
            for ino in range(beg_img_no, end_img_no):
                norm_img = self.resize_norm_img(img_list[indices[ino]])
                norm_img = norm_img[np.newaxis, :]
                norm_img_batch.append(norm_img)
            norm_img_batch = np.concatenate(norm_img_batch).astype(np.float32)

            starttime = time.time()
            prob_out = self.infer(norm_img_batch)[0]
            cls_result = self.postprocess_op(prob_out)
            elapse += time.time() - starttime

            for rno in range(len(cls_result)):
                label, score = cls_result[rno]
                cls_res[indices[beg_img_no + rno]] = [label, score]
                if "180" in label and score > self.cls_thresh:
                    # 1 == cv2.ROTATE_180: flip text lines detected upside down
                    img_list[indices[beg_img_no + rno]] = cv2.rotate(
                        img_list[indices[beg_img_no + rno]], 1
                    )
        return img_list, cls_res, elapse

    def resize_norm_img(self, img):
        img_c, img_h, img_w = self.cls_image_shape
        h, w = img.shape[:2]
        ratio = w / float(h)
        if math.ceil(img_h * ratio) > img_w:
            resized_w = img_w
        else:
            resized_w = int(math.ceil(img_h * ratio))
        resized_image = cv2.resize(img, (resized_w, img_h))
        resized_image = resized_image.astype("float32")
        if img_c == 1:
            resized_image = resized_image / 255
            resized_image = resized_image[np.newaxis, :]
        else:
            resized_image = resized_image.transpose((2, 0, 1)) / 255
        resized_image -= 0.5
        resized_image /= 0.5
        padding_im = np.zeros((img_c, img_h, img_w), dtype=np.float32)
        padding_im[:, :, :resized_w] = resized_image
        return padding_im
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--image_path", type=str, help="image_dir|image_path")
    parser.add_argument("--config_path", type=str, default="config.yaml")
    args = parser.parse_args()

    config = read_yaml(args.config_path)
    text_classifier = TextClassifier(config)

    img = cv2.imread(args.image_path)
    img_list, cls_res, predict_time = text_classifier(img)
    for ino in range(len(img_list)):
        print(f"cls result:{cls_res[ino]}")
class ClsPostProcess:
    """Convert between text-label and text-index"""

    def __init__(self, label_list):
        super(ClsPostProcess, self).__init__()
        self.label_list = label_list

    def __call__(self, preds, label=None):
        pred_idxs = preds.argmax(axis=1)
        decode_out = [
            (self.label_list[idx], preds[i, idx]) for i, idx in enumerate(pred_idxs)
        ]
        if label is None:
            return decode_out
        label = [(self.label_list[idx], 1.0) for idx in label]
        return decode_out, label
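# A quick usage sketch for ClsPostProcess with the classifier's actual
# label_list (the probability values are illustrative):
if __name__ == "__main__":
    import numpy as np
    post = ClsPostProcess(["0", "180"])
    preds = np.array([[0.95, 0.05], [0.2, 0.8]], dtype=np.float32)
    print(post(preds))  # [('0', 0.95), ('180', 0.8)]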
# Pre-processing parameters
pre_process:
  DetResizeForTest:
    limit_side_len: 736
    limit_type: min
  NormalizeImage:
    std: [0.229, 0.224, 0.225]
    mean: [0.485, 0.456, 0.406]
    scale: 1./255.
    order: hwc
  ToCHWImage:
  KeepKeys:
    keep_keys: ['image', 'shape']
# Post-processing parameters
post_process:
  thresh: 0.3
  box_thresh: 0.5
  max_candidates: 1000
  unclip_ratio: 1.6
  use_dilation: true
  score_mode: "fast"
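# A usage sketch for the config above, using the read_yaml helper that this
# repo's scripts already use (key nesting follows the YAML as written):
#   from rapidocr_onnxruntime.utils import read_yaml
#   config = read_yaml("config.yaml")
#   config["pre_process"]["DetResizeForTest"]["limit_side_len"]  # 736
#   config["post_process"]["box_thresh"]                         # 0.5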
import argparse
import time
import cv2
import numpy as np
from rapidocr_onnxruntime.utils import OrtInferSession, read_yaml
from .utils import DBPostProcess, create_operators, transform
class TextDetector:
    def __init__(self, config):
        pre_process_list = {
            "DetResizeForTest": {
                "limit_side_len": config.get("limit_side_len", 736),
                "limit_type": config.get("limit_type", "min"),
            },
            "NormalizeImage": {
                "std": [0.229, 0.224, 0.225],
                "mean": [0.485, 0.456, 0.406],
                "scale": "1./255.",
                "order": "hwc",
            },
            "ToCHWImage": None,
            "KeepKeys": {"keep_keys": ["image", "shape"]},
        }
        self.preprocess_op = create_operators(pre_process_list)

        post_process = {
            "thresh": config.get("thresh", 0.3),
            "box_thresh": config.get("box_thresh", 0.5),
            "max_candidates": config.get("max_candidates", 1000),
            "unclip_ratio": config.get("unclip_ratio", 1.6),
            "use_dilation": config.get("use_dilation", True),
            "score_mode": config.get("score_mode", "fast"),
        }
        self.postprocess_op = DBPostProcess(**post_process)
        self.infer = OrtInferSession(config)

    def __call__(self, img):
        if img is None:
            raise ValueError("img is None")

        ori_im_shape = img.shape[:2]

        data = {"image": img}
        data = transform(data, self.preprocess_op)
        img, shape_list = data
        if img is None:
            return None, 0

        img = np.expand_dims(img, axis=0).astype(np.float32)
        shape_list = np.expand_dims(shape_list, axis=0)

        starttime = time.time()
        preds = self.infer(img)[0]
        post_result = self.postprocess_op(preds, shape_list)

        dt_boxes = post_result[0]["points"]
        dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im_shape)
        elapse = time.time() - starttime
        return dt_boxes, elapse

    def order_points_clockwise(self, pts):
        """
        reference from:
        https://github.com/jrosebr1/imutils/blob/master/imutils/perspective.py
        sort the points based on their x-coordinates
        """
        xSorted = pts[np.argsort(pts[:, 0]), :]

        # grab the left-most and right-most points from the sorted
        # x-coordinate points
        leftMost = xSorted[:2, :]
        rightMost = xSorted[2:, :]

        # now, sort the left-most coordinates according to their
        # y-coordinates so we can grab the top-left and bottom-left
        # points, respectively
        leftMost = leftMost[np.argsort(leftMost[:, 1]), :]
        (tl, bl) = leftMost

        rightMost = rightMost[np.argsort(rightMost[:, 1]), :]
        (tr, br) = rightMost

        rect = np.array([tl, tr, br, bl], dtype="float32")
        return rect

    def clip_det_res(self, points, img_height, img_width):
        for pno in range(points.shape[0]):
            points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1))
            points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1))
        return points

    def filter_tag_det_res(self, dt_boxes, image_shape):
        img_height, img_width = image_shape[:2]
        dt_boxes_new = []
        for box in dt_boxes:
            box = self.order_points_clockwise(box)
            box = self.clip_det_res(box, img_height, img_width)
            rect_width = int(np.linalg.norm(box[0] - box[1]))
            rect_height = int(np.linalg.norm(box[0] - box[3]))
            if rect_width <= 3 or rect_height <= 3:
                continue
            dt_boxes_new.append(box)
        dt_boxes = np.array(dt_boxes_new)
        return dt_boxes
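# A standalone sanity check for order_points_clockwise: corners given in
# arbitrary order come back as top-left, top-right, bottom-right, bottom-left.
def _check_order_points():
    pts = np.array([[10, 80], [10, 10], [90, 80], [90, 10]], dtype="float32")
    rect = TextDetector.order_points_clockwise(None, pts)  # self is unused here
    assert (rect == np.array([[10, 10], [90, 10], [90, 80], [10, 80]])).all()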
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_path", type=str, default="config.yaml")
    parser.add_argument("--image_path", type=str, default=None)
    args = parser.parse_args()

    config = read_yaml(args.config_path)
    text_detector = TextDetector(config)

    img = cv2.imread(args.image_path)
    dt_boxes, elapse = text_detector(img)

    from utils import draw_text_det_res

    src_im = draw_text_det_res(dt_boxes, args.image_path)
    cv2.imwrite("det_results.jpg", src_im)
    print("The det_results.jpg has been saved in the current directory.")