Commit 4c6b03ad authored by Leif

Merge remote-tracking branch 'origin/release/2.5' into release2.5

parents edad6e7c 77331549
@@ -27,12 +27,12 @@ class CosineEmbeddingLoss(nn.Layer):
         self.epsilon = 1e-12

     def forward(self, x1, x2, target):
-        similarity = paddle.fluid.layers.reduce_sum(
+        similarity = paddle.sum(
             x1 * x2, dim=-1) / (paddle.norm(
                 x1, axis=-1) * paddle.norm(
                     x2, axis=-1) + self.epsilon)
         one_list = paddle.full_like(target, fill_value=1)
-        out = paddle.fluid.layers.reduce_mean(
+        out = paddle.mean(
             paddle.where(
                 paddle.equal(target, one_list), 1. - similarity,
                 paddle.maximum(
......
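For reference, a minimal sketch of the migrated loss under the paddle 2.x API. Note that `paddle.sum` takes `axis` rather than the legacy `dim` keyword retained in the context line above, and the negative-pair branch is truncated in the hunk, so the `margin` handling below follows the standard cosine-embedding formulation rather than the exact file contents:

```python
import paddle
from paddle import nn

class CosineEmbeddingLoss(nn.Layer):
    def __init__(self, margin=0.):
        super().__init__()
        self.margin = margin
        self.epsilon = 1e-12

    def forward(self, x1, x2, target):
        # Cosine similarity along the last axis; paddle.sum uses `axis`,
        # not the legacy `dim` keyword shown in the hunk above.
        similarity = paddle.sum(
            x1 * x2, axis=-1) / (paddle.norm(
                x1, axis=-1) * paddle.norm(x2, axis=-1) + self.epsilon)
        one_list = paddle.full_like(target, fill_value=1)
        # Positive pairs are pulled together (1 - cos); negative pairs are
        # pushed below the margin (standard formulation, elided in the diff).
        return paddle.mean(
            paddle.where(
                paddle.equal(target, one_list), 1. - similarity,
                paddle.maximum(
                    paddle.zeros_like(similarity), similarity - self.margin)))
```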
@@ -19,7 +19,6 @@ from __future__ import print_function

 import paddle
 from paddle import nn
 from paddle.nn import functional as F
-from paddle import fluid

 class TableAttentionLoss(nn.Layer):
     def __init__(self, structure_weight, loc_weight, use_giou=False, giou_weight=1.0, **kwargs):
@@ -36,13 +35,13 @@ class TableAttentionLoss(nn.Layer):
         :param bbox:[[x1,y1,x2,y2], [x1,y1,x2,y2],,,]
         :return: loss
         '''
-        ix1 = fluid.layers.elementwise_max(preds[:, 0], bbox[:, 0])
-        iy1 = fluid.layers.elementwise_max(preds[:, 1], bbox[:, 1])
-        ix2 = fluid.layers.elementwise_min(preds[:, 2], bbox[:, 2])
-        iy2 = fluid.layers.elementwise_min(preds[:, 3], bbox[:, 3])
+        ix1 = paddle.maximum(preds[:, 0], bbox[:, 0])
+        iy1 = paddle.maximum(preds[:, 1], bbox[:, 1])
+        ix2 = paddle.minimum(preds[:, 2], bbox[:, 2])
+        iy2 = paddle.minimum(preds[:, 3], bbox[:, 3])

-        iw = fluid.layers.clip(ix2 - ix1 + 1e-3, 0., 1e10)
-        ih = fluid.layers.clip(iy2 - iy1 + 1e-3, 0., 1e10)
+        iw = paddle.clip(ix2 - ix1 + 1e-3, 0., 1e10)
+        ih = paddle.clip(iy2 - iy1 + 1e-3, 0., 1e10)

         # overlap
         inters = iw * ih
@@ -55,12 +54,12 @@ class TableAttentionLoss(nn.Layer):
         # ious
         ious = inters / uni

-        ex1 = fluid.layers.elementwise_min(preds[:, 0], bbox[:, 0])
-        ey1 = fluid.layers.elementwise_min(preds[:, 1], bbox[:, 1])
-        ex2 = fluid.layers.elementwise_max(preds[:, 2], bbox[:, 2])
-        ey2 = fluid.layers.elementwise_max(preds[:, 3], bbox[:, 3])
+        ex1 = paddle.minimum(preds[:, 0], bbox[:, 0])
+        ey1 = paddle.minimum(preds[:, 1], bbox[:, 1])
+        ex2 = paddle.maximum(preds[:, 2], bbox[:, 2])
+        ey2 = paddle.maximum(preds[:, 3], bbox[:, 3])
-        ew = fluid.layers.clip(ex2 - ex1 + 1e-3, 0., 1e10)
-        eh = fluid.layers.clip(ey2 - ey1 + 1e-3, 0., 1e10)
+        ew = paddle.clip(ex2 - ex1 + 1e-3, 0., 1e10)
+        eh = paddle.clip(ey2 - ey1 + 1e-3, 0., 1e10)

         # enclose area
         enclose = ew * eh + eps
......
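These two hunks migrate the GIoU term operator by operator (`fluid.layers.elementwise_max` → `paddle.maximum`, `fluid.layers.clip` → `paddle.clip`). A runnable sketch of the whole computation under the new API; the union and the final reduction fall outside the shown hunks, so they are filled in here from the standard GIoU definition:

```python
import paddle

def giou_loss(preds, bbox, eps=1e-10):
    # Intersection rectangle of predicted and ground-truth boxes.
    ix1 = paddle.maximum(preds[:, 0], bbox[:, 0])
    iy1 = paddle.maximum(preds[:, 1], bbox[:, 1])
    ix2 = paddle.minimum(preds[:, 2], bbox[:, 2])
    iy2 = paddle.minimum(preds[:, 3], bbox[:, 3])
    iw = paddle.clip(ix2 - ix1 + 1e-3, 0., 1e10)
    ih = paddle.clip(iy2 - iy1 + 1e-3, 0., 1e10)
    inters = iw * ih  # overlap area

    # Union (this part is elided context in the diff above).
    area_p = (preds[:, 2] - preds[:, 0] + 1e-3) * (preds[:, 3] - preds[:, 1] + 1e-3)
    area_g = (bbox[:, 2] - bbox[:, 0] + 1e-3) * (bbox[:, 3] - bbox[:, 1] + 1e-3)
    uni = area_p + area_g - inters + eps
    ious = inters / uni

    # Smallest enclosing box.
    ex1 = paddle.minimum(preds[:, 0], bbox[:, 0])
    ey1 = paddle.minimum(preds[:, 1], bbox[:, 1])
    ex2 = paddle.maximum(preds[:, 2], bbox[:, 2])
    ey2 = paddle.maximum(preds[:, 3], bbox[:, 3])
    ew = paddle.clip(ex2 - ex1 + 1e-3, 0., 1e10)
    eh = paddle.clip(ey2 - ey1 + 1e-3, 0., 1e10)
    enclose = ew * eh + eps

    giou = ious - (enclose - uni) / enclose
    return paddle.mean(1.0 - giou)
```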
@@ -175,12 +175,7 @@ class Kie_backbone(nn.Layer):
             img, relations, texts, gt_bboxes, tag, img_size)
         x = self.img_feat(img)
         boxes, rois_num = self.bbox2roi(gt_bboxes)
-        feats = paddle.fluid.layers.roi_align(
-            x,
-            boxes,
-            spatial_scale=1.0,
-            pooled_height=7,
-            pooled_width=7,
-            rois_num=rois_num)
+        feats = paddle.vision.ops.roi_align(
+            x, boxes, spatial_scale=1.0, output_size=7, boxes_num=rois_num)
         feats = self.maxpool(feats).squeeze(-1).squeeze(-1)
         return [relations, texts, feats]
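`paddle.vision.ops.roi_align` folds the old `pooled_height`/`pooled_width` pair into a single `output_size` and renames `rois_num` to `boxes_num`. A minimal usage sketch with illustrative shapes:

```python
import paddle
from paddle.vision.ops import roi_align

# Feature map: N=1, C=256, H=32, W=32 (illustrative shapes).
x = paddle.rand([1, 256, 32, 32])
# Two boxes in (x1, y1, x2, y2) on the feature-map scale.
boxes = paddle.to_tensor([[4., 4., 20., 20.], [8., 8., 28., 24.]])
# One entry per batch image: how many boxes belong to it.
boxes_num = paddle.to_tensor([2], dtype='int32')

# output_size=7 replaces the old pooled_height=7 / pooled_width=7 pair.
feats = roi_align(x, boxes, boxes_num=boxes_num, output_size=7, spatial_scale=1.0)
print(feats.shape)  # [2, 256, 7, 7]
```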
@@ -18,7 +18,6 @@ from __future__ import print_function

 from paddle import nn, ParamAttr
 from paddle.nn import functional as F
-import paddle.fluid as fluid
 import paddle
 import numpy as np
......
@@ -20,13 +20,11 @@ import math

 import paddle
 from paddle import nn, ParamAttr
 from paddle.nn import functional as F
-import paddle.fluid as fluid
 import numpy as np
 from .self_attention import WrapEncoderForFeature
 from .self_attention import WrapEncoder
 from paddle.static import Program
 from ppocr.modeling.backbones.rec_resnet_fpn import ResNetFPN
-import paddle.fluid.framework as framework
 from collections import OrderedDict

 gradient_clip = 10
......
@@ -22,7 +22,6 @@ import paddle

 from paddle import ParamAttr, nn
 from paddle import nn, ParamAttr
 from paddle.nn import functional as F
-import paddle.fluid as fluid
 import numpy as np

 gradient_clip = 10
@@ -288,10 +287,10 @@ class PrePostProcessLayer(nn.Layer):
                         "layer_norm_%d" % len(self.sublayers()),
                         paddle.nn.LayerNorm(
                             normalized_shape=d_model,
-                            weight_attr=fluid.ParamAttr(
-                                initializer=fluid.initializer.Constant(1.)),
-                            bias_attr=fluid.ParamAttr(
-                                initializer=fluid.initializer.Constant(0.)))))
+                            weight_attr=paddle.ParamAttr(
+                                initializer=paddle.nn.initializer.Constant(1.)),
+                            bias_attr=paddle.ParamAttr(
+                                initializer=paddle.nn.initializer.Constant(0.)))))
             elif cmd == "d":  # add dropout
                 self.functors.append(lambda x: F.dropout(
                     x, p=dropout_rate, mode="downscale_in_infer")
@@ -324,7 +323,7 @@ class PrepareEncoder(nn.Layer):

     def forward(self, src_word, src_pos):
         src_word_emb = src_word
-        src_word_emb = fluid.layers.cast(src_word_emb, 'float32')
+        src_word_emb = paddle.cast(src_word_emb, 'float32')
         src_word_emb = paddle.scale(x=src_word_emb, scale=self.src_emb_dim**0.5)
         src_pos = paddle.squeeze(src_pos, axis=-1)
         src_pos_enc = self.emb(src_pos)
@@ -367,7 +366,7 @@ class PrepareDecoder(nn.Layer):
         self.dropout_rate = dropout_rate

     def forward(self, src_word, src_pos):
-        src_word = fluid.layers.cast(src_word, 'int64')
+        src_word = paddle.cast(src_word, 'int64')
         src_word = paddle.squeeze(src_word, axis=-1)
         src_word_emb = self.emb0(src_word)
         src_word_emb = paddle.scale(x=src_word_emb, scale=self.src_emb_dim**0.5)
......
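The self-attention hunks are a mechanical fluid-to-paddle-2.x rename: `fluid.ParamAttr` → `paddle.ParamAttr`, `fluid.initializer.Constant` → `paddle.nn.initializer.Constant`, `fluid.layers.cast` → `paddle.cast`. A self-contained sketch of the migrated `LayerNorm` construction:

```python
import paddle
from paddle import nn

# LayerNorm with explicitly initialized scale and bias, as in the migrated code.
d_model = 512
layer_norm = nn.LayerNorm(
    normalized_shape=d_model,
    weight_attr=paddle.ParamAttr(
        initializer=paddle.nn.initializer.Constant(1.)),  # gamma = 1
    bias_attr=paddle.ParamAttr(
        initializer=paddle.nn.initializer.Constant(0.)))  # beta = 0

x = paddle.rand([2, 10, d_model])
y = layer_norm(paddle.cast(x, 'float32'))  # paddle.cast replaces fluid.layers.cast
print(y.shape)  # [2, 10, 512]
```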
@@ -38,6 +38,7 @@ class DBPostProcess(object):
                  unclip_ratio=2.0,
                  use_dilation=False,
                  score_mode="fast",
+                 visual_output=False,
                  **kwargs):
         self.thresh = thresh
         self.box_thresh = box_thresh
@@ -51,6 +52,7 @@ class DBPostProcess(object):
         self.dilation_kernel = None if not use_dilation else np.array(
             [[1, 1], [1, 1]])
+        self.visual = visual_output

     def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
         '''
@@ -169,12 +171,19 @@ class DBPostProcess(object):
         cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype(np.int32), 1)
         return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]

+    def visual_output(self, pred):
+        im = np.array(pred[0] * 255).astype(np.uint8)
+        cv2.imwrite("db_probability_map.png", im)
+        print("The probability map is visualized in db_probability_map.png")
+
     def __call__(self, outs_dict, shape_list):
         pred = outs_dict['maps']
         if isinstance(pred, paddle.Tensor):
             pred = pred.numpy()
         pred = pred[:, 0, :, :]
         segmentation = pred > self.thresh
+        if self.visual:
+            self.visual_output(pred)

         boxes_batch = []
         for batch_index in range(pred.shape[0]):
......
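The new `visual_output` switch (exposed on the command line as `--vis_seg_map` later in this commit) dumps the DB probability map for debugging. A standalone sketch of what the hook writes, using a random map in place of real model output:

```python
import numpy as np
import cv2

# Stand-in for outs_dict['maps'][:, 0, :, :] after .numpy(): one probability
# map per image, values in [0, 1].
pred = np.random.rand(1, 640, 640).astype(np.float32)

# Same scaling as DBPostProcess.visual_output: map [0, 1] to 8-bit grayscale.
im = np.array(pred[0] * 255).astype(np.uint8)
cv2.imwrite("db_probability_map.png", im)
```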
@@ -177,9 +177,9 @@ def save_model(model,
             model.backbone.model.save_pretrained(model_prefix)
         metric_prefix = os.path.join(model_prefix, 'metric')
     # save metric and config
-    with open(metric_prefix + '.states', 'wb') as f:
-        pickle.dump(kwargs, f, protocol=2)
     if is_best:
+        with open(metric_prefix + '.states', 'wb') as f:
+            pickle.dump(kwargs, f, protocol=2)
         logger.info('save best model is to {}'.format(model_prefix))
     else:
         logger.info("save model in {}".format(model_prefix))
@@ -19,6 +19,24 @@ SDMGR is a key information extraction algorithm that classifies each detected text region

 wget https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/wildreceipt.tar && tar xf wildreceipt.tar
 ```
+
+Dataset format:
+```
+./wildreceipt
+├── class_list.txt         # Text categories inside each box, e.g. amount, time, date.
+├── dict.txt               # Dictionary file for recognition: the list of characters contained in the dataset.
+├── wildreceipt_train.txt  # Training label file.
+├── wildreceipt_test.txt   # Evaluation label file.
+└── image_files/           # Image folder.
+```
+
+The format of each line in the label file is:
+```
+" image file name    image annotation information encoded by json.dumps"
+image_files/Image_16/11/d5de7f2a20751e50b84c747c17a24cd98bed3554.jpeg [{"label": 1, "transcription": "SAFEWAY", "points": [[550.0, 190.0], [937.0, 190.0], [937.0, 104.0], [550.0, 104.0]]}, {"label": 25, "transcription": "TM", "points": [[1048.0, 211.0], [1074.0, 211.0], [1074.0, 196.0], [1048.0, 196.0]]}, {"label": 25, "transcription": "ATOREMGRTOMMILAZZO", "points": [[535.0, 239.0], [833.0, 239.0], [833.0, 200.0], [535.0, 200.0]]}, {"label": 5, "transcription": "703-777-5833", "points": [[907.0, 256.0], [1081.0, 256.0], [1081.0, 223.0], [907.0, 223.0]]}......
+```
+
+**Note: if you want to train on your own dataset, it is recommended to prepare it in the format described above.**
+
 Run prediction:
 ```
......
@@ -18,6 +18,22 @@ This section provides a tutorial example on how to quickly use, train, and evaluate

 wget https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/wildreceipt.tar && tar xf wildreceipt.tar
 ```
+
+The dataset format is as follows:
+```
+./wildreceipt
+├── class_list.txt         # The text category inside each box, such as amount, time, date, etc.
+├── dict.txt               # The dictionary file for recognition, listing the characters contained in the dataset.
+├── wildreceipt_train.txt  # Training data label file.
+├── wildreceipt_test.txt   # Testing data label file.
+└── image_files/           # Image folder.
+```
+
+The format of each line in the label file is:
+```
+" The image file path    Image annotation information encoded by json.dumps"
+image_files/Image_16/11/d5de7f2a20751e50b84c747c17a24cd98bed3554.jpeg [{"label": 1, "transcription": "SAFEWAY", "points": [[550.0, 190.0], [937.0, 190.0], [937.0, 104.0], [550.0, 104.0]]}, {"label": 25, "transcription": "TM", "points": [[1048.0, 211.0], [1074.0, 211.0], [1074.0, 196.0], [1048.0, 196.0]]}, {"label": 25, "transcription": "ATOREMGRTOMMILAZZO", "points": [[535.0, 239.0], [833.0, 239.0], [833.0, 200.0], [535.0, 200.0]]}, {"label": 5, "transcription": "703-777-5833", "points": [[907.0, 256.0], [1081.0, 256.0], [1081.0, 223.0], [907.0, 223.0]]}......
+```
+
 Download the pretrained model and predict the result:
 ```shell
......
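Each line of the label file pairs an image path with a `json.dumps`-encoded list of boxes. A minimal parsing sketch, assuming the two fields are tab-separated as in other PaddleOCR label files:

```python
import json

# Minimal sketch for reading a wildreceipt label file. Assumes the image
# path and the json.dumps-encoded annotations are separated by a tab.
with open("wildreceipt/wildreceipt_train.txt", "r", encoding="utf-8") as f:
    for line in f:
        image_path, annotation = line.rstrip("\n").split("\t", 1)
        boxes = json.loads(annotation)
        for box in boxes:
            label = box["label"]          # category id from class_list.txt
            text = box["transcription"]   # text inside the box
            points = box["points"]        # four [x, y] corner points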
@@ -192,7 +192,7 @@ Finally, `precision`, `recall`, `hmean` and other indicators will be printed

 Use the following command to complete the series prediction of `OCR engine + SER`, taking the pretrained SER model as an example:

 ```shell
-CUDA_VISIBLE_DEVICES=0 python3 tools/infer_vqa_token_ser.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/Global.infer_img=doc/vqa/input/zh_val_42.jpg
+CUDA_VISIBLE_DEVICES=0 python3 tools/infer_vqa_token_ser.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/ Global.infer_img=doc/vqa/input/zh_val_42.jpg
 ```

 Finally, the prediction result visualization image and the prediction result text file will be saved in the directory configured by the `config.Global.save_res_path` field. The prediction result text file is named `infer_results.txt`.

@@ -203,7 +203,7 @@ First use the `tools/infer_vqa_token_ser.py` script to complete the prediction of

 ```shell
 export CUDA_VISIBLE_DEVICES=0
-python3 tools/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_normalize_val.json --pred_json_path output_res/infer_results.txt
+python3 ppstructure/vqa/tools/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_normalize_val.json --pred_json_path output_res/infer_results.txt
 ```

 <a name="53"></a>
@@ -247,7 +247,7 @@ Finally, `precision`, `recall`, `hmean` and other indicators will be printed

 Use the following command to complete the series prediction of `OCR engine + SER + RE`, taking the pretrained SER and RE models as an example:
 ```shell
 export CUDA_VISIBLE_DEVICES=0
-python3 tools/infer_vqa_token_ser_re.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/re_LayoutXLM_xfun_zh/Global.infer_img=doc/vqa/input/zh_val_21.jpg -c_ser configs/vqa/ser/layoutxlm.yml -o_ser Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/
+python3 tools/infer_vqa_token_ser_re.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/re_LayoutXLM_xfun_zh/ Global.infer_img=doc/vqa/input/zh_val_21.jpg -c_ser configs/vqa/ser/layoutxlm.yml -o_ser Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/
 ```

 Finally, the prediction result visualization image and the prediction result text file will be saved in the directory configured by the `config.Global.save_res_path` field. The prediction result text file is named `infer_results.txt`.
......
@@ -198,7 +198,7 @@ CUDA_VISIBLE_DEVICES=0 python3 tools/infer_vqa_token_ser.py -c configs/vqa/ser/l

 ```shell
 export CUDA_VISIBLE_DEVICES=0
-python3 tools/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_normalize_val.json --pred_json_path output_res/infer_results.txt
+python3 ppstructure/vqa/tools/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_normalize_val.json --pred_json_path output_res/infer_results.txt
 ```

 ### 5.3 RE
......
@@ -329,6 +329,7 @@ else
             set_save_model=$(func_set_params "${save_model_key}" "${save_log}")
             if [ ${#gpu} -le 2 ];then  # train with cpu or single gpu
+                eval ${env}
                 cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config} "
             elif [ ${#ips} -le 26 ];then  # train with multi-gpu
                 cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}"
......
@@ -76,7 +76,7 @@ def export_single_model(model, arch_config, save_path, logger, quanter=None):
     else:
         infer_shape = [3, -1, -1]
         if arch_config["model_type"] == "rec":
-            infer_shape = [3, 32, -1]  # for rec model, H must be 32
+            infer_shape = [3, 48, -1]  # for rec model, H must be 48
     if "Transform" in arch_config and arch_config[
             "Transform"] is not None and arch_config["Transform"][
                 "name"] == "TPS":
......
@@ -24,6 +24,7 @@ import cv2
 import numpy as np
 import time
 import sys
+from scipy.spatial import distance as dist

 import tools.infer.utility as utility
 from ppocr.utils.logging import get_logger

@@ -154,9 +155,10 @@ class TextDetector(object):
         s = pts.sum(axis=1)
         rect[0] = pts[np.argmin(s)]
         rect[2] = pts[np.argmax(s)]
-        diff = np.diff(pts, axis=1)
-        rect[1] = pts[np.argmin(diff)]
-        rect[3] = pts[np.argmax(diff)]
+        tmp = np.delete(pts, (np.argmin(s), np.argmax(s)), axis=0)
+        diff = np.diff(np.array(tmp), axis=1)
+        rect[1] = tmp[np.argmin(diff)]
+        rect[3] = tmp[np.argmax(diff)]
         return rect

     def clip_det_res(self, points, img_height, img_width):
......
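The `order_points` fix above matters when one point is an extreme for both `x + y` and `y - x`: the old code could then assign the same point to two corners. Deleting the already-chosen top-left and bottom-right points first guarantees four distinct corners. A standalone sketch of the fixed logic:

```python
import numpy as np

def order_points(pts):
    # Order 4 points as top-left, top-right, bottom-right, bottom-left.
    rect = np.zeros((4, 2), dtype="float32")
    s = pts.sum(axis=1)
    rect[0] = pts[np.argmin(s)]  # top-left: smallest x + y
    rect[2] = pts[np.argmax(s)]  # bottom-right: largest x + y
    # Remove the two points already used so the remaining argmin/argmax
    # cannot select the same point twice (the bug the patch fixes).
    tmp = np.delete(pts, (np.argmin(s), np.argmax(s)), axis=0)
    diff = np.diff(np.array(tmp), axis=1)  # y - x for the two leftovers
    rect[1] = tmp[np.argmin(diff)]  # top-right: smallest y - x
    rect[3] = tmp[np.argmax(diff)]  # bottom-left: largest y - x
    return rect

pts = np.array([[10., 10.], [100., 12.], [98., 60.], [8., 58.]], dtype="float32")
print(order_points(pts))  # corners in TL, TR, BR, BL order
```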
@@ -114,11 +114,14 @@ def sorted_boxes(dt_boxes):
     _boxes = list(sorted_boxes)

     for i in range(num_boxes - 1):
-        if abs(_boxes[i + 1][0][1] - _boxes[i][0][1]) < 10 and \
-                (_boxes[i + 1][0][0] < _boxes[i][0][0]):
-            tmp = _boxes[i]
-            _boxes[i] = _boxes[i + 1]
-            _boxes[i + 1] = tmp
+        for j in range(i, 0, -1):
+            if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and \
+                    (_boxes[j + 1][0][0] < _boxes[j][0][0]):
+                tmp = _boxes[j]
+                _boxes[j] = _boxes[j + 1]
+                _boxes[j + 1] = tmp
+            else:
+                break
     return _boxes
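The old single bubble pass could leave same-line boxes out of order; the nested loop turns it into an insertion sort that keeps shifting a box left while it belongs before its neighbour. A standalone sketch; note that `range(i, 0, -1)` as merged never reaches `j == 0`, so the first two boxes are never compared, while `range(i, -1, -1)` covers that case as well:

```python
import numpy as np

def sorted_boxes(dt_boxes):
    """Sort text boxes top-to-bottom, then left-to-right within a line."""
    num_boxes = dt_boxes.shape[0]
    # Primary sort by top-left y, then x.
    _boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0]))
    for i in range(num_boxes - 1):
        # Boxes whose top-left y differs by < 10 px count as the same line
        # and are reordered by x; stop as soon as the box is in place.
        for j in range(i, -1, -1):
            if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and \
                    _boxes[j + 1][0][0] < _boxes[j][0][0]:
                _boxes[j], _boxes[j + 1] = _boxes[j + 1], _boxes[j]
            else:
                break
    return _boxes

boxes = np.array([[[0, 0]] * 4, [[60, 2]] * 4, [[30, 1]] * 4], dtype=np.float32)
print([b[0].tolist() for b in sorted_boxes(boxes)])  # [[0,0], [30,1], [60,2]]
```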
@@ -135,7 +138,7 @@ def main(args):
     logger.info("In PP-OCRv3, rec_image_shape parameter defaults to '3, 48, 320', "
                 "if you are using recognition model with PP-OCRv2 or an older version, please set --rec_image_shape='3,32,320'")
     # warm up 10 times
     if args.warmup:
         img = np.random.uniform(0, 255, [640, 640, 3]).astype(np.uint8)

@@ -198,7 +201,12 @@ def main(args):
         text_sys.text_detector.autolog.report()
         text_sys.text_recognizer.autolog.report()

-    with open(os.path.join(draw_img_save_dir, "system_results.txt"), 'w', encoding='utf-8') as f:
+    if args.total_process_num > 1:
+        save_results_path = os.path.join(draw_img_save_dir, f"system_results_{args.process_id}.txt")
+    else:
+        save_results_path = os.path.join(draw_img_save_dir, "system_results.txt")
+    with open(save_results_path, 'w', encoding='utf-8') as f:
         f.writelines(save_results)
......
@@ -55,6 +55,7 @@ def init_args():
     parser.add_argument("--max_batch_size", type=int, default=10)
     parser.add_argument("--use_dilation", type=str2bool, default=False)
     parser.add_argument("--det_db_score_mode", type=str, default="fast")
+    parser.add_argument("--vis_seg_map", type=str2bool, default=False)

     # EAST params
     parser.add_argument("--det_east_score_thresh", type=float, default=0.8)
     parser.add_argument("--det_east_cover_thresh", type=float, default=0.1)

@@ -276,6 +277,7 @@ def create_predictor(args, mode, logger):
                 min_input_shape = {"x": [1, 3, imgH, 10]}
                 max_input_shape = {"x": [args.rec_batch_num, 3, imgH, 2304]}
                 opt_input_shape = {"x": [args.rec_batch_num, 3, imgH, 320]}
+                config.exp_disable_tensorrt_ops(["transpose2"])
             elif mode == "cls":
                 min_input_shape = {"x": [1, 3, 48, 10]}
                 max_input_shape = {"x": [args.rec_batch_num, 3, 48, 1024]}

@@ -587,7 +589,7 @@ def text_visual(texts,

 def base64_to_cv2(b64str):
     import base64
     data = base64.b64decode(b64str.encode('utf8'))
-    data = np.fromstring(data, np.uint8)
+    data = np.frombuffer(data, np.uint8)
     data = cv2.imdecode(data, cv2.IMREAD_COLOR)
     return data
......
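`np.fromstring` is deprecated for binary input; `np.frombuffer` returns a read-only view over the decoded bytes without the deprecation warning or an extra copy. A round-trip sketch of the fixed helper:

```python
import base64
import cv2
import numpy as np

def base64_to_cv2(b64str):
    data = base64.b64decode(b64str.encode('utf8'))
    # np.frombuffer replaces the deprecated np.fromstring for binary data.
    data = np.frombuffer(data, np.uint8)
    return cv2.imdecode(data, cv2.IMREAD_COLOR)

# Round trip: encode a dummy image to PNG, base64 it, and decode it back.
img = np.zeros((8, 8, 3), np.uint8)
ok, buf = cv2.imencode(".png", img)
b64 = base64.b64encode(buf.tobytes()).decode("utf8")
assert (base64_to_cv2(b64) == img).all()
```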