Commit c89bc397 authored by andyjpaddle's avatar andyjpaddle
Browse files

Merge branch 'release/2.5' of https://github.com/PaddlePaddle/PaddleOCR into release/2.5

parents 2274364b 61b03628
...@@ -19,7 +19,6 @@ from __future__ import print_function ...@@ -19,7 +19,6 @@ from __future__ import print_function
import paddle import paddle
from paddle import nn from paddle import nn
from paddle.nn import functional as F from paddle.nn import functional as F
from paddle import fluid
class TableAttentionLoss(nn.Layer): class TableAttentionLoss(nn.Layer):
def __init__(self, structure_weight, loc_weight, use_giou=False, giou_weight=1.0, **kwargs): def __init__(self, structure_weight, loc_weight, use_giou=False, giou_weight=1.0, **kwargs):
...@@ -36,13 +35,13 @@ class TableAttentionLoss(nn.Layer): ...@@ -36,13 +35,13 @@ class TableAttentionLoss(nn.Layer):
:param bbox:[[x1,y1,x2,y2], [x1,y1,x2,y2],,,] :param bbox:[[x1,y1,x2,y2], [x1,y1,x2,y2],,,]
:return: loss :return: loss
''' '''
ix1 = fluid.layers.elementwise_max(preds[:, 0], bbox[:, 0]) ix1 = paddle.maximum(preds[:, 0], bbox[:, 0])
iy1 = fluid.layers.elementwise_max(preds[:, 1], bbox[:, 1]) iy1 = paddle.maximum(preds[:, 1], bbox[:, 1])
ix2 = fluid.layers.elementwise_min(preds[:, 2], bbox[:, 2]) ix2 = paddle.minimum(preds[:, 2], bbox[:, 2])
iy2 = fluid.layers.elementwise_min(preds[:, 3], bbox[:, 3]) iy2 = paddle.minimum(preds[:, 3], bbox[:, 3])
iw = fluid.layers.clip(ix2 - ix1 + 1e-3, 0., 1e10) iw = paddle.clip(ix2 - ix1 + 1e-3, 0., 1e10)
ih = fluid.layers.clip(iy2 - iy1 + 1e-3, 0., 1e10) ih = paddle.clip(iy2 - iy1 + 1e-3, 0., 1e10)
# overlap # overlap
inters = iw * ih inters = iw * ih
...@@ -55,12 +54,12 @@ class TableAttentionLoss(nn.Layer): ...@@ -55,12 +54,12 @@ class TableAttentionLoss(nn.Layer):
# ious # ious
ious = inters / uni ious = inters / uni
ex1 = fluid.layers.elementwise_min(preds[:, 0], bbox[:, 0]) ex1 = paddle.minimum(preds[:, 0], bbox[:, 0])
ey1 = fluid.layers.elementwise_min(preds[:, 1], bbox[:, 1]) ey1 = paddle.minimum(preds[:, 1], bbox[:, 1])
ex2 = fluid.layers.elementwise_max(preds[:, 2], bbox[:, 2]) ex2 = paddle.maximum(preds[:, 2], bbox[:, 2])
ey2 = fluid.layers.elementwise_max(preds[:, 3], bbox[:, 3]) ey2 = paddle.maximum(preds[:, 3], bbox[:, 3])
ew = fluid.layers.clip(ex2 - ex1 + 1e-3, 0., 1e10) ew = paddle.clip(ex2 - ex1 + 1e-3, 0., 1e10)
eh = fluid.layers.clip(ey2 - ey1 + 1e-3, 0., 1e10) eh = paddle.clip(ey2 - ey1 + 1e-3, 0., 1e10)
# enclose erea # enclose erea
enclose = ew * eh + eps enclose = ew * eh + eps
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import Levenshtein from rapidfuzz.distance import Levenshtein
import string import string
...@@ -45,8 +45,7 @@ class RecMetric(object): ...@@ -45,8 +45,7 @@ class RecMetric(object):
if self.is_filter: if self.is_filter:
pred = self._normalize_text(pred) pred = self._normalize_text(pred)
target = self._normalize_text(target) target = self._normalize_text(target)
norm_edit_dis += Levenshtein.distance(pred, target) / max( norm_edit_dis += Levenshtein.normalized_distance(pred, target)
len(pred), len(target), 1)
if pred == target: if pred == target:
correct_num += 1 correct_num += 1
all_num += 1 all_num += 1
......
...@@ -175,12 +175,7 @@ class Kie_backbone(nn.Layer): ...@@ -175,12 +175,7 @@ class Kie_backbone(nn.Layer):
img, relations, texts, gt_bboxes, tag, img_size) img, relations, texts, gt_bboxes, tag, img_size)
x = self.img_feat(img) x = self.img_feat(img)
boxes, rois_num = self.bbox2roi(gt_bboxes) boxes, rois_num = self.bbox2roi(gt_bboxes)
feats = paddle.fluid.layers.roi_align( feats = paddle.vision.ops.roi_align(
x, x, boxes, spatial_scale=1.0, output_size=7, boxes_num=rois_num)
boxes,
spatial_scale=1.0,
pooled_height=7,
pooled_width=7,
rois_num=rois_num)
feats = self.maxpool(feats).squeeze(-1).squeeze(-1) feats = self.maxpool(feats).squeeze(-1).squeeze(-1)
return [relations, texts, feats] return [relations, texts, feats]
...@@ -18,7 +18,6 @@ from __future__ import print_function ...@@ -18,7 +18,6 @@ from __future__ import print_function
from paddle import nn, ParamAttr from paddle import nn, ParamAttr
from paddle.nn import functional as F from paddle.nn import functional as F
import paddle.fluid as fluid
import paddle import paddle
import numpy as np import numpy as np
......
...@@ -20,13 +20,11 @@ import math ...@@ -20,13 +20,11 @@ import math
import paddle import paddle
from paddle import nn, ParamAttr from paddle import nn, ParamAttr
from paddle.nn import functional as F from paddle.nn import functional as F
import paddle.fluid as fluid
import numpy as np import numpy as np
from .self_attention import WrapEncoderForFeature from .self_attention import WrapEncoderForFeature
from .self_attention import WrapEncoder from .self_attention import WrapEncoder
from paddle.static import Program from paddle.static import Program
from ppocr.modeling.backbones.rec_resnet_fpn import ResNetFPN from ppocr.modeling.backbones.rec_resnet_fpn import ResNetFPN
import paddle.fluid.framework as framework
from collections import OrderedDict from collections import OrderedDict
gradient_clip = 10 gradient_clip = 10
......
...@@ -22,7 +22,6 @@ import paddle ...@@ -22,7 +22,6 @@ import paddle
from paddle import ParamAttr, nn from paddle import ParamAttr, nn
from paddle import nn, ParamAttr from paddle import nn, ParamAttr
from paddle.nn import functional as F from paddle.nn import functional as F
import paddle.fluid as fluid
import numpy as np import numpy as np
gradient_clip = 10 gradient_clip = 10
...@@ -288,10 +287,10 @@ class PrePostProcessLayer(nn.Layer): ...@@ -288,10 +287,10 @@ class PrePostProcessLayer(nn.Layer):
"layer_norm_%d" % len(self.sublayers()), "layer_norm_%d" % len(self.sublayers()),
paddle.nn.LayerNorm( paddle.nn.LayerNorm(
normalized_shape=d_model, normalized_shape=d_model,
weight_attr=fluid.ParamAttr( weight_attr=paddle.ParamAttr(
initializer=fluid.initializer.Constant(1.)), initializer=paddle.nn.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr( bias_attr=paddle.ParamAttr(
initializer=fluid.initializer.Constant(0.))))) initializer=paddle.nn.initializer.Constant(0.)))))
elif cmd == "d": # add dropout elif cmd == "d": # add dropout
self.functors.append(lambda x: F.dropout( self.functors.append(lambda x: F.dropout(
x, p=dropout_rate, mode="downscale_in_infer") x, p=dropout_rate, mode="downscale_in_infer")
...@@ -324,7 +323,7 @@ class PrepareEncoder(nn.Layer): ...@@ -324,7 +323,7 @@ class PrepareEncoder(nn.Layer):
def forward(self, src_word, src_pos): def forward(self, src_word, src_pos):
src_word_emb = src_word src_word_emb = src_word
src_word_emb = fluid.layers.cast(src_word_emb, 'float32') src_word_emb = paddle.cast(src_word_emb, 'float32')
src_word_emb = paddle.scale(x=src_word_emb, scale=self.src_emb_dim**0.5) src_word_emb = paddle.scale(x=src_word_emb, scale=self.src_emb_dim**0.5)
src_pos = paddle.squeeze(src_pos, axis=-1) src_pos = paddle.squeeze(src_pos, axis=-1)
src_pos_enc = self.emb(src_pos) src_pos_enc = self.emb(src_pos)
...@@ -367,7 +366,7 @@ class PrepareDecoder(nn.Layer): ...@@ -367,7 +366,7 @@ class PrepareDecoder(nn.Layer):
self.dropout_rate = dropout_rate self.dropout_rate = dropout_rate
def forward(self, src_word, src_pos): def forward(self, src_word, src_pos):
src_word = fluid.layers.cast(src_word, 'int64') src_word = paddle.cast(src_word, 'int64')
src_word = paddle.squeeze(src_word, axis=-1) src_word = paddle.squeeze(src_word, axis=-1)
src_word_emb = self.emb0(src_word) src_word_emb = self.emb0(src_word)
src_word_emb = paddle.scale(x=src_word_emb, scale=self.src_emb_dim**0.5) src_word_emb = paddle.scale(x=src_word_emb, scale=self.src_emb_dim**0.5)
......
...@@ -38,6 +38,7 @@ class DBPostProcess(object): ...@@ -38,6 +38,7 @@ class DBPostProcess(object):
unclip_ratio=2.0, unclip_ratio=2.0,
use_dilation=False, use_dilation=False,
score_mode="fast", score_mode="fast",
visual_output=False,
**kwargs): **kwargs):
self.thresh = thresh self.thresh = thresh
self.box_thresh = box_thresh self.box_thresh = box_thresh
...@@ -51,6 +52,7 @@ class DBPostProcess(object): ...@@ -51,6 +52,7 @@ class DBPostProcess(object):
self.dilation_kernel = None if not use_dilation else np.array( self.dilation_kernel = None if not use_dilation else np.array(
[[1, 1], [1, 1]]) [[1, 1], [1, 1]])
self.visual = visual_output
def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height): def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
''' '''
...@@ -169,12 +171,19 @@ class DBPostProcess(object): ...@@ -169,12 +171,19 @@ class DBPostProcess(object):
cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype(np.int32), 1) cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype(np.int32), 1)
return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]
def visual_output(self, pred):
im = np.array(pred[0] * 255).astype(np.uint8)
cv2.imwrite("db_probability_map.png", im)
print("The probalibity map is visualized in db_probability_map.png")
def __call__(self, outs_dict, shape_list): def __call__(self, outs_dict, shape_list):
pred = outs_dict['maps'] pred = outs_dict['maps']
if isinstance(pred, paddle.Tensor): if isinstance(pred, paddle.Tensor):
pred = pred.numpy() pred = pred.numpy()
pred = pred[:, 0, :, :] pred = pred[:, 0, :, :]
segmentation = pred > self.thresh segmentation = pred > self.thresh
if self.visual:
self.visual_output(pred)
boxes_batch = [] boxes_batch = []
for batch_index in range(pred.shape[0]): for batch_index in range(pred.shape[0]):
......
...@@ -177,9 +177,9 @@ def save_model(model, ...@@ -177,9 +177,9 @@ def save_model(model,
model.backbone.model.save_pretrained(model_prefix) model.backbone.model.save_pretrained(model_prefix)
metric_prefix = os.path.join(model_prefix, 'metric') metric_prefix = os.path.join(model_prefix, 'metric')
# save metric and config # save metric and config
with open(metric_prefix + '.states', 'wb') as f:
pickle.dump(kwargs, f, protocol=2)
if is_best: if is_best:
with open(metric_prefix + '.states', 'wb') as f:
pickle.dump(kwargs, f, protocol=2)
logger.info('save best model is to {}'.format(model_prefix)) logger.info('save best model is to {}'.format(model_prefix))
else: else:
logger.info("save model in {}".format(model_prefix)) logger.info("save model in {}".format(model_prefix))
...@@ -19,6 +19,24 @@ SDMGR是一个关键信息提取算法,将每个检测到的文本区域分类 ...@@ -19,6 +19,24 @@ SDMGR是一个关键信息提取算法,将每个检测到的文本区域分类
wget https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/wildreceipt.tar && tar xf wildreceipt.tar wget https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/wildreceipt.tar && tar xf wildreceipt.tar
``` ```
数据集格式:
```
./wildreceipt
├── class_list.txt # box内的文本类别,比如金额、时间、日期等。
├── dict.txt # 识别的字典文件,数据集中包含的字符列表
├── wildreceipt_train.txt # 训练数据标签文件
└── wildreceipt_test.txt # 评估数据标签文件
└── image_files/ # 图像数据文件夹
```
其中标签文件里的格式为:
```
" 图像文件名 json.dumps编码的图像标注信息"
image_files/Image_16/11/d5de7f2a20751e50b84c747c17a24cd98bed3554.jpeg [{"label": 1, "transcription": "SAFEWAY", "points": [[550.0, 190.0], [937.0, 190.0], [937.0, 104.0], [550.0, 104.0]]}, {"label": 25, "transcription": "TM", "points": [[1048.0, 211.0], [1074.0, 211.0], [1074.0, 196.0], [1048.0, 196.0]]}, {"label": 25, "transcription": "ATOREMGRTOMMILAZZO", "points": [[535.0, 239.0], [833.0, 239.0], [833.0, 200.0], [535.0, 200.0]]}, {"label": 5, "transcription": "703-777-5833", "points": [[907.0, 256.0], [1081.0, 256.0], [1081.0, 223.0], [907.0, 223.0]]}......
```
**注:如果您希望在自己的数据集上训练,建议按照上述数据个数准备数据集。**
执行预测: 执行预测:
``` ```
......
...@@ -18,6 +18,22 @@ This section provides a tutorial example on how to quickly use, train, and evalu ...@@ -18,6 +18,22 @@ This section provides a tutorial example on how to quickly use, train, and evalu
wget https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/wildreceipt.tar && tar xf wildreceipt.tar wget https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/wildreceipt.tar && tar xf wildreceipt.tar
``` ```
The dataset format are as follows:
```
./wildreceipt
├── class_list.txt # The text category inside the box, such as amount, time, date, etc.
├── dict.txt # A recognized dictionary file, a list of characters contained in the dataset
├── wildreceipt_train.txt # training data label file
└── wildreceipt_test.txt # testing data label file
└── image_files/ # image dataset file
```
The format in the label file is:
```
" The image file path Image annotation information encoded by json.dumps"
image_files/Image_16/11/d5de7f2a20751e50b84c747c17a24cd98bed3554.jpeg [{"label": 1, "transcription": "SAFEWAY", "points": [[550.0, 190.0], [937.0, 190.0], [937.0, 104.0], [550.0, 104.0]]}, {"label": 25, "transcription": "TM", "points": [[1048.0, 211.0], [1074.0, 211.0], [1074.0, 196.0], [1048.0, 196.0]]}, {"label": 25, "transcription": "ATOREMGRTOMMILAZZO", "points": [[535.0, 239.0], [833.0, 239.0], [833.0, 200.0], [535.0, 200.0]]}, {"label": 5, "transcription": "703-777-5833", "points": [[907.0, 256.0], [1081.0, 256.0], [1081.0, 223.0], [907.0, 223.0]]}......
```
Download the pretrained model and predict the result: Download the pretrained model and predict the result:
```shell ```shell
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Apache 2.0 License for more details. # Apache 2.0 License for more details.
import distance from rapidfuzz.distance import Levenshtein
from apted import APTED, Config from apted import APTED, Config
from apted.helpers import Tree from apted.helpers import Tree
from lxml import etree, html from lxml import etree, html
...@@ -39,17 +39,6 @@ class TableTree(Tree): ...@@ -39,17 +39,6 @@ class TableTree(Tree):
class CustomConfig(Config): class CustomConfig(Config):
@staticmethod
def maximum(*sequences):
"""Get maximum possible value
"""
return max(map(len, sequences))
def normalized_distance(self, *sequences):
"""Get distance from 0 to 1
"""
return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
def rename(self, node1, node2): def rename(self, node1, node2):
"""Compares attributes of trees""" """Compares attributes of trees"""
#print(node1.tag) #print(node1.tag)
...@@ -58,23 +47,12 @@ class CustomConfig(Config): ...@@ -58,23 +47,12 @@ class CustomConfig(Config):
if node1.tag == 'td': if node1.tag == 'td':
if node1.content or node2.content: if node1.content or node2.content:
#print(node1.content, ) #print(node1.content, )
return self.normalized_distance(node1.content, node2.content) return Levenshtein.normalized_distance(node1.content, node2.content)
return 0. return 0.
class CustomConfig_del_short(Config): class CustomConfig_del_short(Config):
@staticmethod
def maximum(*sequences):
"""Get maximum possible value
"""
return max(map(len, sequences))
def normalized_distance(self, *sequences):
"""Get distance from 0 to 1
"""
return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
def rename(self, node1, node2): def rename(self, node1, node2):
"""Compares attributes of trees""" """Compares attributes of trees"""
if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan): if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
...@@ -90,21 +68,10 @@ class CustomConfig_del_short(Config): ...@@ -90,21 +68,10 @@ class CustomConfig_del_short(Config):
node1_content = ['####'] node1_content = ['####']
if len(node2_content) < 3: if len(node2_content) < 3:
node2_content = ['####'] node2_content = ['####']
return self.normalized_distance(node1_content, node2_content) return Levenshtein.normalized_distance(node1_content, node2_content)
return 0. return 0.
class CustomConfig_del_block(Config): class CustomConfig_del_block(Config):
@staticmethod
def maximum(*sequences):
"""Get maximum possible value
"""
return max(map(len, sequences))
def normalized_distance(self, *sequences):
"""Get distance from 0 to 1
"""
return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
def rename(self, node1, node2): def rename(self, node1, node2):
"""Compares attributes of trees""" """Compares attributes of trees"""
if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan): if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
...@@ -120,7 +87,7 @@ class CustomConfig_del_block(Config): ...@@ -120,7 +87,7 @@ class CustomConfig_del_block(Config):
while ' ' in node2_content: while ' ' in node2_content:
print(node2_content.index(' ')) print(node2_content.index(' '))
node2_content.pop(node2_content.index(' ')) node2_content.pop(node2_content.index(' '))
return self.normalized_distance(node1_content, node2_content) return Levenshtein.normalized_distance(node1_content, node2_content)
return 0. return 0.
class TEDS(object): class TEDS(object):
......
...@@ -192,7 +192,7 @@ Finally, `precision`, `recall`, `hmean` and other indicators will be printed ...@@ -192,7 +192,7 @@ Finally, `precision`, `recall`, `hmean` and other indicators will be printed
Use the following command to complete the series prediction of `OCR engine + SER`, taking the pretrained SER model as an example: Use the following command to complete the series prediction of `OCR engine + SER`, taking the pretrained SER model as an example:
```shell ```shell
CUDA_VISIBLE_DEVICES=0 python3 tools/infer_vqa_token_ser.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/Global.infer_img=doc/vqa/input/zh_val_42.jpg CUDA_VISIBLE_DEVICES=0 python3 tools/infer_vqa_token_ser.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/ Global.infer_img=doc/vqa/input/zh_val_42.jpg
```` ````
Finally, the prediction result visualization image and the prediction result text file will be saved in the directory configured by the `config.Global.save_res_path` field. The prediction result text file is named `infer_results.txt`. Finally, the prediction result visualization image and the prediction result text file will be saved in the directory configured by the `config.Global.save_res_path` field. The prediction result text file is named `infer_results.txt`.
...@@ -203,7 +203,7 @@ First use the `tools/infer_vqa_token_ser.py` script to complete the prediction o ...@@ -203,7 +203,7 @@ First use the `tools/infer_vqa_token_ser.py` script to complete the prediction o
```shell ```shell
export CUDA_VISIBLE_DEVICES=0 export CUDA_VISIBLE_DEVICES=0
python3 tools/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_normalize_val.json --pred_json_path output_res/infer_results.txt python3 ppstructure/vqa/tools/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_normalize_val.json --pred_json_path output_res/infer_results.txt
```` ````
<a name="53"></a> <a name="53"></a>
...@@ -247,7 +247,7 @@ Finally, `precision`, `recall`, `hmean` and other indicators will be printed ...@@ -247,7 +247,7 @@ Finally, `precision`, `recall`, `hmean` and other indicators will be printed
Use the following command to complete the series prediction of `OCR engine + SER + RE`, taking the pretrained SER and RE models as an example: Use the following command to complete the series prediction of `OCR engine + SER + RE`, taking the pretrained SER and RE models as an example:
```shell ```shell
export CUDA_VISIBLE_DEVICES=0 export CUDA_VISIBLE_DEVICES=0
python3 tools/infer_vqa_token_ser_re.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/re_LayoutXLM_xfun_zh/Global.infer_img=doc/vqa/input/zh_val_21.jpg -c_ser configs/vqa/ser/layoutxlm. yml -o_ser Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/ python3 tools/infer_vqa_token_ser_re.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/re_LayoutXLM_xfun_zh/ Global.infer_img=doc/vqa/input/zh_val_21.jpg -c_ser configs/vqa/ser/layoutxlm. yml -o_ser Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/
```` ````
Finally, the prediction result visualization image and the prediction result text file will be saved in the directory configured by the `config.Global.save_res_path` field. The prediction result text file is named `infer_results.txt`. Finally, the prediction result visualization image and the prediction result text file will be saved in the directory configured by the `config.Global.save_res_path` field. The prediction result text file is named `infer_results.txt`.
......
...@@ -198,7 +198,7 @@ CUDA_VISIBLE_DEVICES=0 python3 tools/infer_vqa_token_ser.py -c configs/vqa/ser/l ...@@ -198,7 +198,7 @@ CUDA_VISIBLE_DEVICES=0 python3 tools/infer_vqa_token_ser.py -c configs/vqa/ser/l
```shell ```shell
export CUDA_VISIBLE_DEVICES=0 export CUDA_VISIBLE_DEVICES=0
python3 tools/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_normalize_val.json --pred_json_path output_res/infer_results.txt python3 ppstructure/vqa/tools/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_normalize_val.json --pred_json_path output_res/infer_results.txt
``` ```
### 5.3 RE ### 5.3 RE
......
...@@ -20,7 +20,7 @@ from shapely.geometry import Polygon ...@@ -20,7 +20,7 @@ from shapely.geometry import Polygon
import numpy as np import numpy as np
from collections import defaultdict from collections import defaultdict
import operator import operator
import Levenshtein from rapidfuzz.distance import Levenshtein
import argparse import argparse
import json import json
import copy import copy
......
...@@ -329,6 +329,7 @@ else ...@@ -329,6 +329,7 @@ else
set_save_model=$(func_set_params "${save_model_key}" "${save_log}") set_save_model=$(func_set_params "${save_model_key}" "${save_log}")
if [ ${#gpu} -le 2 ];then # train with cpu or single gpu if [ ${#gpu} -le 2 ];then # train with cpu or single gpu
eval ${env}
cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config} " cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config} "
elif [ ${#ips} -le 26 ];then # train with multi-gpu elif [ ${#ips} -le 26 ];then # train with multi-gpu
cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}" cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}"
......
...@@ -31,7 +31,12 @@ from ppocr.utils.logging import get_logger ...@@ -31,7 +31,12 @@ from ppocr.utils.logging import get_logger
from tools.program import load_config, merge_config, ArgsParser from tools.program import load_config, merge_config, ArgsParser
def export_single_model(model, arch_config, save_path, logger, quanter=None): def export_single_model(model,
arch_config,
save_path,
logger,
input_shape=None,
quanter=None):
if arch_config["algorithm"] == "SRN": if arch_config["algorithm"] == "SRN":
max_text_length = arch_config["Head"]["max_text_length"] max_text_length = arch_config["Head"]["max_text_length"]
other_shape = [ other_shape = [
...@@ -64,7 +69,7 @@ def export_single_model(model, arch_config, save_path, logger, quanter=None): ...@@ -64,7 +69,7 @@ def export_single_model(model, arch_config, save_path, logger, quanter=None):
else: else:
other_shape = [ other_shape = [
paddle.static.InputSpec( paddle.static.InputSpec(
shape=[None, 3, 64, 256], dtype="float32"), shape=[None] + input_shape, dtype="float32"),
] ]
model = to_static(model, input_spec=other_shape) model = to_static(model, input_spec=other_shape)
elif arch_config["algorithm"] == "PREN": elif arch_config["algorithm"] == "PREN":
...@@ -76,7 +81,7 @@ def export_single_model(model, arch_config, save_path, logger, quanter=None): ...@@ -76,7 +81,7 @@ def export_single_model(model, arch_config, save_path, logger, quanter=None):
else: else:
infer_shape = [3, -1, -1] infer_shape = [3, -1, -1]
if arch_config["model_type"] == "rec": if arch_config["model_type"] == "rec":
infer_shape = [3, 32, -1] # for rec model, H must be 32 infer_shape = [3, 48, -1] # for rec model, H must be 32
if "Transform" in arch_config and arch_config[ if "Transform" in arch_config and arch_config[
"Transform"] is not None and arch_config["Transform"][ "Transform"] is not None and arch_config["Transform"][
"name"] == "TPS": "name"] == "TPS":
...@@ -157,6 +162,13 @@ def main(): ...@@ -157,6 +162,13 @@ def main():
arch_config = config["Architecture"] arch_config = config["Architecture"]
if arch_config["algorithm"] == "SVTR" and arch_config["Head"][
"name"] != 'MultiHead':
input_shape = config["Eval"]["dataset"]["transforms"][-2][
'SVTRRecResizeImg']['image_shape']
else:
input_shape = None
if arch_config["algorithm"] in ["Distillation", ]: # distillation model if arch_config["algorithm"] in ["Distillation", ]: # distillation model
archs = list(arch_config["Models"].values()) archs = list(arch_config["Models"].values())
for idx, name in enumerate(model.model_name_list): for idx, name in enumerate(model.model_name_list):
...@@ -165,7 +177,8 @@ def main(): ...@@ -165,7 +177,8 @@ def main():
sub_model_save_path, logger) sub_model_save_path, logger)
else: else:
save_path = os.path.join(save_path, "inference") save_path = os.path.join(save_path, "inference")
export_single_model(model, arch_config, save_path, logger) export_single_model(
model, arch_config, save_path, logger, input_shape=input_shape)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -24,6 +24,7 @@ import cv2 ...@@ -24,6 +24,7 @@ import cv2
import numpy as np import numpy as np
import time import time
import sys import sys
from scipy.spatial import distance as dist
import tools.infer.utility as utility import tools.infer.utility as utility
from ppocr.utils.logging import get_logger from ppocr.utils.logging import get_logger
...@@ -154,9 +155,10 @@ class TextDetector(object): ...@@ -154,9 +155,10 @@ class TextDetector(object):
s = pts.sum(axis=1) s = pts.sum(axis=1)
rect[0] = pts[np.argmin(s)] rect[0] = pts[np.argmin(s)]
rect[2] = pts[np.argmax(s)] rect[2] = pts[np.argmax(s)]
diff = np.diff(pts, axis=1) tmp = np.delete(pts, (np.argmin(s), np.argmax(s)), axis=0)
rect[1] = pts[np.argmin(diff)] diff = np.diff(np.array(tmp), axis=1)
rect[3] = pts[np.argmax(diff)] rect[1] = tmp[np.argmin(diff)]
rect[3] = tmp[np.argmax(diff)]
return rect return rect
def clip_det_res(self, points, img_height, img_width): def clip_det_res(self, points, img_height, img_width):
......
...@@ -114,11 +114,14 @@ def sorted_boxes(dt_boxes): ...@@ -114,11 +114,14 @@ def sorted_boxes(dt_boxes):
_boxes = list(sorted_boxes) _boxes = list(sorted_boxes)
for i in range(num_boxes - 1): for i in range(num_boxes - 1):
if abs(_boxes[i + 1][0][1] - _boxes[i][0][1]) < 10 and \ for j in range(i, 0, -1):
(_boxes[i + 1][0][0] < _boxes[i][0][0]): if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and \
tmp = _boxes[i] (_boxes[j + 1][0][0] < _boxes[j][0][0]):
_boxes[i] = _boxes[i + 1] tmp = _boxes[j]
_boxes[i + 1] = tmp _boxes[j] = _boxes[j + 1]
_boxes[j + 1] = tmp
else:
break
return _boxes return _boxes
...@@ -135,7 +138,7 @@ def main(args): ...@@ -135,7 +138,7 @@ def main(args):
logger.info("In PP-OCRv3, rec_image_shape parameter defaults to '3, 48, 320', " logger.info("In PP-OCRv3, rec_image_shape parameter defaults to '3, 48, 320', "
"if you are using recognition model with PP-OCRv2 or an older version, please set --rec_image_shape='3,32,320") "if you are using recognition model with PP-OCRv2 or an older version, please set --rec_image_shape='3,32,320")
# warm up 10 times # warm up 10 times
if args.warmup: if args.warmup:
img = np.random.uniform(0, 255, [640, 640, 3]).astype(np.uint8) img = np.random.uniform(0, 255, [640, 640, 3]).astype(np.uint8)
...@@ -198,7 +201,12 @@ def main(args): ...@@ -198,7 +201,12 @@ def main(args):
text_sys.text_detector.autolog.report() text_sys.text_detector.autolog.report()
text_sys.text_recognizer.autolog.report() text_sys.text_recognizer.autolog.report()
with open(os.path.join(draw_img_save_dir, "system_results.txt"), 'w', encoding='utf-8') as f: if args.total_process_num > 1:
save_results_path = os.path.join(draw_img_save_dir, f"system_results_{args.process_id}.txt")
else:
save_results_path = os.path.join(draw_img_save_dir, "system_results.txt")
with open(save_results_path, 'w', encoding='utf-8') as f:
f.writelines(save_results) f.writelines(save_results)
......
...@@ -55,6 +55,7 @@ def init_args(): ...@@ -55,6 +55,7 @@ def init_args():
parser.add_argument("--max_batch_size", type=int, default=10) parser.add_argument("--max_batch_size", type=int, default=10)
parser.add_argument("--use_dilation", type=str2bool, default=False) parser.add_argument("--use_dilation", type=str2bool, default=False)
parser.add_argument("--det_db_score_mode", type=str, default="fast") parser.add_argument("--det_db_score_mode", type=str, default="fast")
parser.add_argument("--vis_seg_map", type=str2bool, default=False)
# EAST parmas # EAST parmas
parser.add_argument("--det_east_score_thresh", type=float, default=0.8) parser.add_argument("--det_east_score_thresh", type=float, default=0.8)
parser.add_argument("--det_east_cover_thresh", type=float, default=0.1) parser.add_argument("--det_east_cover_thresh", type=float, default=0.1)
...@@ -276,6 +277,7 @@ def create_predictor(args, mode, logger): ...@@ -276,6 +277,7 @@ def create_predictor(args, mode, logger):
min_input_shape = {"x": [1, 3, imgH, 10]} min_input_shape = {"x": [1, 3, imgH, 10]}
max_input_shape = {"x": [args.rec_batch_num, 3, imgH, 2304]} max_input_shape = {"x": [args.rec_batch_num, 3, imgH, 2304]}
opt_input_shape = {"x": [args.rec_batch_num, 3, imgH, 320]} opt_input_shape = {"x": [args.rec_batch_num, 3, imgH, 320]}
config.exp_disable_tensorrt_ops(["transpose2"])
elif mode == "cls": elif mode == "cls":
min_input_shape = {"x": [1, 3, 48, 10]} min_input_shape = {"x": [1, 3, 48, 10]}
max_input_shape = {"x": [args.rec_batch_num, 3, 48, 1024]} max_input_shape = {"x": [args.rec_batch_num, 3, 48, 1024]}
...@@ -587,7 +589,7 @@ def text_visual(texts, ...@@ -587,7 +589,7 @@ def text_visual(texts,
def base64_to_cv2(b64str): def base64_to_cv2(b64str):
import base64 import base64
data = base64.b64decode(b64str.encode('utf8')) data = base64.b64decode(b64str.encode('utf8'))
data = np.fromstring(data, np.uint8) data = np.frombuffer(data, np.uint8)
data = cv2.imdecode(data, cv2.IMREAD_COLOR) data = cv2.imdecode(data, cv2.IMREAD_COLOR)
return data return data
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment