Commit aa59fca5 authored by Leif's avatar Leif
Browse files

Merge remote-tracking branch 'origin/dygraph' into dygraph

parents 12d15752 f01f24c7
...@@ -16,9 +16,11 @@ from __future__ import absolute_import ...@@ -16,9 +16,11 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import paddle
from paddle import nn from paddle import nn
from ppocr.modeling.heads.rec_ctc_head import get_para_bias_attr from ppocr.modeling.heads.rec_ctc_head import get_para_bias_attr
from ppocr.modeling.backbones.rec_svtrnet import Block, ConvBNLayer, trunc_normal_, zeros_, ones_
class Im2Seq(nn.Layer): class Im2Seq(nn.Layer):
...@@ -64,29 +66,126 @@ class EncoderWithFC(nn.Layer): ...@@ -64,29 +66,126 @@ class EncoderWithFC(nn.Layer):
return x return x
class EncoderWithSVTR(nn.Layer):
def __init__(
self,
in_channels,
dims=64, # XS
depth=2,
hidden_dims=120,
use_guide=False,
num_heads=8,
qkv_bias=True,
mlp_ratio=2.0,
drop_rate=0.1,
attn_drop_rate=0.1,
drop_path=0.,
qk_scale=None):
super(EncoderWithSVTR, self).__init__()
self.depth = depth
self.use_guide = use_guide
self.conv1 = ConvBNLayer(
in_channels, in_channels // 8, padding=1, act=nn.Swish)
self.conv2 = ConvBNLayer(
in_channels // 8, hidden_dims, kernel_size=1, act=nn.Swish)
self.svtr_block = nn.LayerList([
Block(
dim=hidden_dims,
num_heads=num_heads,
mixer='Global',
HW=None,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
act_layer=nn.Swish,
attn_drop=attn_drop_rate,
drop_path=drop_path,
norm_layer='nn.LayerNorm',
epsilon=1e-05,
prenorm=False) for i in range(depth)
])
self.norm = nn.LayerNorm(hidden_dims, epsilon=1e-6)
self.conv3 = ConvBNLayer(
hidden_dims, in_channels, kernel_size=1, act=nn.Swish)
# last conv-nxn, the input is concat of input tensor and conv3 output tensor
self.conv4 = ConvBNLayer(
2 * in_channels, in_channels // 8, padding=1, act=nn.Swish)
self.conv1x1 = ConvBNLayer(
in_channels // 8, dims, kernel_size=1, act=nn.Swish)
self.out_channels = dims
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight)
if isinstance(m, nn.Linear) and m.bias is not None:
zeros_(m.bias)
elif isinstance(m, nn.LayerNorm):
zeros_(m.bias)
ones_(m.weight)
def forward(self, x):
# for use guide
if self.use_guide:
z = x.clone()
z.stop_gradient = True
else:
z = x
# for short cut
h = z
# reduce dim
z = self.conv1(z)
z = self.conv2(z)
# SVTR global block
B, C, H, W = z.shape
z = z.flatten(2).transpose([0, 2, 1])
for blk in self.svtr_block:
z = blk(z)
z = self.norm(z)
# last stage
z = z.reshape([0, H, W, C]).transpose([0, 3, 1, 2])
z = self.conv3(z)
z = paddle.concat((h, z), axis=1)
z = self.conv1x1(self.conv4(z))
return z
class SequenceEncoder(nn.Layer): class SequenceEncoder(nn.Layer):
def __init__(self, in_channels, encoder_type, hidden_size=48, **kwargs): def __init__(self, in_channels, encoder_type, hidden_size=48, **kwargs):
super(SequenceEncoder, self).__init__() super(SequenceEncoder, self).__init__()
self.encoder_reshape = Im2Seq(in_channels) self.encoder_reshape = Im2Seq(in_channels)
self.out_channels = self.encoder_reshape.out_channels self.out_channels = self.encoder_reshape.out_channels
self.encoder_type = encoder_type
if encoder_type == 'reshape': if encoder_type == 'reshape':
self.only_reshape = True self.only_reshape = True
else: else:
support_encoder_dict = { support_encoder_dict = {
'reshape': Im2Seq, 'reshape': Im2Seq,
'fc': EncoderWithFC, 'fc': EncoderWithFC,
'rnn': EncoderWithRNN 'rnn': EncoderWithRNN,
'svtr': EncoderWithSVTR
} }
assert encoder_type in support_encoder_dict, '{} must in {}'.format( assert encoder_type in support_encoder_dict, '{} must in {}'.format(
encoder_type, support_encoder_dict.keys()) encoder_type, support_encoder_dict.keys())
if encoder_type == "svtr":
self.encoder = support_encoder_dict[encoder_type]( self.encoder = support_encoder_dict[encoder_type](
self.encoder_reshape.out_channels, hidden_size) self.encoder_reshape.out_channels, **kwargs)
else:
self.encoder = support_encoder_dict[encoder_type](
self.encoder_reshape.out_channels, hidden_size)
self.out_channels = self.encoder.out_channels self.out_channels = self.encoder.out_channels
self.only_reshape = False self.only_reshape = False
def forward(self, x): def forward(self, x):
x = self.encoder_reshape(x) if self.encoder_type != 'svtr':
if not self.only_reshape: x = self.encoder_reshape(x)
if not self.only_reshape:
x = self.encoder(x)
return x
else:
x = self.encoder(x) x = self.encoder(x)
return x x = self.encoder_reshape(x)
return x
...@@ -128,6 +128,8 @@ class STN_ON(nn.Layer): ...@@ -128,6 +128,8 @@ class STN_ON(nn.Layer):
self.out_channels = in_channels self.out_channels = in_channels
def forward(self, image): def forward(self, image):
if len(image.shape)==5:
image = image.reshape([0, image.shape[-3], image.shape[-2], image.shape[-1]])
stn_input = paddle.nn.functional.interpolate( stn_input = paddle.nn.functional.interpolate(
image, self.tps_inputsize, mode="bilinear", align_corners=True) image, self.tps_inputsize, mode="bilinear", align_corners=True)
stn_img_feat, ctrl_points = self.stn_head(stn_input) stn_img_feat, ctrl_points = self.stn_head(stn_input)
......
...@@ -138,9 +138,9 @@ class TPSSpatialTransformer(nn.Layer): ...@@ -138,9 +138,9 @@ class TPSSpatialTransformer(nn.Layer):
assert source_control_points.shape[2] == 2 assert source_control_points.shape[2] == 2
batch_size = paddle.shape(source_control_points)[0] batch_size = paddle.shape(source_control_points)[0]
self.padding_matrix = paddle.expand( padding_matrix = paddle.expand(
self.padding_matrix, shape=[batch_size, 3, 2]) self.padding_matrix, shape=[batch_size, 3, 2])
Y = paddle.concat([source_control_points, self.padding_matrix], 1) Y = paddle.concat([source_control_points, padding_matrix], 1)
mapping_matrix = paddle.matmul(self.inverse_kernel, Y) mapping_matrix = paddle.matmul(self.inverse_kernel, Y)
source_coordinate = paddle.matmul(self.target_coordinate_repr, source_coordinate = paddle.matmul(self.target_coordinate_repr,
mapping_matrix) mapping_matrix)
......
...@@ -30,7 +30,7 @@ def build_lr_scheduler(lr_config, epochs, step_each_epoch): ...@@ -30,7 +30,7 @@ def build_lr_scheduler(lr_config, epochs, step_each_epoch):
return lr return lr
def build_optimizer(config, epochs, step_each_epoch, parameters): def build_optimizer(config, epochs, step_each_epoch, model):
from . import regularizer, optimizer from . import regularizer, optimizer
config = copy.deepcopy(config) config = copy.deepcopy(config)
# step1 build lr # step1 build lr
...@@ -43,6 +43,8 @@ def build_optimizer(config, epochs, step_each_epoch, parameters): ...@@ -43,6 +43,8 @@ def build_optimizer(config, epochs, step_each_epoch, parameters):
if not hasattr(regularizer, reg_name): if not hasattr(regularizer, reg_name):
reg_name += 'Decay' reg_name += 'Decay'
reg = getattr(regularizer, reg_name)(**reg_config)() reg = getattr(regularizer, reg_name)(**reg_config)()
elif 'weight_decay' in config:
reg = config.pop('weight_decay')
else: else:
reg = None reg = None
...@@ -57,4 +59,4 @@ def build_optimizer(config, epochs, step_each_epoch, parameters): ...@@ -57,4 +59,4 @@ def build_optimizer(config, epochs, step_each_epoch, parameters):
weight_decay=reg, weight_decay=reg,
grad_clip=grad_clip, grad_clip=grad_clip,
**config) **config)
return optim(parameters), lr return optim(model), lr
...@@ -42,13 +42,13 @@ class Momentum(object): ...@@ -42,13 +42,13 @@ class Momentum(object):
self.weight_decay = weight_decay self.weight_decay = weight_decay
self.grad_clip = grad_clip self.grad_clip = grad_clip
def __call__(self, parameters): def __call__(self, model):
opt = optim.Momentum( opt = optim.Momentum(
learning_rate=self.learning_rate, learning_rate=self.learning_rate,
momentum=self.momentum, momentum=self.momentum,
weight_decay=self.weight_decay, weight_decay=self.weight_decay,
grad_clip=self.grad_clip, grad_clip=self.grad_clip,
parameters=parameters) parameters=model.parameters())
return opt return opt
...@@ -75,7 +75,7 @@ class Adam(object): ...@@ -75,7 +75,7 @@ class Adam(object):
self.name = name self.name = name
self.lazy_mode = lazy_mode self.lazy_mode = lazy_mode
def __call__(self, parameters): def __call__(self, model):
opt = optim.Adam( opt = optim.Adam(
learning_rate=self.learning_rate, learning_rate=self.learning_rate,
beta1=self.beta1, beta1=self.beta1,
...@@ -85,7 +85,7 @@ class Adam(object): ...@@ -85,7 +85,7 @@ class Adam(object):
grad_clip=self.grad_clip, grad_clip=self.grad_clip,
name=self.name, name=self.name,
lazy_mode=self.lazy_mode, lazy_mode=self.lazy_mode,
parameters=parameters) parameters=model.parameters())
return opt return opt
...@@ -117,7 +117,7 @@ class RMSProp(object): ...@@ -117,7 +117,7 @@ class RMSProp(object):
self.weight_decay = weight_decay self.weight_decay = weight_decay
self.grad_clip = grad_clip self.grad_clip = grad_clip
def __call__(self, parameters): def __call__(self, model):
opt = optim.RMSProp( opt = optim.RMSProp(
learning_rate=self.learning_rate, learning_rate=self.learning_rate,
momentum=self.momentum, momentum=self.momentum,
...@@ -125,7 +125,7 @@ class RMSProp(object): ...@@ -125,7 +125,7 @@ class RMSProp(object):
epsilon=self.epsilon, epsilon=self.epsilon,
weight_decay=self.weight_decay, weight_decay=self.weight_decay,
grad_clip=self.grad_clip, grad_clip=self.grad_clip,
parameters=parameters) parameters=model.parameters())
return opt return opt
...@@ -148,7 +148,7 @@ class Adadelta(object): ...@@ -148,7 +148,7 @@ class Adadelta(object):
self.grad_clip = grad_clip self.grad_clip = grad_clip
self.name = name self.name = name
def __call__(self, parameters): def __call__(self, model):
opt = optim.Adadelta( opt = optim.Adadelta(
learning_rate=self.learning_rate, learning_rate=self.learning_rate,
epsilon=self.epsilon, epsilon=self.epsilon,
...@@ -156,7 +156,7 @@ class Adadelta(object): ...@@ -156,7 +156,7 @@ class Adadelta(object):
weight_decay=self.weight_decay, weight_decay=self.weight_decay,
grad_clip=self.grad_clip, grad_clip=self.grad_clip,
name=self.name, name=self.name,
parameters=parameters) parameters=model.parameters())
return opt return opt
...@@ -165,31 +165,55 @@ class AdamW(object): ...@@ -165,31 +165,55 @@ class AdamW(object):
learning_rate=0.001, learning_rate=0.001,
beta1=0.9, beta1=0.9,
beta2=0.999, beta2=0.999,
epsilon=1e-08, epsilon=1e-8,
weight_decay=0.01, weight_decay=0.01,
multi_precision=False,
grad_clip=None, grad_clip=None,
no_weight_decay_name=None,
one_dim_param_no_weight_decay=False,
name=None, name=None,
lazy_mode=False, lazy_mode=False,
**kwargs): **args):
super().__init__()
self.learning_rate = learning_rate self.learning_rate = learning_rate
self.beta1 = beta1 self.beta1 = beta1
self.beta2 = beta2 self.beta2 = beta2
self.epsilon = epsilon self.epsilon = epsilon
self.learning_rate = learning_rate self.grad_clip = grad_clip
self.weight_decay = 0.01 if weight_decay is None else weight_decay self.weight_decay = 0.01 if weight_decay is None else weight_decay
self.grad_clip = grad_clip self.grad_clip = grad_clip
self.name = name self.name = name
self.lazy_mode = lazy_mode self.lazy_mode = lazy_mode
self.multi_precision = multi_precision
def __call__(self, parameters): self.no_weight_decay_name_list = no_weight_decay_name.split(
) if no_weight_decay_name else []
self.one_dim_param_no_weight_decay = one_dim_param_no_weight_decay
def __call__(self, model):
parameters = model.parameters()
self.no_weight_decay_param_name_list = [
p.name for n, p in model.named_parameters() if any(nd in n for nd in self.no_weight_decay_name_list)
]
if self.one_dim_param_no_weight_decay:
self.no_weight_decay_param_name_list += [
p.name for n, p in model.named_parameters() if len(p.shape) == 1
]
opt = optim.AdamW( opt = optim.AdamW(
learning_rate=self.learning_rate, learning_rate=self.learning_rate,
beta1=self.beta1, beta1=self.beta1,
beta2=self.beta2, beta2=self.beta2,
epsilon=self.epsilon, epsilon=self.epsilon,
parameters=parameters,
weight_decay=self.weight_decay, weight_decay=self.weight_decay,
multi_precision=self.multi_precision,
grad_clip=self.grad_clip, grad_clip=self.grad_clip,
name=self.name, name=self.name,
lazy_mode=self.lazy_mode, lazy_mode=self.lazy_mode,
parameters=parameters) apply_decay_param_fun=self._apply_decay_param_fun)
return opt return opt
def _apply_decay_param_fun(self, name):
return name not in self.no_weight_decay_param_name_list
\ No newline at end of file
...@@ -27,7 +27,7 @@ from .sast_postprocess import SASTPostProcess ...@@ -27,7 +27,7 @@ from .sast_postprocess import SASTPostProcess
from .fce_postprocess import FCEPostProcess from .fce_postprocess import FCEPostProcess
from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, SRNLabelDecode, \ from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, SRNLabelDecode, \
DistillationCTCLabelDecode, TableLabelDecode, NRTRLabelDecode, SARLabelDecode, \ DistillationCTCLabelDecode, TableLabelDecode, NRTRLabelDecode, SARLabelDecode, \
SEEDLabelDecode, PRENLabelDecode SEEDLabelDecode, PRENLabelDecode, SVTRLabelDecode
from .cls_postprocess import ClsPostProcess from .cls_postprocess import ClsPostProcess
from .pg_postprocess import PGPostProcess from .pg_postprocess import PGPostProcess
from .vqa_token_ser_layoutlm_postprocess import VQASerTokenLayoutLMPostProcess from .vqa_token_ser_layoutlm_postprocess import VQASerTokenLayoutLMPostProcess
...@@ -41,7 +41,8 @@ def build_post_process(config, global_config=None): ...@@ -41,7 +41,8 @@ def build_post_process(config, global_config=None):
'PGPostProcess', 'DistillationCTCLabelDecode', 'TableLabelDecode', 'PGPostProcess', 'DistillationCTCLabelDecode', 'TableLabelDecode',
'DistillationDBPostProcess', 'NRTRLabelDecode', 'SARLabelDecode', 'DistillationDBPostProcess', 'NRTRLabelDecode', 'SARLabelDecode',
'SEEDLabelDecode', 'VQASerTokenLayoutLMPostProcess', 'SEEDLabelDecode', 'VQASerTokenLayoutLMPostProcess',
'VQAReTokenLayoutLMPostProcess', 'PRENLabelDecode' 'VQAReTokenLayoutLMPostProcess', 'PRENLabelDecode',
'DistillationSARLabelDecode', 'SVTRLabelDecode'
] ]
if config['name'] == 'PSEPostProcess': if config['name'] == 'PSEPostProcess':
......
...@@ -17,17 +17,26 @@ import paddle ...@@ -17,17 +17,26 @@ import paddle
class ClsPostProcess(object): class ClsPostProcess(object):
""" Convert between text-label and text-index """ """ Convert between text-label and text-index """
def __init__(self, label_list, **kwargs): def __init__(self, label_list=None, key=None, **kwargs):
super(ClsPostProcess, self).__init__() super(ClsPostProcess, self).__init__()
self.label_list = label_list self.label_list = label_list
self.key = key
def __call__(self, preds, label=None, *args, **kwargs): def __call__(self, preds, label=None, *args, **kwargs):
if self.key is not None:
preds = preds[self.key]
label_list = self.label_list
if label_list is None:
label_list = {idx: idx for idx in range(preds.shape[-1])}
if isinstance(preds, paddle.Tensor): if isinstance(preds, paddle.Tensor):
preds = preds.numpy() preds = preds.numpy()
pred_idxs = preds.argmax(axis=1) pred_idxs = preds.argmax(axis=1)
decode_out = [(self.label_list[idx], preds[i, idx]) decode_out = [(label_list[idx], preds[i, idx])
for i, idx in enumerate(pred_idxs)] for i, idx in enumerate(pred_idxs)]
if label is None: if label is None:
return decode_out return decode_out
label = [(self.label_list[idx], 1.0) for idx in label] label = [(label_list[idx], 1.0) for idx in label]
return decode_out, label return decode_out, label
...@@ -73,7 +73,7 @@ class BaseRecLabelDecode(object): ...@@ -73,7 +73,7 @@ class BaseRecLabelDecode(object):
conf_list = [0] conf_list = [0]
text = ''.join(char_list) text = ''.join(char_list)
result_list.append((text, np.mean(conf_list))) result_list.append((text, np.mean(conf_list).tolist()))
return result_list return result_list
def get_ignored_tokens(self): def get_ignored_tokens(self):
...@@ -117,6 +117,7 @@ class DistillationCTCLabelDecode(CTCLabelDecode): ...@@ -117,6 +117,7 @@ class DistillationCTCLabelDecode(CTCLabelDecode):
use_space_char=False, use_space_char=False,
model_name=["student"], model_name=["student"],
key=None, key=None,
multi_head=False,
**kwargs): **kwargs):
super(DistillationCTCLabelDecode, self).__init__(character_dict_path, super(DistillationCTCLabelDecode, self).__init__(character_dict_path,
use_space_char) use_space_char)
...@@ -125,6 +126,7 @@ class DistillationCTCLabelDecode(CTCLabelDecode): ...@@ -125,6 +126,7 @@ class DistillationCTCLabelDecode(CTCLabelDecode):
self.model_name = model_name self.model_name = model_name
self.key = key self.key = key
self.multi_head = multi_head
def __call__(self, preds, label=None, *args, **kwargs): def __call__(self, preds, label=None, *args, **kwargs):
output = dict() output = dict()
...@@ -132,6 +134,8 @@ class DistillationCTCLabelDecode(CTCLabelDecode): ...@@ -132,6 +134,8 @@ class DistillationCTCLabelDecode(CTCLabelDecode):
pred = preds[name] pred = preds[name]
if self.key is not None: if self.key is not None:
pred = pred[self.key] pred = pred[self.key]
if self.multi_head and isinstance(pred, dict):
pred = pred['ctc']
output[name] = super().__call__(pred, label=label, *args, **kwargs) output[name] = super().__call__(pred, label=label, *args, **kwargs)
return output return output
...@@ -196,7 +200,7 @@ class NRTRLabelDecode(BaseRecLabelDecode): ...@@ -196,7 +200,7 @@ class NRTRLabelDecode(BaseRecLabelDecode):
else: else:
conf_list.append(1) conf_list.append(1)
text = ''.join(char_list) text = ''.join(char_list)
result_list.append((text.lower(), np.mean(conf_list))) result_list.append((text.lower(), np.mean(conf_list).tolist()))
return result_list return result_list
...@@ -241,7 +245,7 @@ class AttnLabelDecode(BaseRecLabelDecode): ...@@ -241,7 +245,7 @@ class AttnLabelDecode(BaseRecLabelDecode):
else: else:
conf_list.append(1) conf_list.append(1)
text = ''.join(char_list) text = ''.join(char_list)
result_list.append((text, np.mean(conf_list))) result_list.append((text, np.mean(conf_list).tolist()))
return result_list return result_list
def __call__(self, preds, label=None, *args, **kwargs): def __call__(self, preds, label=None, *args, **kwargs):
...@@ -333,7 +337,7 @@ class SEEDLabelDecode(BaseRecLabelDecode): ...@@ -333,7 +337,7 @@ class SEEDLabelDecode(BaseRecLabelDecode):
else: else:
conf_list.append(1) conf_list.append(1)
text = ''.join(char_list) text = ''.join(char_list)
result_list.append((text, np.mean(conf_list))) result_list.append((text, np.mean(conf_list).tolist()))
return result_list return result_list
def __call__(self, preds, label=None, *args, **kwargs): def __call__(self, preds, label=None, *args, **kwargs):
...@@ -417,7 +421,7 @@ class SRNLabelDecode(BaseRecLabelDecode): ...@@ -417,7 +421,7 @@ class SRNLabelDecode(BaseRecLabelDecode):
conf_list.append(1) conf_list.append(1)
text = ''.join(char_list) text = ''.join(char_list)
result_list.append((text, np.mean(conf_list))) result_list.append((text, np.mean(conf_list).tolist()))
return result_list return result_list
def add_special_char(self, dict_character): def add_special_char(self, dict_character):
...@@ -636,7 +640,7 @@ class SARLabelDecode(BaseRecLabelDecode): ...@@ -636,7 +640,7 @@ class SARLabelDecode(BaseRecLabelDecode):
comp = re.compile('[^A-Z^a-z^0-9^\u4e00-\u9fa5]') comp = re.compile('[^A-Z^a-z^0-9^\u4e00-\u9fa5]')
text = text.lower() text = text.lower()
text = comp.sub('', text) text = comp.sub('', text)
result_list.append((text, np.mean(conf_list))) result_list.append((text, np.mean(conf_list).tolist()))
return result_list return result_list
def __call__(self, preds, label=None, *args, **kwargs): def __call__(self, preds, label=None, *args, **kwargs):
...@@ -656,6 +660,40 @@ class SARLabelDecode(BaseRecLabelDecode): ...@@ -656,6 +660,40 @@ class SARLabelDecode(BaseRecLabelDecode):
return [self.padding_idx] return [self.padding_idx]
class DistillationSARLabelDecode(SARLabelDecode):
"""
Convert
Convert between text-label and text-index
"""
def __init__(self,
character_dict_path=None,
use_space_char=False,
model_name=["student"],
key=None,
multi_head=False,
**kwargs):
super(DistillationSARLabelDecode, self).__init__(character_dict_path,
use_space_char)
if not isinstance(model_name, list):
model_name = [model_name]
self.model_name = model_name
self.key = key
self.multi_head = multi_head
def __call__(self, preds, label=None, *args, **kwargs):
output = dict()
for name in self.model_name:
pred = preds[name]
if self.key is not None:
pred = pred[self.key]
if self.multi_head and isinstance(pred, dict):
pred = pred['sar']
output[name] = super().__call__(pred, label=label, *args, **kwargs)
return output
class PRENLabelDecode(BaseRecLabelDecode): class PRENLabelDecode(BaseRecLabelDecode):
""" Convert between text-label and text-index """ """ Convert between text-label and text-index """
...@@ -699,7 +737,7 @@ class PRENLabelDecode(BaseRecLabelDecode): ...@@ -699,7 +737,7 @@ class PRENLabelDecode(BaseRecLabelDecode):
text = ''.join(char_list) text = ''.join(char_list)
if len(text) > 0: if len(text) > 0:
result_list.append((text, np.mean(conf_list))) result_list.append((text, np.mean(conf_list).tolist()))
else: else:
# here confidence of empty recog result is 1 # here confidence of empty recog result is 1
result_list.append(('', 1)) result_list.append(('', 1))
...@@ -714,3 +752,40 @@ class PRENLabelDecode(BaseRecLabelDecode): ...@@ -714,3 +752,40 @@ class PRENLabelDecode(BaseRecLabelDecode):
return text return text
label = self.decode(label) label = self.decode(label)
return text, label return text, label
class SVTRLabelDecode(BaseRecLabelDecode):
""" Convert between text-label and text-index """
def __init__(self, character_dict_path=None, use_space_char=False,
**kwargs):
super(SVTRLabelDecode, self).__init__(character_dict_path,
use_space_char)
def __call__(self, preds, label=None, *args, **kwargs):
if isinstance(preds, tuple):
preds = preds[-1]
if isinstance(preds, paddle.Tensor):
preds = preds.numpy()
preds_idx = preds.argmax(axis=-1)
preds_prob = preds.max(axis=-1)
text = self.decode(preds_idx, preds_prob, is_remove_duplicate=True)
return_text = []
for i in range(0, len(text), 3):
text0 = text[i]
text1 = text[i + 1]
text2 = text[i + 2]
text_pred = [text0[0], text1[0], text2[0]]
text_prob = [text0[1], text1[1], text2[1]]
id_max = text_prob.index(max(text_prob))
return_text.append((text_pred[id_max], text_prob[id_max]))
if label is None:
return return_text
label = self.decode(label)
return return_text, label
def add_special_char(self, dict_character):
dict_character = ['blank'] + dict_character
return dict_character
\ No newline at end of file
...@@ -21,7 +21,7 @@ l ...@@ -21,7 +21,7 @@ l
8 8
. .
j j
p p
......
...@@ -22,7 +22,7 @@ l ...@@ -22,7 +22,7 @@ l
8 8
. .
j j
p p
......
[English](README.md) | 简体中文 [English](README.md) | 简体中文
- [1. 简介](#1-简介) # PP-Structure 文档分析
- [2. 近期更新](#2-近期更新)
- [3. 特性](#3-特性) - [1. 简介](#1)
- [4. 效果展示](#4-效果展示) - [2. 近期更新](#2)
- [4.1 版面分析和表格识别](#41-版面分析和表格识别) - [3. 特性](#3)
- [4.2 DOC-VQA](#42-doc-vqa) - [4. 效果展示](#4)
- [5. 快速体验](#5-快速体验) - [4.1 版面分析和表格识别](#41)
- [6. PP-Structure 介绍](#6-pp-structure-介绍) - [4.2 DocVQA](#42)
- [6.1 版面分析+表格识别](#61-版面分析表格识别) - [5. 快速体验](#5)
- [6.1.1 版面分析](#611-版面分析) - [6. PP-Structure 介绍](#6)
- [6.1.2 表格识别](#612-表格识别) - [6.1 版面分析+表格识别](#61)
- [6.2 DOC-VQA](#62-doc-vqa) - [6.1.1 版面分析](#611)
- [7. 模型库](#7-模型库) - [6.1.2 表格识别](#612)
- [7.1 版面分析模型](#71-版面分析模型) - [6.2 DocVQA](#62)
- [7.2 OCR和表格识别模型](#72-ocr和表格识别模型) - [7. 模型库](#7)
- [7.2 DOC-VQA 模型](#72-doc-vqa-模型) - [7.1 版面分析模型](#71)
- [7.2 OCR和表格识别模型](#72)
- [7.3 DocVQA 模型](#73)
<a name="1"></a>
## 1. 简介 ## 1. 简介
PP-Structure是一个可用于复杂文档结构分析和处理的OCR工具包,旨在帮助开发者更好的完成文档理解相关任务。 PP-Structure是一个可用于复杂文档结构分析和处理的OCR工具包,旨在帮助开发者更好的完成文档理解相关任务。
<a name="2"></a>
## 2. 近期更新 ## 2. 近期更新
* 2022.02.12 DOC-VQA增加LayoutLMv2模型。 * 2022.02.12 DocVQA增加LayoutLMv2模型。
* 2021.12.07 新增[DOC-VQA任务SER和RE](vqa/README.md) * 2021.12.07 新增[DOC-VQA任务SER和RE](vqa/README.md)
<a name="3"></a>
## 3. 特性 ## 3. 特性
PP-Structure的主要特性如下: PP-Structure的主要特性如下:
...@@ -33,21 +37,24 @@ PP-Structure的主要特性如下: ...@@ -33,21 +37,24 @@ PP-Structure的主要特性如下:
- 支持表格区域进行结构化分析,最终结果输出Excel文件 - 支持表格区域进行结构化分析,最终结果输出Excel文件
- 支持python whl包和命令行两种方式,简单易用 - 支持python whl包和命令行两种方式,简单易用
- 支持版面分析和表格结构化两类任务自定义训练 - 支持版面分析和表格结构化两类任务自定义训练
- 支持文档视觉问答(Document Visual Question Answering,DOC-VQA)任务-语义实体识别(Semantic Entity Recognition,SER)和关系抽取(Relation Extraction,RE) - 支持文档视觉问答(Document Visual Question Answering,DocVQA)任务-语义实体识别(Semantic Entity Recognition,SER)和关系抽取(Relation Extraction,RE)
<a name="4"></a>
## 4. 效果展示 ## 4. 效果展示
<a name="41"></a>
### 4.1 版面分析和表格识别 ### 4.1 版面分析和表格识别
<img src="../doc/table/ppstructure.GIF" width="100%"/> <img src="./docs/table/ppstructure.GIF" width="100%"/>
图中展示了版面分析+表格识别的整体流程,图片先有版面分析划分为图像、文本、标题和表格四种区域,然后对图像、文本和标题三种区域进行OCR的检测识别,对表格进行表格识别,其中图像还会被存储下来以便使用。 图中展示了版面分析+表格识别的整体流程,图片先有版面分析划分为图像、文本、标题和表格四种区域,然后对图像、文本和标题三种区域进行OCR的检测识别,对表格进行表格识别,其中图像还会被存储下来以便使用。
<a name="42"></a>
### 4.2 DOC-VQA ### 4.2 DOC-VQA
* SER * SER
![](../doc/vqa/result_ser/zh_val_0_ser.jpg) | ![](../doc/vqa/result_ser/zh_val_42_ser.jpg) ![](./docs/vqa/result_ser/zh_val_0_ser.jpg) | ![](./docs/vqa/result_ser/zh_val_42_ser.jpg)
---|--- ---|---
图中不同颜色的框表示不同的类别,对于XFUN数据集,有`QUESTION`, `ANSWER`, `HEADER` 3种类别 图中不同颜色的框表示不同的类别,对于XFUN数据集,有`QUESTION`, `ANSWER`, `HEADER` 3种类别
...@@ -60,46 +67,55 @@ PP-Structure的主要特性如下: ...@@ -60,46 +67,55 @@ PP-Structure的主要特性如下:
* RE * RE
![](../doc/vqa/result_re/zh_val_21_re.jpg) | ![](../doc/vqa/result_re/zh_val_40_re.jpg) ![](./docs/vqa/result_re/zh_val_21_re.jpg) | ![](./docs/vqa/result_re/zh_val_40_re.jpg)
---|--- ---|---
图中红色框表示问题,蓝色框表示答案,问题和答案之间使用绿色线连接。在OCR检测框的左上方也标出了对应的类别和OCR识别结果。 图中红色框表示问题,蓝色框表示答案,问题和答案之间使用绿色线连接。在OCR检测框的左上方也标出了对应的类别和OCR识别结果。
<a name="5"></a>
## 5. 快速体验 ## 5. 快速体验
请参考[快速安装](./docs/quickstart.md)教程。 请参考[快速使用](./docs/quickstart.md)教程。
<a name="6"></a>
## 6. PP-Structure 介绍 ## 6. PP-Structure 介绍
<a name="61"></a>
### 6.1 版面分析+表格识别 ### 6.1 版面分析+表格识别
![pipeline](../doc/table/pipeline.jpg) ![pipeline](./docs/table/pipeline.jpg)
在PP-Structure中,图片会先经由Layout-Parser进行版面分析,在版面分析中,会对图片里的区域进行分类,包括**文字、标题、图片、列表和表格**5类。对于前4类区域,直接使用PP-OCR完成对应区域文字检测与识别。对于表格类区域,经过表格结构化处理后,表格图片转换为相同表格样式的Excel文件。 在PP-Structure中,图片会先经由Layout-Parser进行版面分析,在版面分析中,会对图片里的区域进行分类,包括**文字、标题、图片、列表和表格**5类。对于前4类区域,直接使用PP-OCR完成对应区域文字检测与识别。对于表格类区域,经过表格结构化处理后,表格图片转换为相同表格样式的Excel文件。
<a name="611"></a>
#### 6.1.1 版面分析 #### 6.1.1 版面分析
版面分析对文档数据进行区域分类,其中包括版面分析工具的Python脚本使用、提取指定类别检测框、性能指标以及自定义训练版面分析模型,详细内容可以参考[文档](layout/README_ch.md) 版面分析对文档数据进行区域分类,其中包括版面分析工具的Python脚本使用、提取指定类别检测框、性能指标以及自定义训练版面分析模型,详细内容可以参考[文档](layout/README_ch.md)
<a name="612"></a>
#### 6.1.2 表格识别 #### 6.1.2 表格识别
表格识别将表格图片转换为excel文档,其中包含对于表格文本的检测和识别以及对于表格结构和单元格坐标的预测,详细说明参考[文档](table/README_ch.md) 表格识别将表格图片转换为excel文档,其中包含对于表格文本的检测和识别以及对于表格结构和单元格坐标的预测,详细说明参考[文档](table/README_ch.md)
### 6.2 DOC-VQA <a name="62"></a>
### 6.2 DocVQA
DOC-VQA指文档视觉问答,其中包括语义实体识别 (Semantic Entity Recognition, SER) 和关系抽取 (Relation Extraction, RE) 任务。基于 SER 任务,可以完成对图像中的文本识别与分类;基于 RE 任务,可以完成对图象中的文本内容的关系提取,如判断问题对(pair),详细说明参考[文档](vqa/README.md) DocVQA指文档视觉问答,其中包括语义实体识别 (Semantic Entity Recognition, SER) 和关系抽取 (Relation Extraction, RE) 任务。基于 SER 任务,可以完成对图像中的文本识别与分类;基于 RE 任务,可以完成对图象中的文本内容的关系提取,如判断问题对(pair),详细说明参考[文档](vqa/README.md)
<a name="7"></a>
## 7. 模型库 ## 7. 模型库
PP-Structure系列模型列表(更新中) PP-Structure系列模型列表(更新中)
<a name="71"></a>
### 7.1 版面分析模型 ### 7.1 版面分析模型
|模型名称|模型简介|下载地址| label_map| |模型名称|模型简介|下载地址| label_map|
| --- | --- | --- | --- | | --- | --- | --- | --- |
| ppyolov2_r50vd_dcn_365e_publaynet | PubLayNet 数据集训练的版面分析模型,可以划分**文字、标题、表格、图片以及列表**5类区域 | [PubLayNet](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) | {0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}| | ppyolov2_r50vd_dcn_365e_publaynet | PubLayNet 数据集训练的版面分析模型,可以划分**文字、标题、表格、图片以及列表**5类区域 | [PubLayNet](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) | {0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}|
<a name="72"></a>
### 7.2 OCR和表格识别模型 ### 7.2 OCR和表格识别模型
|模型名称|模型简介|模型大小|下载地址| |模型名称|模型简介|模型大小|下载地址|
...@@ -108,7 +124,8 @@ PP-Structure系列模型列表(更新中) ...@@ -108,7 +124,8 @@ PP-Structure系列模型列表(更新中)
|ch_PP-OCRv2_rec_slim|【最新】slim量化版超轻量模型,支持中英文、数字识别| 9M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) | |ch_PP-OCRv2_rec_slim|【最新】slim量化版超轻量模型,支持中英文、数字识别| 9M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) |
|en_ppocr_mobile_v2.0_table_structure|PubLayNet数据集训练的英文表格场景的表格结构预测|18.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) | |en_ppocr_mobile_v2.0_table_structure|PubLayNet数据集训练的英文表格场景的表格结构预测|18.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) |
### 7.2 DOC-VQA 模型 <a name="73"></a>
### 7.3 DocVQA 模型
|模型名称|模型简介|模型大小|下载地址| |模型名称|模型简介|模型大小|下载地址|
| --- | --- | --- | --- | | --- | --- | --- | --- |
......
# 基于Python预测引擎推理
- [1. Structure](#1)
- [1.1 版面分析+表格识别](#1.1)
- [1.2 版面分析](#1.2)
- [1.3 表格识别](#1.3)
- [2. DocVQA](#2)
<a name="1"></a>
## 1. Structure
进入`ppstructure`目录
```bash
cd ppstructure
````
下载模型
```bash
mkdir inference && cd inference
# 下载PP-OCRv2文本检测模型并解压
wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar && tar xf ch_PP-OCRv2_det_slim_quant_infer.tar
# 下载PP-OCRv2文本识别模型并解压
wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar && tar xf ch_PP-OCRv2_rec_slim_quant_infer.tar
# 下载超轻量级英文表格预测模型并解压
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar
cd ..
```
<a name="1.1"></a>
### 1.1 版面分析+表格识别
```bash
python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \
--rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \
--table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \
--image_dir=./docs/table/1.png \
--rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \
--table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \
--output=../output \
--vis_font_path=../doc/fonts/simfang.ttf
```
运行完成后,每张图片会在`output`字段指定的目录下的`structure`目录下有一个同名目录,图片里的每个表格会存储为一个excel,图片区域会被裁剪之后保存下来,excel文件和图片名为表格在图片里的坐标。详细的结果会存储在`res.txt`文件中。
<a name="1.2"></a>
### 1.2 版面分析
```bash
python3 predict_system.py --image_dir=./docs/table/1.png --table=false --ocr=false --output=../output/
```
运行完成后,每张图片会在`output`字段指定的目录下的`structure`目录下有一个同名目录,图片区域会被裁剪之后保存下来,图片名为表格在图片里的坐标。版面分析结果会存储在`res.txt`文件中。
<a name="1.3"></a>
### 1.3 表格识别
```bash
python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \
--rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \
--table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \
--image_dir=./docs/table/table.jpg \
--rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \
--table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \
--output=../output \
--vis_font_path=../doc/fonts/simfang.ttf \
--layout=false
```
运行完成后,每张图片会在`output`字段指定的目录下的`structure`目录下有一个同名目录,表格会存储为一个excel,excel文件名为`[0,0,img_h,img_w]`。
<a name="2"></a>
## 2. DocVQA
```bash
cd ppstructure
# 下载模型
mkdir inference && cd inference
# 下载SER xfun 模型并解压
wget https://paddleocr.bj.bcebos.com/pplayout/PP-Layout_v1.0_ser_pretrained.tar && tar xf PP-Layout_v1.0_ser_pretrained.tar
cd ..
python3 predict_system.py --model_name_or_path=vqa/PP-Layout_v1.0_ser_pretrained/ \
--mode=vqa \
--image_dir=vqa/images/input/zh_val_0.jpg \
--vis_font_path=../doc/fonts/simfang.ttf
```
运行完成后,每张图片会在`output`字段指定的目录下的`vqa`目录下存放可视化之后的图片,图片名和输入图片名一致。
# Python Inference
- [1. Structure](#1)
- [1.1 layout analysis + table recognition](#1.1)
- [1.2 layout analysis](#1.2)
- [1.3 table recognition](#1.3)
- [2. DocVQA](#2)
<a name="1"></a>
## 1. Structure
Go to the `ppstructure` directory
```bash
cd ppstructure
````
download model
```bash
mkdir inference && cd inference
# Download the PP-OCRv2 text detection model and unzip it
wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar && tar xf ch_PP-OCRv2_det_slim_quant_infer.tar
# Download the PP-OCRv2 text recognition model and unzip it
wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar && tar xf ch_PP-OCRv2_rec_slim_quant_infer.tar
# Download the ultra-lightweight English table structure model and unzip it
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar
cd ..
```
<a name="1.1"></a>
### 1.1 layout analysis + table recognition
```bash
python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \
--rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \
--table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \
--image_dir=./docs/table/1.png \
--rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \
--table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \
--output=../output \
--vis_font_path=../doc/fonts/simfang.ttf
```
After the operation is completed, each image will have a directory with the same name in the `structure` directory under the directory specified by the `output` field. Each table in the image will be stored as an excel, and the picture area will be cropped and saved. The filename of excel and picture is their coordinates in the image. Detailed results are stored in the `res.txt` file.
<a name="1.2"></a>
### 1.2 layout analysis
```bash
python3 predict_system.py --image_dir=./docs/table/1.png --table=false --ocr=false --output=../output/
```
After the operation is completed, each image will have a directory with the same name in the `structure` directory under the directory specified by the `output` field. Each picture in image will be cropped and saved. The filename of picture area is their coordinates in the image. Layout analysis results will be stored in the `res.txt` file
<a name="1.3"></a>
### 1.3 table recognition
```bash
python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \
--rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \
--table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \
--image_dir=./docs/table/table.jpg \
--rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \
--table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \
--output=../output \
--vis_font_path=../doc/fonts/simfang.ttf \
--layout=false
```
After the operation is completed, each image will have a directory with the same name in the `structure` directory under the directory specified by the `output` field. Each table in the image will be stored as an excel. The filename of excel is their coordinates in the image.
<a name="2"></a>
## 2. DocVQA
```bash
cd ppstructure
# download model
mkdir inference && cd inference
wget https://paddleocr.bj.bcebos.com/pplayout/PP-Layout_v1.0_ser_pretrained.tar && tar xf PP-Layout_v1.0_ser_pretrained.tar
cd ..
python3 predict_system.py --model_name_or_path=vqa/PP-Layout_v1.0_ser_pretrained/ \
--mode=vqa \
--image_dir=vqa/images/input/zh_val_0.jpg \
--vis_font_path=../doc/fonts/simfang.ttf
```
After the operation is completed, each image will store the visualized image in the `vqa` directory under the directory specified by the `output` field, and the image name is the same as the input image name.
- [PP-Structure 系列模型列表](#pp-structure-系列模型列表)
- [1. LayoutParser 模型](#1-layoutparser-模型)
- [2. OCR和表格识别模型](#2-ocr和表格识别模型)
- [2.1 OCR](#21-ocr)
- [2.2 表格识别模型](#22-表格识别模型)
- [3. VQA模型](#3-vqa模型)
- [4. KIE模型](#4-kie模型)
# PP-Structure 系列模型列表 # PP-Structure 系列模型列表
- [1. 版面分析模型](#1)
- [2. OCR和表格识别模型](#2)
- [2.1 OCR](#21)
- [2.2 表格识别模型](#22)
- [3. VQA模型](#3)
- [4. KIE模型](#4)
## 1. LayoutParser 模型 <a name="1"></a>
## 1. 版面分析模型
|模型名称|模型简介|下载地址|label_map| |模型名称|模型简介|下载地址|label_map|
| --- | --- | --- | --- | | --- | --- | --- | --- |
...@@ -17,8 +17,10 @@ ...@@ -17,8 +17,10 @@
| ppyolov2_r50vd_dcn_365e_tableBank_word | TableBank Word 数据集训练的版面分析模型,只能检测表格 | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | {0:"Table"}| | ppyolov2_r50vd_dcn_365e_tableBank_word | TableBank Word 数据集训练的版面分析模型,只能检测表格 | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | {0:"Table"}|
| ppyolov2_r50vd_dcn_365e_tableBank_latex | TableBank Latex 数据集训练的版面分析模型,只能检测表格 | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | {0:"Table"}| | ppyolov2_r50vd_dcn_365e_tableBank_latex | TableBank Latex 数据集训练的版面分析模型,只能检测表格 | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | {0:"Table"}|
<a name="2"></a>
## 2. OCR和表格识别模型 ## 2. OCR和表格识别模型
<a name="21"></a>
### 2.1 OCR ### 2.1 OCR
|模型名称|模型简介|推理模型大小|下载地址| |模型名称|模型简介|推理模型大小|下载地址|
...@@ -28,12 +30,14 @@ ...@@ -28,12 +30,14 @@
如需要使用其他OCR模型,可以在 [PP-OCR model_list](../../doc/doc_ch/models_list.md) 下载模型或者使用自己训练好的模型配置到 `det_model_dir`, `rec_model_dir`两个字段即可。 如需要使用其他OCR模型,可以在 [PP-OCR model_list](../../doc/doc_ch/models_list.md) 下载模型或者使用自己训练好的模型配置到 `det_model_dir`, `rec_model_dir`两个字段即可。
<a name="22"></a>
### 2.2 表格识别模型 ### 2.2 表格识别模型
|模型名称|模型简介|推理模型大小|下载地址| |模型名称|模型简介|推理模型大小|下载地址|
| --- | --- | --- | --- | | --- | --- | --- | --- |
|en_ppocr_mobile_v2.0_table_structure|PubLayNet数据集训练的英文表格场景的表格结构预测|18.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) | |en_ppocr_mobile_v2.0_table_structure|PubLayNet数据集训练的英文表格场景的表格结构预测|18.6M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) |
<a name="3"></a>
## 3. VQA模型 ## 3. VQA模型
|模型名称|模型简介|推理模型大小|下载地址| |模型名称|模型简介|推理模型大小|下载地址|
...@@ -44,6 +48,7 @@ ...@@ -44,6 +48,7 @@
|re_LayoutLMv2_xfun_zh|基于LayoutLMv2在xfun中文数据集上训练的RE模型|765M|[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar) | |re_LayoutLMv2_xfun_zh|基于LayoutLMv2在xfun中文数据集上训练的RE模型|765M|[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar) |
|ser_LayoutLM_xfun_zh|基于LayoutLM在xfun中文数据集上训练的SER模型|430M|[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar) | |ser_LayoutLM_xfun_zh|基于LayoutLM在xfun中文数据集上训练的SER模型|430M|[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar) |
<a name="4"></a>
## 4. KIE模型 ## 4. KIE模型
|模型名称|模型简介|模型大小|下载地址| |模型名称|模型简介|模型大小|下载地址|
......
# PP-Structure Model list
- [1. Layout Analysis](#1)
- [2. OCR and Table Recognition](#2)
- [2.1 OCR](#21)
- [2.2 Table Recognition](#22)
- [3. VQA](#3)
- [4. KIE](#4)
<a name="1"></a>
## 1. Layout Analysis
|model name| description |download|label_map|
| --- |---------------------------------------------------------------------------------------------------------------------------------------------------------| --- | --- |
| ppyolov2_r50vd_dcn_365e_publaynet | The layout analysis model trained on the PubLayNet dataset, the model can recognition 5 types of areas such as **text, title, table, picture and list** | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) / [trained model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet_pretrained.pdparams) |{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}|
| ppyolov2_r50vd_dcn_365e_tableBank_word | The layout analysis model trained on the TableBank Word dataset, the model can only detect tables | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | {0:"Table"}|
| ppyolov2_r50vd_dcn_365e_tableBank_latex | The layout analysis model trained on the TableBank Latex dataset, the model can only detect tables | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | {0:"Table"}|
<a name="2"></a>
## 2. OCR and Table Recognition
<a name="21"></a>
### 2.1 OCR
|model name| description | inference model size |download|
| --- |---|---| --- |
|en_ppocr_mobile_v2.0_table_det| Text detection model of English table scenes trained on PubTabNet dataset | 4.7M |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_det_train.tar) |
|en_ppocr_mobile_v2.0_table_rec| Text recognition model of English table scenes trained on PubTabNet dataset | 6.9M |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_rec_train.tar) |
If you need to use other OCR models, you can download the model in [PP-OCR model_list](../../doc/doc_ch/models_list.md) or use the model you trained yourself to configure to `det_model_dir`, `rec_model_dir` field.
<a name="22"></a>
### 2.2 Table Recognition
|model| description |inference model size|download|
| --- |-----------------------------------------------------------------------------| --- | --- |
|en_ppocr_mobile_v2.0_table_structure| Table structure model for English table scenes trained on PubTabNet dataset |18.6M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) |
<a name="3"></a>
## 3. VQA
|model| description |inference model size|download|
| --- |----------------------------------------------------------------| --- | --- |
|ser_LayoutXLM_xfun_zh| SER model trained on xfun Chinese dataset based on LayoutXLM |1.4G|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar) |
|re_LayoutXLM_xfun_zh| Re model trained on xfun Chinese dataset based on LayoutXLM |1.4G|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar) |
|ser_LayoutLMv2_xfun_zh| SER model trained on xfun Chinese dataset based on LayoutXLMv2 |778M|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar) |
|re_LayoutLMv2_xfun_zh| Re model trained on xfun Chinese dataset based on LayoutXLMv2 |765M|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar) |
|ser_LayoutLM_xfun_zh| SER model trained on xfun Chinese dataset based on LayoutLM |430M|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar) |
<a name="4"></a>
## 4. KIE
|model|description|model size|download|
| --- | --- | --- | --- |
|SDMGR|Key Information Extraction Model|78M|[inference model coming soon]() / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)|
# PP-Structure 快速开始 # PP-Structure 快速开始
- [PP-Structure 快速开始](#pp-structure-快速开始) - [1. 安装依赖包](#1)
- [1. 安装依赖包](#1-安装依赖包) - [2. 便捷使用](#2)
- [2. 便捷使用](#2-便捷使用) - [2.1 命令行使用](#21)
- [2.1 命令行使用](#21-命令行使用) - [2.1.1 版面分析+表格识别](#211)
- [2.2 Python脚本使用](#22-python脚本使用) - [2.1.2 版面分析](#212)
- [2.3 返回结果说明](#23-返回结果说明) - [2.1.3 表格识别](#213)
- [2.4 参数说明](#24-参数说明) - [2.1.4 DocVQA](#214)
- [3. Python脚本使用](#3-python脚本使用) - [2.2 代码使用](#22)
- [2.2.1 版面分析+表格识别](#221)
- [2.2.2 版面分析](#222)
- [2.2.3 表格识别](#223)
- [2.2.4 DocVQA](#224)
- [2.3 返回结果说明](#23)
- [2.3.1 版面分析+表格识别](#231)
- [2.3.2 DocVQA](#232)
- [2.4 参数说明](#24)
<a name="1"></a>
## 1. 安装依赖包 ## 1. 安装依赖包
```bash ```bash
pip install "paddleocr>=2.3.0.2" # 推荐使用2.3.0.2+版本 # 安装 paddleocr,推荐使用2.5+版本
pip3 install "paddleocr>=2.5"
# 安装 版面分析依赖包layoutparser(如不需要版面分析功能,可跳过)
pip3 install -U https://paddleocr.bj.bcebos.com/whl/layoutparser-0.0.0-py3-none-any.whl pip3 install -U https://paddleocr.bj.bcebos.com/whl/layoutparser-0.0.0-py3-none-any.whl
# 安装 DocVQA依赖包paddlenlp(如不需要DocVQA功能,可跳过)
# 安装 PaddleNLP pip install paddlenlp
git clone https://github.com/PaddlePaddle/PaddleNLP -b develop
cd PaddleNLP
pip3 install -e .
``` ```
<a name="2"></a>
## 2. 便捷使用 ## 2. 便捷使用
### 2.1 命令行使用 <a name="21"></a>
### 2.1 命令行使用
* 版面分析+表格识别 <a name="211"></a>
#### 2.1.1 版面分析+表格识别
```bash ```bash
paddleocr --image_dir=../doc/table/1.png --type=structure paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/1.png --type=structure
``` ```
* VQA <a name="212"></a>
#### 2.1.2 版面分析
```bash
paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/1.png --type=structure --table=false --ocr=false
```
<a name="213"></a>
#### 2.1.3 表格识别
```bash
paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/table.jpg --type=structure --layout=false
```
<a name="214"></a>
#### 2.1.4 DocVQA
请参考:[文档视觉问答](../vqa/README.md) 请参考:[文档视觉问答](../vqa/README.md)
### 2.2 Python脚本使用 <a name="22"></a>
### 2.2 代码使用
<a name="221"></a>
#### 2.2.1 版面分析+表格识别
* 版面分析+表格识别
```python ```python
import os import os
import cv2 import cv2
...@@ -45,8 +73,8 @@ from paddleocr import PPStructure,draw_structure_result,save_structure_res ...@@ -45,8 +73,8 @@ from paddleocr import PPStructure,draw_structure_result,save_structure_res
table_engine = PPStructure(show_log=True) table_engine = PPStructure(show_log=True)
save_folder = './output/table' save_folder = './output'
img_path = '../doc/table/1.png' img_path = 'PaddleOCR/ppstructure/docs/table/1.png'
img = cv2.imread(img_path) img = cv2.imread(img_path)
result = table_engine(img) result = table_engine(img)
save_structure_res(result, save_folder,os.path.basename(img_path).split('.')[0]) save_structure_res(result, save_folder,os.path.basename(img_path).split('.')[0])
...@@ -57,21 +85,66 @@ for line in result: ...@@ -57,21 +85,66 @@ for line in result:
from PIL import Image from PIL import Image
font_path = '../doc/fonts/simfang.ttf' # PaddleOCR下提供字体包 font_path = 'PaddleOCR/doc/fonts/simfang.ttf' # PaddleOCR下提供字体包
image = Image.open(img_path).convert('RGB') image = Image.open(img_path).convert('RGB')
im_show = draw_structure_result(image, result,font_path=font_path) im_show = draw_structure_result(image, result,font_path=font_path)
im_show = Image.fromarray(im_show) im_show = Image.fromarray(im_show)
im_show.save('result.jpg') im_show.save('result.jpg')
``` ```
* VQA <a name="222"></a>
#### 2.2.2 版面分析
```python
import os
import cv2
from paddleocr import PPStructure,save_structure_res
table_engine = PPStructure(table=False, ocr=False, show_log=True)
save_folder = './output'
img_path = 'PaddleOCR/ppstructure/docs/table/1.png'
img = cv2.imread(img_path)
result = table_engine(img)
save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0])
for line in result:
line.pop('img')
print(line)
```
<a name="223"></a>
#### 2.2.3 表格识别
```python
import os
import cv2
from paddleocr import PPStructure,save_structure_res
table_engine = PPStructure(layout=False, show_log=True)
save_folder = './output'
img_path = 'PaddleOCR/ppstructure/docs/table/table.jpg'
img = cv2.imread(img_path)
result = table_engine(img)
save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0])
for line in result:
line.pop('img')
print(line)
```
<a name="224"></a>
#### 2.2.4 DocVQA
请参考:[文档视觉问答](../vqa/README.md) 请参考:[文档视觉问答](../vqa/README.md)
<a name="23"></a>
### 2.3 返回结果说明 ### 2.3 返回结果说明
PP-Structure的返回结果为一个dict组成的list,示例如下 PP-Structure的返回结果为一个dict组成的list,示例如下
* 版面分析+表格识别 <a name="231"></a>
#### 2.3.1 版面分析+表格识别
```shell ```shell
[ [
{ 'type': 'Text', { 'type': 'Text',
...@@ -83,77 +156,43 @@ PP-Structure的返回结果为一个dict组成的list,示例如下 ...@@ -83,77 +156,43 @@ PP-Structure的返回结果为一个dict组成的list,示例如下
``` ```
dict 里各个字段说明如下 dict 里各个字段说明如下
| 字段 | 说明 | | 字段 | 说明 |
| --------------- | -------------| | --------------- |-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|type|图片区域的类型| |type| 图片区域的类型 |
|bbox|图片区域的在原图的坐标,分别[左上角x,左上角y,右下角x,右下角y]| |bbox| 图片区域的在原图的坐标,分别[左上角x,左上角y,右下角x,右下角y] |
|res|图片区域的OCR或表格识别结果。<br> 表格: 表格的HTML字符串; <br> OCR: 一个包含各个单行文字的检测坐标和识别结果的元组| |res| 图片区域的OCR或表格识别结果。<br> 表格: 一个dict,字段说明如下<br>&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp; `html`: 表格的HTML字符串<br>&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp; 在代码使用模式下,前向传入return_ocr_result_in_table=True可以拿到表格中每个文本的检测识别结果,对应为如下字段: <br>&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp; `boxes`: 文本检测坐标<br>&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp; `rec_res`: 文本识别结果。<br> OCR: 一个包含各个单行文字的检测坐标和识别结果的元组 |
* VQA 运行完成后,每张图片会在`output`字段指定的目录下有一个同名目录,图片里的每个表格会存储为一个excel,图片区域会被裁剪之后保存下来,excel文件和图片名为表格在图片里的坐标。
```
/output/table/1/
└─ res.txt
└─ [454, 360, 824, 658].xlsx 表格识别结果
└─ [16, 2, 828, 305].jpg 被裁剪出的图片区域
└─ [17, 361, 404, 711].xlsx 表格识别结果
```
<a name="232"></a>
#### 2.3.2 DocVQA
请参考:[文档视觉问答](../vqa/README.md) 请参考:[文档视觉问答](../vqa/README.md)
<a name="24"></a>
### 2.4 参数说明 ### 2.4 参数说明
| 字段 | 说明 | 默认值 | | 字段 | 说明 | 默认值 |
| --------------- | ---------------------------------------- | ------------------------------------------- | |----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------|
| output | excel和识别结果保存的地址 | ./output/table | | output | excel和识别结果保存的地址 | ./output/table |
| table_max_len | 表格结构模型预测时,图像的长边resize尺度 | 488 | | table_max_len | 表格结构模型预测时,图像的长边resize尺度 | 488 |
| table_model_dir | 表格结构模型 inference 模型地址 | None | | table_model_dir | 表格结构模型 inference 模型地址 | None |
| table_char_dict_path | 表格结构模型所用字典地址 | ../ppocr/utils/dict/table_structure_dict.txt | | table_char_dict_path | 表格结构模型所用字典地址 | ../ppocr/utils/dict/table_structure_dict.txt |
| layout_path_model | 版面分析模型模型地址,可以为在线地址或者本地地址,当为本地地址时,需要指定 layout_label_map, 命令行模式下可通过--layout_label_map='{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}' 指定 | lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config | | layout_path_model | 版面分析模型模型地址,可以为在线地址或者本地地址,当为本地地址时,需要指定 layout_label_map, 命令行模式下可通过--layout_label_map='{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}' 指定 | lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config |
| layout_label_map | 版面分析模型模型label映射字典 | None | | layout_label_map | 版面分析模型模型label映射字典 | None |
| model_name_or_path | VQA SER模型地址 | None | | model_name_or_path | VQA SER模型地址 | None |
| max_seq_length | VQA SER模型最大支持token长度 | 512 | | max_seq_length | VQA SER模型最大支持token长度 | 512 |
| label_map_path | VQA SER 标签文件地址 | ./vqa/labels/labels_ser.txt | | label_map_path | VQA SER 标签文件地址 | ./vqa/labels/labels_ser.txt |
| mode | pipeline预测模式,structure: 版面分析+表格识别; VQA: SER文档信息抽取 | structure | | layout | 前向中是否执行版面分析 | True |
| table | 前向中是否执行表格识别 | True |
| ocr | 对于版面分析中的非表格区域,是否执行ocr。当layout为False时会被自动设置为False | True |
大部分参数和PaddleOCR whl包保持一致,见 [whl包文档](../../doc/doc_ch/whl.md) 大部分参数和PaddleOCR whl包保持一致,见 [whl包文档](../../doc/doc_ch/whl.md)
运行完成后,每张图片会在`output`字段指定的目录下有一个同名目录,图片里的每个表格会存储为一个excel,图片区域会被裁剪之后保存下来,excel文件和图片名名为表格在图片里的坐标。
## 3. Python脚本使用
* 版面分析+表格识别
```bash
cd ppstructure
# 下载模型
mkdir inference && cd inference
# 下载PP-OCRv2文本检测模型并解压
wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar && tar xf ch_PP-OCRv2_det_slim_quant_infer.tar
# 下载PP-OCRv2文本识别模型并解压
wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar && tar xf ch_PP-OCRv2_rec_slim_quant_infer.tar
# 下载超轻量级英文表格预测模型并解压
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar
cd ..
python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv2_det_slim_quant_infer \
--rec_model_dir=inference/ch_PP-OCRv2_rec_slim_quant_infer \
--table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer \
--image_dir=../doc/table/1.png \
--rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \
--table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \
--output=../output/table \
--vis_font_path=../doc/fonts/simfang.ttf
```
运行完成后,每张图片会在`output`字段指定的目录下的`talbe`目录下有一个同名目录,图片里的每个表格会存储为一个excel,图片区域会被裁剪之后保存下来,excel文件和图片名名为表格在图片里的坐标。
* VQA
```bash
cd ppstructure
# 下载模型
mkdir inference && cd inference
# 下载SER xfun 模型并解压
wget https://paddleocr.bj.bcebos.com/pplayout/PP-Layout_v1.0_ser_pretrained.tar && tar xf PP-Layout_v1.0_ser_pretrained.tar
cd ..
python3 predict_system.py --model_name_or_path=vqa/PP-Layout_v1.0_ser_pretrained/ \
--mode=vqa \
--image_dir=vqa/images/input/zh_val_0.jpg \
--vis_font_path=../doc/fonts/simfang.ttf
```
运行完成后,每张图片会在`output`字段指定的目录下的`vqa`目录下存放可视化之后的图片,图片名和输入图片名一致。
# PP-Structure Quick Start
- [1. Install package](#1)
- [2. Use](#2)
- [2.1 Use by command line](#21)
- [2.1.1 layout analysis + table recognition](#211)
- [2.1.2 layout analysis](#212)
- [2.1.3 table recognition](#213)
- [2.1.4 DocVQA](#214)
- [2.2 Use by code](#22)
- [2.2.1 layout analysis + table recognition](#221)
- [2.2.2 layout analysis](#222)
- [2.2.3 table recognition](#223)
- [2.2.4 DocVQA](#224)
- [2.3 Result description](#23)
- [2.3.1 layout analysis + table recognition](#231)
- [2.3.2 DocVQA](#232)
- [2.4 Parameter Description](#24)
<a name="1"></a>
## 1. Install package
```bash
# Install paddleocr, version 2.5+ is recommended
pip3 install "paddleocr>=2.5"
# Install layoutparser (if you do not use the layout analysis, you can skip it)
pip3 install -U https://paddleocr.bj.bcebos.com/whl/layoutparser-0.0.0-py3-none-any.whl
# Install the DocVQA dependency package paddlenlp (if you do not use the DocVQA, you can skip it)
pip install paddlenlp
```
<a name="2"></a>
## 2. Use
<a name="21"></a>
### 2.1 Use by command line
<a name="211"></a>
#### 2.1.1 layout analysis + table recognition
```bash
paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/1.png --type=structure
```
<a name="212"></a>
#### 2.1.2 layout analysis
```bash
paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/1.png --type=structure --table=false --ocr=false
```
<a name="213"></a>
#### 2.1.3 table recognition
```bash
paddleocr --image_dir=PaddleOCR/ppstructure/docs/table/table.jpg --type=structure --layout=false
```
<a name="214"></a>
#### 2.1.4 DocVQA
Please refer to: [Documentation Visual Q&A](../vqa/README.md) .
<a name="22"></a>
### 2.2 Use by code
<a name="221"></a>
#### 2.2.1 layout analysis + table recognition
```python
import os
import cv2
from paddleocr import PPStructure,draw_structure_result,save_structure_res
table_engine = PPStructure(show_log=True)
save_folder = './output'
img_path = 'PaddleOCR/ppstructure/docs/table/1.png'
img = cv2.imread(img_path)
result = table_engine(img)
save_structure_res(result, save_folder,os.path.basename(img_path).split('.')[0])
for line in result:
line.pop('img')
print(line)
from PIL import Image
font_path = 'PaddleOCR/doc/fonts/simfang.ttf' # PaddleOCR下提供字体包
image = Image.open(img_path).convert('RGB')
im_show = draw_structure_result(image, result,font_path=font_path)
im_show = Image.fromarray(im_show)
im_show.save('result.jpg')
```
<a name="222"></a>
#### 2.2.2 layout analysis
```python
import os
import cv2
from paddleocr import PPStructure,save_structure_res
table_engine = PPStructure(table=False, ocr=False, show_log=True)
save_folder = './output'
img_path = 'PaddleOCR/ppstructure/docs/table/1.png'
img = cv2.imread(img_path)
result = table_engine(img)
save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0])
for line in result:
line.pop('img')
print(line)
```
<a name="223"></a>
#### 2.2.3 table recognition
```python
import os
import cv2
from paddleocr import PPStructure,save_structure_res
table_engine = PPStructure(layout=False, show_log=True)
save_folder = './output'
img_path = 'PaddleOCR/ppstructure/docs/table/table.jpg'
img = cv2.imread(img_path)
result = table_engine(img)
save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0])
for line in result:
line.pop('img')
print(line)
```
<a name="224"></a>
#### 2.2.4 DocVQA
Please refer to: [Documentation Visual Q&A](../vqa/README.md) .
<a name="23"></a>
### 2.3 Result description
The return of PP-Structure is a list of dicts, the example is as follows:
<a name="231"></a>
#### 2.3.1 layout analysis + table recognition
```shell
[
{ 'type': 'Text',
'bbox': [34, 432, 345, 462],
'res': ([[36.0, 437.0, 341.0, 437.0, 341.0, 446.0, 36.0, 447.0], [41.0, 454.0, 125.0, 453.0, 125.0, 459.0, 41.0, 460.0]],
[('Tigure-6. The performance of CNN and IPT models using difforen', 0.90060663), ('Tent ', 0.465441)])
}
]
```
Each field in dict is described as follows:
| field | description |
| --------------- |--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|type| Type of image area. |
|bbox| The coordinates of the image area in the original image, respectively [upper left corner x, upper left corner y, lower right corner x, lower right corner y]. |
|res| OCR or table recognition result of the image area. <br> table: a dict with field descriptions as follows: <br>&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp; `html`: html str of table.<br>&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp; In the code usage mode, set return_ocr_result_in_table=True whrn call can get the detection and recognition results of each text in the table area, corresponding to the following fields: <br>&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp; `boxes`: text detection boxes.<br>&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp; `rec_res`: text recognition results.<br> OCR: A tuple containing the detection boxes and recognition results of each single text. |
After the recognition is completed, each image will have a directory with the same name under the directory specified by the `output` field. Each table in the image will be stored as an excel, and the picture area will be cropped and saved. The filename of excel and picture is their coordinates in the image.
```
/output/table/1/
└─ res.txt
└─ [454, 360, 824, 658].xlsx table recognition result
└─ [16, 2, 828, 305].jpg picture in Image
└─ [17, 361, 404, 711].xlsx table recognition result
```
<a name="232"></a>
#### 2.3.2 DocVQA
Please refer to: [Documentation Visual Q&A](../vqa/README.md) .
<a name="24"></a>
### 2.4 Parameter Description
| field | description | default |
|----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------|
| output | The save path of result | ./output/table |
| table_max_len | When the table structure model predicts, the long side of the image | 488 |
| table_model_dir | the path of table structure model | None |
| table_char_dict_path | the dict path of table structure model | ../ppocr/utils/dict/table_structure_dict.txt |
| layout_path_model | The model path of the layout analysis model, which can be an online address or a local path. When it is a local path, layout_label_map needs to be set. In command line mode, use --layout_label_map='{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}' | lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config |
| layout_label_map | Layout analysis model model label mapping dictionary path | None |
| model_name_or_path | the model path of VQA SER model | None |
| max_seq_length | the max token length of VQA SER model | 512 |
| label_map_path | the label path of VQA SER model | ./vqa/labels/labels_ser.txt |
| layout | Whether to perform layout analysis in forward | True |
| table | Whether to perform table recognition in forward | True |
| ocr | Whether to perform ocr for non-table areas in layout analysis. When layout is False, it will be automatically set to False | True |
Most of the parameters are consistent with the PaddleOCR whl package, see [whl package documentation](../../doc/doc_en/whl.md)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment