Commit 71d37bab authored by Leif's avatar Leif
Browse files

Merge remote-tracking branch 'Evezerest/dygraph' into dygraph

parents 8e32ef41 fbb68c38
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Code is refer from:
https://github.com/RuijieJ/pren/blob/main/Nets/EfficientNet.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
from collections import namedtuple
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
__all__ = ['EfficientNetb3']
class EffB3Params:
@staticmethod
def get_global_params():
"""
The fllowing are efficientnetb3's arch superparams, but to fit for scene
text recognition task, the resolution(image_size) here is changed
from 300 to 64.
"""
GlobalParams = namedtuple('GlobalParams', [
'drop_connect_rate', 'width_coefficient', 'depth_coefficient',
'depth_divisor', 'image_size'
])
global_params = GlobalParams(
drop_connect_rate=0.3,
width_coefficient=1.2,
depth_coefficient=1.4,
depth_divisor=8,
image_size=64)
return global_params
@staticmethod
def get_block_params():
BlockParams = namedtuple('BlockParams', [
'kernel_size', 'num_repeat', 'input_filters', 'output_filters',
'expand_ratio', 'id_skip', 'se_ratio', 'stride'
])
block_params = [
BlockParams(3, 1, 32, 16, 1, True, 0.25, 1),
BlockParams(3, 2, 16, 24, 6, True, 0.25, 2),
BlockParams(5, 2, 24, 40, 6, True, 0.25, 2),
BlockParams(3, 3, 40, 80, 6, True, 0.25, 2),
BlockParams(5, 3, 80, 112, 6, True, 0.25, 1),
BlockParams(5, 4, 112, 192, 6, True, 0.25, 2),
BlockParams(3, 1, 192, 320, 6, True, 0.25, 1)
]
return block_params
class EffUtils:
@staticmethod
def round_filters(filters, global_params):
"""Calculate and round number of filters based on depth multiplier."""
multiplier = global_params.width_coefficient
if not multiplier:
return filters
divisor = global_params.depth_divisor
filters *= multiplier
new_filters = int(filters + divisor / 2) // divisor * divisor
if new_filters < 0.9 * filters:
new_filters += divisor
return int(new_filters)
@staticmethod
def round_repeats(repeats, global_params):
"""Round number of filters based on depth multiplier."""
multiplier = global_params.depth_coefficient
if not multiplier:
return repeats
return int(math.ceil(multiplier * repeats))
class ConvBlock(nn.Layer):
def __init__(self, block_params):
super(ConvBlock, self).__init__()
self.block_args = block_params
self.has_se = (self.block_args.se_ratio is not None) and \
(0 < self.block_args.se_ratio <= 1)
self.id_skip = block_params.id_skip
# expansion phase
self.input_filters = self.block_args.input_filters
output_filters = \
self.block_args.input_filters * self.block_args.expand_ratio
if self.block_args.expand_ratio != 1:
self.expand_conv = nn.Conv2D(
self.input_filters, output_filters, 1, bias_attr=False)
self.bn0 = nn.BatchNorm(output_filters)
# depthwise conv phase
k = self.block_args.kernel_size
s = self.block_args.stride
self.depthwise_conv = nn.Conv2D(
output_filters,
output_filters,
groups=output_filters,
kernel_size=k,
stride=s,
padding='same',
bias_attr=False)
self.bn1 = nn.BatchNorm(output_filters)
# squeeze and excitation layer, if desired
if self.has_se:
num_squeezed_channels = max(1,
int(self.block_args.input_filters *
self.block_args.se_ratio))
self.se_reduce = nn.Conv2D(output_filters, num_squeezed_channels, 1)
self.se_expand = nn.Conv2D(num_squeezed_channels, output_filters, 1)
# output phase
self.final_oup = self.block_args.output_filters
self.project_conv = nn.Conv2D(
output_filters, self.final_oup, 1, bias_attr=False)
self.bn2 = nn.BatchNorm(self.final_oup)
self.swish = nn.Swish()
def drop_connect(self, inputs, p, training):
if not training:
return inputs
batch_size = inputs.shape[0]
keep_prob = 1 - p
random_tensor = keep_prob
random_tensor += paddle.rand([batch_size, 1, 1, 1], dtype=inputs.dtype)
random_tensor = paddle.to_tensor(random_tensor, place=inputs.place)
binary_tensor = paddle.floor(random_tensor)
output = inputs / keep_prob * binary_tensor
return output
def forward(self, inputs, drop_connect_rate=None):
# expansion and depthwise conv
x = inputs
if self.block_args.expand_ratio != 1:
x = self.swish(self.bn0(self.expand_conv(inputs)))
x = self.swish(self.bn1(self.depthwise_conv(x)))
# squeeze and excitation
if self.has_se:
x_squeezed = F.adaptive_avg_pool2d(x, 1)
x_squeezed = self.se_expand(self.swish(self.se_reduce(x_squeezed)))
x = F.sigmoid(x_squeezed) * x
x = self.bn2(self.project_conv(x))
# skip conntection and drop connect
if self.id_skip and self.block_args.stride == 1 and \
self.input_filters == self.final_oup:
if drop_connect_rate:
x = self.drop_connect(
x, p=drop_connect_rate, training=self.training)
x = x + inputs
return x
class EfficientNetb3_PREN(nn.Layer):
def __init__(self, in_channels):
super(EfficientNetb3_PREN, self).__init__()
self.blocks_params = EffB3Params.get_block_params()
self.global_params = EffB3Params.get_global_params()
self.out_channels = []
# stem
stem_channels = EffUtils.round_filters(32, self.global_params)
self.conv_stem = nn.Conv2D(
in_channels, stem_channels, 3, 2, padding='same', bias_attr=False)
self.bn0 = nn.BatchNorm(stem_channels)
self.blocks = []
# to extract three feature maps for fpn based on efficientnetb3 backbone
self.concerned_block_idxes = [7, 17, 25]
concerned_idx = 0
for i, block_params in enumerate(self.blocks_params):
block_params = block_params._replace(
input_filters=EffUtils.round_filters(block_params.input_filters,
self.global_params),
output_filters=EffUtils.round_filters(
block_params.output_filters, self.global_params),
num_repeat=EffUtils.round_repeats(block_params.num_repeat,
self.global_params))
self.blocks.append(
self.add_sublayer("{}-0".format(i), ConvBlock(block_params)))
concerned_idx += 1
if concerned_idx in self.concerned_block_idxes:
self.out_channels.append(block_params.output_filters)
if block_params.num_repeat > 1:
block_params = block_params._replace(
input_filters=block_params.output_filters, stride=1)
for j in range(block_params.num_repeat - 1):
self.blocks.append(
self.add_sublayer('{}-{}'.format(i, j + 1),
ConvBlock(block_params)))
concerned_idx += 1
if concerned_idx in self.concerned_block_idxes:
self.out_channels.append(block_params.output_filters)
self.swish = nn.Swish()
def forward(self, inputs):
outs = []
x = self.swish(self.bn0(self.conv_stem(inputs)))
for idx, block in enumerate(self.blocks):
drop_connect_rate = self.global_params.drop_connect_rate
if drop_connect_rate:
drop_connect_rate *= float(idx) / len(self.blocks)
x = block(x, drop_connect_rate=drop_connect_rate)
if idx in self.concerned_block_idxes:
outs.append(x)
return outs
......@@ -21,6 +21,7 @@ def build_head(config):
from .det_east_head import EASTHead
from .det_sast_head import SASTHead
from .det_pse_head import PSEHead
from .det_fce_head import FCEHead
from .e2e_pg_head import PGHead
# rec head
......@@ -30,6 +31,7 @@ def build_head(config):
from .rec_nrtr_head import Transformer
from .rec_sar_head import SARHead
from .rec_aster_head import AsterHead
from .rec_pren_head import PRENHead
# cls head
from .cls_head import ClsHead
......@@ -40,9 +42,9 @@ def build_head(config):
from .table_att_head import TableAttentionHead
support_dict = [
'DBHead', 'PSEHead', 'EASTHead', 'SASTHead', 'CTCHead', 'ClsHead',
'AttentionHead', 'SRNHead', 'PGHead', 'Transformer',
'TableAttentionHead', 'SARHead', 'AsterHead', 'SDMGRHead'
'DBHead', 'PSEHead', 'FCEHead', 'EASTHead', 'SASTHead', 'CTCHead',
'ClsHead', 'AttentionHead', 'SRNHead', 'PGHead', 'Transformer',
'TableAttentionHead', 'SARHead', 'AsterHead', 'SDMGRHead', 'PRENHead'
]
#table head
......
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is refer from:
https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/dense_heads/fce_head.py
"""
from paddle import nn
from paddle import ParamAttr
import paddle.nn.functional as F
from paddle.nn.initializer import Normal
import paddle
from functools import partial
def multi_apply(func, *args, **kwargs):
pfunc = partial(func, **kwargs) if kwargs else func
map_results = map(pfunc, *args)
return tuple(map(list, zip(*map_results)))
class FCEHead(nn.Layer):
"""The class for implementing FCENet head.
FCENet(CVPR2021): Fourier Contour Embedding for Arbitrary-shaped Text
Detection.
[https://arxiv.org/abs/2104.10442]
Args:
in_channels (int): The number of input channels.
scales (list[int]) : The scale of each layer.
fourier_degree (int) : The maximum Fourier transform degree k.
"""
def __init__(self, in_channels, fourier_degree=5):
super().__init__()
assert isinstance(in_channels, int)
self.downsample_ratio = 1.0
self.in_channels = in_channels
self.fourier_degree = fourier_degree
self.out_channels_cls = 4
self.out_channels_reg = (2 * self.fourier_degree + 1) * 2
self.out_conv_cls = nn.Conv2D(
in_channels=self.in_channels,
out_channels=self.out_channels_cls,
kernel_size=3,
stride=1,
padding=1,
groups=1,
weight_attr=ParamAttr(
name='cls_weights',
initializer=Normal(
mean=paddle.to_tensor(0.), std=paddle.to_tensor(0.01))),
bias_attr=True)
self.out_conv_reg = nn.Conv2D(
in_channels=self.in_channels,
out_channels=self.out_channels_reg,
kernel_size=3,
stride=1,
padding=1,
groups=1,
weight_attr=ParamAttr(
name='reg_weights',
initializer=Normal(
mean=paddle.to_tensor(0.), std=paddle.to_tensor(0.01))),
bias_attr=True)
def forward(self, feats, targets=None):
cls_res, reg_res = multi_apply(self.forward_single, feats)
level_num = len(cls_res)
outs = {}
if not self.training:
for i in range(level_num):
tr_pred = F.softmax(cls_res[i][:, 0:2, :, :], axis=1)
tcl_pred = F.softmax(cls_res[i][:, 2:, :, :], axis=1)
outs['level_{}'.format(i)] = paddle.concat(
[tr_pred, tcl_pred, reg_res[i]], axis=1)
else:
preds = [[cls_res[i], reg_res[i]] for i in range(level_num)]
outs['levels'] = preds
return outs
def forward_single(self, x):
cls_predict = self.out_conv_cls(x)
reg_predict = self.out_conv_reg(x)
return cls_predict, reg_predict
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from paddle import nn
from paddle.nn import functional as F
class PRENHead(nn.Layer):
def __init__(self, in_channels, out_channels, **kwargs):
super(PRENHead, self).__init__()
self.linear = nn.Linear(in_channels, out_channels)
def forward(self, x, targets=None):
predicts = self.linear(x)
if not self.training:
predicts = F.softmax(predicts, axis=2)
return predicts
......@@ -23,7 +23,12 @@ def build_neck(config):
from .pg_fpn import PGFPN
from .table_fpn import TableFPN
from .fpn import FPN
support_dict = ['FPN','DBFPN', 'EASTFPN', 'SASTFPN', 'SequenceEncoder', 'PGFPN', 'TableFPN']
from .fce_fpn import FCEFPN
from .pren_fpn import PRENFPN
support_dict = [
'FPN', 'FCEFPN', 'DBFPN', 'EASTFPN', 'SASTFPN', 'SequenceEncoder',
'PGFPN', 'TableFPN', 'PRENFPN'
]
module_name = config.pop('name')
assert module_name in support_dict, Exception('neck only support {}'.format(
......
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is refer from:
https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.3/ppdet/modeling/necks/fpn.py
"""
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn.initializer import XavierUniform
from paddle.nn.initializer import Normal
from paddle.regularizer import L2Decay
__all__ = ['FCEFPN']
class ConvNormLayer(nn.Layer):
def __init__(self,
ch_in,
ch_out,
filter_size,
stride,
groups=1,
norm_type='bn',
norm_decay=0.,
norm_groups=32,
lr_scale=1.,
freeze_norm=False,
initializer=Normal(
mean=0., std=0.01)):
super(ConvNormLayer, self).__init__()
assert norm_type in ['bn', 'sync_bn', 'gn']
bias_attr = False
self.conv = nn.Conv2D(
in_channels=ch_in,
out_channels=ch_out,
kernel_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
weight_attr=ParamAttr(
initializer=initializer, learning_rate=1.),
bias_attr=bias_attr)
norm_lr = 0. if freeze_norm else 1.
param_attr = ParamAttr(
learning_rate=norm_lr,
regularizer=L2Decay(norm_decay) if norm_decay is not None else None)
bias_attr = ParamAttr(
learning_rate=norm_lr,
regularizer=L2Decay(norm_decay) if norm_decay is not None else None)
if norm_type == 'bn':
self.norm = nn.BatchNorm2D(
ch_out, weight_attr=param_attr, bias_attr=bias_attr)
elif norm_type == 'sync_bn':
self.norm = nn.SyncBatchNorm(
ch_out, weight_attr=param_attr, bias_attr=bias_attr)
elif norm_type == 'gn':
self.norm = nn.GroupNorm(
num_groups=norm_groups,
num_channels=ch_out,
weight_attr=param_attr,
bias_attr=bias_attr)
def forward(self, inputs):
out = self.conv(inputs)
out = self.norm(out)
return out
class FCEFPN(nn.Layer):
"""
Feature Pyramid Network, see https://arxiv.org/abs/1612.03144
Args:
in_channels (list[int]): input channels of each level which can be
derived from the output shape of backbone by from_config
out_channels (list[int]): output channel of each level
spatial_scales (list[float]): the spatial scales between input feature
maps and original input image which can be derived from the output
shape of backbone by from_config
has_extra_convs (bool): whether to add extra conv to the last level.
default False
extra_stage (int): the number of extra stages added to the last level.
default 1
use_c5 (bool): Whether to use c5 as the input of extra stage,
otherwise p5 is used. default True
norm_type (string|None): The normalization type in FPN module. If
norm_type is None, norm will not be used after conv and if
norm_type is string, bn, gn, sync_bn are available. default None
norm_decay (float): weight decay for normalization layer weights.
default 0.
freeze_norm (bool): whether to freeze normalization layer.
default False
relu_before_extra_convs (bool): whether to add relu before extra convs.
default False
"""
def __init__(self,
in_channels,
out_channels,
spatial_scales=[0.25, 0.125, 0.0625, 0.03125],
has_extra_convs=False,
extra_stage=1,
use_c5=True,
norm_type=None,
norm_decay=0.,
freeze_norm=False,
relu_before_extra_convs=True):
super(FCEFPN, self).__init__()
self.out_channels = out_channels
for s in range(extra_stage):
spatial_scales = spatial_scales + [spatial_scales[-1] / 2.]
self.spatial_scales = spatial_scales
self.has_extra_convs = has_extra_convs
self.extra_stage = extra_stage
self.use_c5 = use_c5
self.relu_before_extra_convs = relu_before_extra_convs
self.norm_type = norm_type
self.norm_decay = norm_decay
self.freeze_norm = freeze_norm
self.lateral_convs = []
self.fpn_convs = []
fan = out_channels * 3 * 3
# stage index 0,1,2,3 stands for res2,res3,res4,res5 on ResNet Backbone
# 0 <= st_stage < ed_stage <= 3
st_stage = 4 - len(in_channels)
ed_stage = st_stage + len(in_channels) - 1
for i in range(st_stage, ed_stage + 1):
if i == 3:
lateral_name = 'fpn_inner_res5_sum'
else:
lateral_name = 'fpn_inner_res{}_sum_lateral'.format(i + 2)
in_c = in_channels[i - st_stage]
if self.norm_type is not None:
lateral = self.add_sublayer(
lateral_name,
ConvNormLayer(
ch_in=in_c,
ch_out=out_channels,
filter_size=1,
stride=1,
norm_type=self.norm_type,
norm_decay=self.norm_decay,
freeze_norm=self.freeze_norm,
initializer=XavierUniform(fan_out=in_c)))
else:
lateral = self.add_sublayer(
lateral_name,
nn.Conv2D(
in_channels=in_c,
out_channels=out_channels,
kernel_size=1,
weight_attr=ParamAttr(
initializer=XavierUniform(fan_out=in_c))))
self.lateral_convs.append(lateral)
for i in range(st_stage, ed_stage + 1):
fpn_name = 'fpn_res{}_sum'.format(i + 2)
if self.norm_type is not None:
fpn_conv = self.add_sublayer(
fpn_name,
ConvNormLayer(
ch_in=out_channels,
ch_out=out_channels,
filter_size=3,
stride=1,
norm_type=self.norm_type,
norm_decay=self.norm_decay,
freeze_norm=self.freeze_norm,
initializer=XavierUniform(fan_out=fan)))
else:
fpn_conv = self.add_sublayer(
fpn_name,
nn.Conv2D(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=3,
padding=1,
weight_attr=ParamAttr(
initializer=XavierUniform(fan_out=fan))))
self.fpn_convs.append(fpn_conv)
# add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5)
if self.has_extra_convs:
for i in range(self.extra_stage):
lvl = ed_stage + 1 + i
if i == 0 and self.use_c5:
in_c = in_channels[-1]
else:
in_c = out_channels
extra_fpn_name = 'fpn_{}'.format(lvl + 2)
if self.norm_type is not None:
extra_fpn_conv = self.add_sublayer(
extra_fpn_name,
ConvNormLayer(
ch_in=in_c,
ch_out=out_channels,
filter_size=3,
stride=2,
norm_type=self.norm_type,
norm_decay=self.norm_decay,
freeze_norm=self.freeze_norm,
initializer=XavierUniform(fan_out=fan)))
else:
extra_fpn_conv = self.add_sublayer(
extra_fpn_name,
nn.Conv2D(
in_channels=in_c,
out_channels=out_channels,
kernel_size=3,
stride=2,
padding=1,
weight_attr=ParamAttr(
initializer=XavierUniform(fan_out=fan))))
self.fpn_convs.append(extra_fpn_conv)
@classmethod
def from_config(cls, cfg, input_shape):
return {
'in_channels': [i.channels for i in input_shape],
'spatial_scales': [1.0 / i.stride for i in input_shape],
}
def forward(self, body_feats):
laterals = []
num_levels = len(body_feats)
for i in range(num_levels):
laterals.append(self.lateral_convs[i](body_feats[i]))
for i in range(1, num_levels):
lvl = num_levels - i
upsample = F.interpolate(
laterals[lvl],
scale_factor=2.,
mode='nearest', )
laterals[lvl - 1] += upsample
fpn_output = []
for lvl in range(num_levels):
fpn_output.append(self.fpn_convs[lvl](laterals[lvl]))
if self.extra_stage > 0:
# use max pool to get more levels on top of outputs (Faster R-CNN, Mask R-CNN)
if not self.has_extra_convs:
assert self.extra_stage == 1, 'extra_stage should be 1 if FPN has not extra convs'
fpn_output.append(F.max_pool2d(fpn_output[-1], 1, stride=2))
# add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5)
else:
if self.use_c5:
extra_source = body_feats[-1]
else:
extra_source = fpn_output[-1]
fpn_output.append(self.fpn_convs[num_levels](extra_source))
for i in range(1, self.extra_stage):
if self.relu_before_extra_convs:
fpn_output.append(self.fpn_convs[num_levels + i](F.relu(
fpn_output[-1])))
else:
fpn_output.append(self.fpn_convs[num_levels + i](
fpn_output[-1]))
return fpn_output
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Code is refer from:
https://github.com/RuijieJ/pren/blob/main/Nets/Aggregation.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from paddle import nn
import paddle.nn.functional as F
class PoolAggregate(nn.Layer):
def __init__(self, n_r, d_in, d_middle=None, d_out=None):
super(PoolAggregate, self).__init__()
if not d_middle:
d_middle = d_in
if not d_out:
d_out = d_in
self.d_in = d_in
self.d_middle = d_middle
self.d_out = d_out
self.act = nn.Swish()
self.n_r = n_r
self.aggs = self._build_aggs()
def _build_aggs(self):
aggs = []
for i in range(self.n_r):
aggs.append(
self.add_sublayer(
'{}'.format(i),
nn.Sequential(
('conv1', nn.Conv2D(
self.d_in, self.d_middle, 3, 2, 1, bias_attr=False)
), ('bn1', nn.BatchNorm(self.d_middle)),
('act', self.act), ('conv2', nn.Conv2D(
self.d_middle, self.d_out, 3, 2, 1, bias_attr=False
)), ('bn2', nn.BatchNorm(self.d_out)))))
return aggs
def forward(self, x):
b = x.shape[0]
outs = []
for agg in self.aggs:
y = agg(x)
p = F.adaptive_avg_pool2d(y, 1)
outs.append(p.reshape((b, 1, self.d_out)))
out = paddle.concat(outs, 1)
return out
class WeightAggregate(nn.Layer):
def __init__(self, n_r, d_in, d_middle=None, d_out=None):
super(WeightAggregate, self).__init__()
if not d_middle:
d_middle = d_in
if not d_out:
d_out = d_in
self.n_r = n_r
self.d_out = d_out
self.act = nn.Swish()
self.conv_n = nn.Sequential(
('conv1', nn.Conv2D(
d_in, d_in, 3, 1, 1,
bias_attr=False)), ('bn1', nn.BatchNorm(d_in)),
('act1', self.act), ('conv2', nn.Conv2D(
d_in, n_r, 1, bias_attr=False)), ('bn2', nn.BatchNorm(n_r)),
('act2', nn.Sigmoid()))
self.conv_d = nn.Sequential(
('conv1', nn.Conv2D(
d_in, d_middle, 3, 1, 1,
bias_attr=False)), ('bn1', nn.BatchNorm(d_middle)),
('act1', self.act), ('conv2', nn.Conv2D(
d_middle, d_out, 1,
bias_attr=False)), ('bn2', nn.BatchNorm(d_out)))
def forward(self, x):
b, _, h, w = x.shape
hmaps = self.conv_n(x)
fmaps = self.conv_d(x)
r = paddle.bmm(
hmaps.reshape((b, self.n_r, h * w)),
fmaps.reshape((b, self.d_out, h * w)).transpose((0, 2, 1)))
return r
class GCN(nn.Layer):
def __init__(self, d_in, n_in, d_out=None, n_out=None, dropout=0.1):
super(GCN, self).__init__()
if not d_out:
d_out = d_in
if not n_out:
n_out = d_in
self.conv_n = nn.Conv1D(n_in, n_out, 1)
self.linear = nn.Linear(d_in, d_out)
self.dropout = nn.Dropout(dropout)
self.act = nn.Swish()
def forward(self, x):
x = self.conv_n(x)
x = self.dropout(self.linear(x))
return self.act(x)
class PRENFPN(nn.Layer):
def __init__(self, in_channels, n_r, d_model, max_len, dropout):
super(PRENFPN, self).__init__()
assert len(in_channels) == 3, "in_channels' length must be 3."
c1, c2, c3 = in_channels # the depths are from big to small
# build fpn
assert d_model % 3 == 0, "{} can't be divided by 3.".format(d_model)
self.agg_p1 = PoolAggregate(n_r, c1, d_out=d_model // 3)
self.agg_p2 = PoolAggregate(n_r, c2, d_out=d_model // 3)
self.agg_p3 = PoolAggregate(n_r, c3, d_out=d_model // 3)
self.agg_w1 = WeightAggregate(n_r, c1, 4 * c1, d_model // 3)
self.agg_w2 = WeightAggregate(n_r, c2, 4 * c2, d_model // 3)
self.agg_w3 = WeightAggregate(n_r, c3, 4 * c3, d_model // 3)
self.gcn_pool = GCN(d_model, n_r, d_model, max_len, dropout)
self.gcn_weight = GCN(d_model, n_r, d_model, max_len, dropout)
self.out_channels = d_model
def forward(self, inputs):
f3, f5, f7 = inputs
rp1 = self.agg_p1(f3)
rp2 = self.agg_p2(f5)
rp3 = self.agg_p3(f7)
rp = paddle.concat([rp1, rp2, rp3], 2) # [b,nr,d]
rw1 = self.agg_w1(f3)
rw2 = self.agg_w2(f5)
rw3 = self.agg_w3(f7)
rw = paddle.concat([rw1, rw2, rw3], 2) # [b,nr,d]
y1 = self.gcn_pool(rp)
y2 = self.gcn_weight(rw)
y = 0.5 * (y1 + y2)
return y # [b,max_len,d]
......@@ -24,8 +24,10 @@ __all__ = ['build_post_process']
from .db_postprocess import DBPostProcess, DistillationDBPostProcess
from .east_postprocess import EASTPostProcess
from .sast_postprocess import SASTPostProcess
from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, SRNLabelDecode, DistillationCTCLabelDecode, \
TableLabelDecode, NRTRLabelDecode, SARLabelDecode, SEEDLabelDecode
from .fce_postprocess import FCEPostProcess
from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, SRNLabelDecode, \
DistillationCTCLabelDecode, TableLabelDecode, NRTRLabelDecode, SARLabelDecode, \
SEEDLabelDecode, PRENLabelDecode
from .cls_postprocess import ClsPostProcess
from .pg_postprocess import PGPostProcess
from .vqa_token_ser_layoutlm_postprocess import VQASerTokenLayoutLMPostProcess
......@@ -34,12 +36,12 @@ from .vqa_token_re_layoutlm_postprocess import VQAReTokenLayoutLMPostProcess
def build_post_process(config, global_config=None):
support_dict = [
'DBPostProcess', 'EASTPostProcess', 'SASTPostProcess', 'CTCLabelDecode',
'AttnLabelDecode', 'ClsPostProcess', 'SRNLabelDecode', 'PGPostProcess',
'DistillationCTCLabelDecode', 'TableLabelDecode',
'DBPostProcess', 'EASTPostProcess', 'SASTPostProcess', 'FCEPostProcess',
'CTCLabelDecode', 'AttnLabelDecode', 'ClsPostProcess', 'SRNLabelDecode',
'PGPostProcess', 'DistillationCTCLabelDecode', 'TableLabelDecode',
'DistillationDBPostProcess', 'NRTRLabelDecode', 'SARLabelDecode',
'SEEDLabelDecode', 'VQASerTokenLayoutLMPostProcess',
'VQAReTokenLayoutLMPostProcess'
'VQAReTokenLayoutLMPostProcess', 'PRENLabelDecode'
]
if config['name'] == 'PSEPostProcess':
......
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is refer from:
https://github.com/open-mmlab/mmocr/blob/v0.3.0/mmocr/models/textdet/postprocess/wrapper.py
"""
import cv2
import paddle
import numpy as np
from numpy.fft import ifft
from ppocr.utils.poly_nms import poly_nms, valid_boundary
def fill_hole(input_mask):
h, w = input_mask.shape
canvas = np.zeros((h + 2, w + 2), np.uint8)
canvas[1:h + 1, 1:w + 1] = input_mask.copy()
mask = np.zeros((h + 4, w + 4), np.uint8)
cv2.floodFill(canvas, mask, (0, 0), 1)
canvas = canvas[1:h + 1, 1:w + 1].astype(np.bool)
return ~canvas | input_mask
def fourier2poly(fourier_coeff, num_reconstr_points=50):
""" Inverse Fourier transform
Args:
fourier_coeff (ndarray): Fourier coefficients shaped (n, 2k+1),
with n and k being candidates number and Fourier degree
respectively.
num_reconstr_points (int): Number of reconstructed polygon points.
Returns:
Polygons (ndarray): The reconstructed polygons shaped (n, n')
"""
a = np.zeros((len(fourier_coeff), num_reconstr_points), dtype='complex')
k = (len(fourier_coeff[0]) - 1) // 2
a[:, 0:k + 1] = fourier_coeff[:, k:]
a[:, -k:] = fourier_coeff[:, :k]
poly_complex = ifft(a) * num_reconstr_points
polygon = np.zeros((len(fourier_coeff), num_reconstr_points, 2))
polygon[:, :, 0] = poly_complex.real
polygon[:, :, 1] = poly_complex.imag
return polygon.astype('int32').reshape((len(fourier_coeff), -1))
class FCEPostProcess(object):
"""
The post process for FCENet.
"""
def __init__(self,
scales,
fourier_degree=5,
num_reconstr_points=50,
decoding_type='fcenet',
score_thr=0.3,
nms_thr=0.1,
alpha=1.0,
beta=1.0,
box_type='poly',
**kwargs):
self.scales = scales
self.fourier_degree = fourier_degree
self.num_reconstr_points = num_reconstr_points
self.decoding_type = decoding_type
self.score_thr = score_thr
self.nms_thr = nms_thr
self.alpha = alpha
self.beta = beta
self.box_type = box_type
def __call__(self, preds, shape_list):
score_maps = []
for key, value in preds.items():
if isinstance(value, paddle.Tensor):
value = value.numpy()
cls_res = value[:, :4, :, :]
reg_res = value[:, 4:, :, :]
score_maps.append([cls_res, reg_res])
return self.get_boundary(score_maps, shape_list)
def resize_boundary(self, boundaries, scale_factor):
"""Rescale boundaries via scale_factor.
Args:
boundaries (list[list[float]]): The boundary list. Each boundary
with size 2k+1 with k>=4.
scale_factor(ndarray): The scale factor of size (4,).
Returns:
boundaries (list[list[float]]): The scaled boundaries.
"""
boxes = []
scores = []
for b in boundaries:
sz = len(b)
valid_boundary(b, True)
scores.append(b[-1])
b = (np.array(b[:sz - 1]) *
(np.tile(scale_factor[:2], int(
(sz - 1) / 2)).reshape(1, sz - 1))).flatten().tolist()
boxes.append(np.array(b).reshape([-1, 2]))
return np.array(boxes, dtype=np.float32), scores
def get_boundary(self, score_maps, shape_list):
assert len(score_maps) == len(self.scales)
boundaries = []
for idx, score_map in enumerate(score_maps):
scale = self.scales[idx]
boundaries = boundaries + self._get_boundary_single(score_map,
scale)
# nms
boundaries = poly_nms(boundaries, self.nms_thr)
boundaries, scores = self.resize_boundary(
boundaries, (1 / shape_list[0, 2:]).tolist()[::-1])
boxes_batch = [dict(points=boundaries, scores=scores)]
return boxes_batch
def _get_boundary_single(self, score_map, scale):
assert len(score_map) == 2
assert score_map[1].shape[1] == 4 * self.fourier_degree + 2
return self.fcenet_decode(
preds=score_map,
fourier_degree=self.fourier_degree,
num_reconstr_points=self.num_reconstr_points,
scale=scale,
alpha=self.alpha,
beta=self.beta,
box_type=self.box_type,
score_thr=self.score_thr,
nms_thr=self.nms_thr)
def fcenet_decode(self,
preds,
fourier_degree,
num_reconstr_points,
scale,
alpha=1.0,
beta=2.0,
box_type='poly',
score_thr=0.3,
nms_thr=0.1):
"""Decoding predictions of FCENet to instances.
Args:
preds (list(Tensor)): The head output tensors.
fourier_degree (int): The maximum Fourier transform degree k.
num_reconstr_points (int): The points number of the polygon
reconstructed from predicted Fourier coefficients.
scale (int): The down-sample scale of the prediction.
alpha (float) : The parameter to calculate final scores. Score_{final}
= (Score_{text region} ^ alpha)
* (Score_{text center region}^ beta)
beta (float) : The parameter to calculate final score.
box_type (str): Boundary encoding type 'poly' or 'quad'.
score_thr (float) : The threshold used to filter out the final
candidates.
nms_thr (float) : The threshold of nms.
Returns:
boundaries (list[list[float]]): The instance boundary and confidence
list.
"""
assert isinstance(preds, list)
assert len(preds) == 2
assert box_type in ['poly', 'quad']
cls_pred = preds[0][0]
tr_pred = cls_pred[0:2]
tcl_pred = cls_pred[2:]
reg_pred = preds[1][0].transpose([1, 2, 0])
x_pred = reg_pred[:, :, :2 * fourier_degree + 1]
y_pred = reg_pred[:, :, 2 * fourier_degree + 1:]
score_pred = (tr_pred[1]**alpha) * (tcl_pred[1]**beta)
tr_pred_mask = (score_pred) > score_thr
tr_mask = fill_hole(tr_pred_mask)
tr_contours, _ = cv2.findContours(
tr_mask.astype(np.uint8), cv2.RETR_TREE,
cv2.CHAIN_APPROX_SIMPLE) # opencv4
mask = np.zeros_like(tr_mask)
boundaries = []
for cont in tr_contours:
deal_map = mask.copy().astype(np.int8)
cv2.drawContours(deal_map, [cont], -1, 1, -1)
score_map = score_pred * deal_map
score_mask = score_map > 0
xy_text = np.argwhere(score_mask)
dxy = xy_text[:, 1] + xy_text[:, 0] * 1j
x, y = x_pred[score_mask], y_pred[score_mask]
c = x + y * 1j
c[:, fourier_degree] = c[:, fourier_degree] + dxy
c *= scale
polygons = fourier2poly(c, num_reconstr_points)
score = score_map[score_mask].reshape(-1, 1)
polygons = poly_nms(np.hstack((polygons, score)).tolist(), nms_thr)
boundaries = boundaries + polygons
boundaries = poly_nms(boundaries, nms_thr)
if box_type == 'quad':
new_boundaries = []
for boundary in boundaries:
poly = np.array(boundary[:-1]).reshape(-1, 2).astype(np.float32)
score = boundary[-1]
points = cv2.boxPoints(cv2.minAreaRect(poly))
points = np.int0(points)
new_boundaries.append(points.reshape(-1).tolist() + [score])
boundaries = new_boundaries
return boundaries
......@@ -37,10 +37,10 @@ class PSEPostProcess(object):
thresh=0.5,
box_thresh=0.85,
min_area=16,
box_type='box',
box_type='quad',
scale=4,
**kwargs):
assert box_type in ['box', 'poly'], 'Only box and poly is supported'
assert box_type in ['quad', 'poly'], 'Only quad and poly is supported'
self.thresh = thresh
self.box_thresh = box_thresh
self.min_area = min_area
......@@ -95,7 +95,7 @@ class PSEPostProcess(object):
label[ind] = 0
continue
if self.box_type == 'box':
if self.box_type == 'quad':
rect = cv2.minAreaRect(points)
bbox = cv2.boxPoints(rect)
elif self.box_type == 'poly':
......
......@@ -11,8 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import string
import paddle
from paddle.nn import functional as F
import re
......@@ -652,3 +652,63 @@ class SARLabelDecode(BaseRecLabelDecode):
def get_ignored_tokens(self):
return [self.padding_idx]
class PRENLabelDecode(BaseRecLabelDecode):
""" Convert between text-label and text-index """
def __init__(self, character_dict_path=None, use_space_char=False,
**kwargs):
super(PRENLabelDecode, self).__init__(character_dict_path,
use_space_char)
def add_special_char(self, dict_character):
padding_str = '<PAD>' # 0
end_str = '<EOS>' # 1
unknown_str = '<UNK>' # 2
dict_character = [padding_str, end_str, unknown_str] + dict_character
self.padding_idx = 0
self.end_idx = 1
self.unknown_idx = 2
return dict_character
def decode(self, text_index, text_prob=None):
""" convert text-index into text-label. """
result_list = []
batch_size = len(text_index)
for batch_idx in range(batch_size):
char_list = []
conf_list = []
for idx in range(len(text_index[batch_idx])):
if text_index[batch_idx][idx] == self.end_idx:
break
if text_index[batch_idx][idx] in \
[self.padding_idx, self.unknown_idx]:
continue
char_list.append(self.character[int(text_index[batch_idx][
idx])])
if text_prob is not None:
conf_list.append(text_prob[batch_idx][idx])
else:
conf_list.append(1)
text = ''.join(char_list)
if len(text) > 0:
result_list.append((text, np.mean(conf_list)))
else:
# here confidence of empty recog result is 1
result_list.append(('', 1))
return result_list
def __call__(self, preds, label=None, *args, **kwargs):
preds = preds.numpy()
preds_idx = preds.argmax(axis=2)
preds_prob = preds.max(axis=2)
text = self.decode(preds_idx, preds_prob)
if label is None:
return text
label = self.decode(label)
return text, label
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from shapely.geometry import Polygon
def points2polygon(points):
"""Convert k points to 1 polygon.
Args:
points (ndarray or list): A ndarray or a list of shape (2k)
that indicates k points.
Returns:
polygon (Polygon): A polygon object.
"""
if isinstance(points, list):
points = np.array(points)
assert isinstance(points, np.ndarray)
assert (points.size % 2 == 0) and (points.size >= 8)
point_mat = points.reshape([-1, 2])
return Polygon(point_mat)
def poly_intersection(poly_det, poly_gt, buffer=0.0001):
"""Calculate the intersection area between two polygon.
Args:
poly_det (Polygon): A polygon predicted by detector.
poly_gt (Polygon): A gt polygon.
Returns:
intersection_area (float): The intersection area between two polygons.
"""
assert isinstance(poly_det, Polygon)
assert isinstance(poly_gt, Polygon)
if buffer == 0:
poly_inter = poly_det & poly_gt
else:
poly_inter = poly_det.buffer(buffer) & poly_gt.buffer(buffer)
return poly_inter.area, poly_inter
def poly_union(poly_det, poly_gt):
"""Calculate the union area between two polygon.
Args:
poly_det (Polygon): A polygon predicted by detector.
poly_gt (Polygon): A gt polygon.
Returns:
union_area (float): The union area between two polygons.
"""
assert isinstance(poly_det, Polygon)
assert isinstance(poly_gt, Polygon)
area_det = poly_det.area
area_gt = poly_gt.area
area_inters, _ = poly_intersection(poly_det, poly_gt)
return area_det + area_gt - area_inters
def valid_boundary(x, with_score=True):
num = len(x)
if num < 8:
return False
if num % 2 == 0 and (not with_score):
return True
if num % 2 == 1 and with_score:
return True
return False
def boundary_iou(src, target):
"""Calculate the IOU between two boundaries.
Args:
src (list): Source boundary.
target (list): Target boundary.
Returns:
iou (float): The iou between two boundaries.
"""
assert valid_boundary(src, False)
assert valid_boundary(target, False)
src_poly = points2polygon(src)
target_poly = points2polygon(target)
return poly_iou(src_poly, target_poly)
def poly_iou(poly_det, poly_gt):
"""Calculate the IOU between two polygons.
Args:
poly_det (Polygon): A polygon predicted by detector.
poly_gt (Polygon): A gt polygon.
Returns:
iou (float): The IOU between two polygons.
"""
assert isinstance(poly_det, Polygon)
assert isinstance(poly_gt, Polygon)
area_inters, _ = poly_intersection(poly_det, poly_gt)
area_union = poly_union(poly_det, poly_gt)
if area_union == 0:
return 0.0
return area_inters / area_union
def poly_nms(polygons, threshold):
assert isinstance(polygons, list)
polygons = np.array(sorted(polygons, key=lambda x: x[-1]))
keep_poly = []
index = [i for i in range(polygons.shape[0])]
while len(index) > 0:
keep_poly.append(polygons[index[-1]].tolist())
A = polygons[index[-1]][:-1]
index = np.delete(index, -1)
iou_list = np.zeros((len(index), ))
for i in range(len(index)):
B = polygons[index[i]][:-1]
iou_list[i] = boundary_iou(A, B)
remove_index = np.where(iou_list > threshold)
index = np.delete(index, remove_index)
return keep_poly
......@@ -98,9 +98,9 @@ PP-Structure Series Model List (Updating)
### 7.1 Layout analysis model
|model name|description|download|
| --- | --- | --- |
| ppyolov2_r50vd_dcn_365e_publaynet | The layout analysis model trained on the PubLayNet dataset can divide image into 5 types of areas **text, title, table, picture, and list** | [PubLayNet](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) |
|model name|description|download|label_map|
| --- | --- | --- |--- |
| ppyolov2_r50vd_dcn_365e_publaynet | The layout analysis model trained on the PubLayNet dataset can divide image into 5 types of areas **text, title, table, picture, and list** | [PubLayNet](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) | {0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}|
### 7.2 OCR and table recognition model
......
......@@ -96,9 +96,9 @@ PP-Structure系列模型列表(更新中)
### 7.1 版面分析模型
|模型名称|模型简介|下载地址|
| --- | --- | --- |
| ppyolov2_r50vd_dcn_365e_publaynet | PubLayNet 数据集训练的版面分析模型,可以划分**文字、标题、表格、图片以及列表**5类区域 | [PubLayNet](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) |
|模型名称|模型简介|下载地址| label_map|
| --- | --- | --- | --- |
| ppyolov2_r50vd_dcn_365e_publaynet | PubLayNet 数据集训练的版面分析模型,可以划分**文字、标题、表格、图片以及列表**5类区域 | [PubLayNet](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) | {0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}|
### 7.2 OCR和表格识别模型
......
......@@ -11,11 +11,11 @@
## 1. LayoutParser 模型
|模型名称|模型简介|下载地址|
| --- | --- | --- |
| ppyolov2_r50vd_dcn_365e_publaynet | PubLayNet 数据集训练的版面分析模型,可以划分**文字、标题、表格、图片以及列表**5类区域 | [PubLayNet](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) |
| ppyolov2_r50vd_dcn_365e_tableBank_word | TableBank Word 数据集训练的版面分析模型,只能检测表格 | [TableBank Word](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) |
| ppyolov2_r50vd_dcn_365e_tableBank_latex | TableBank Latex 数据集训练的版面分析模型,只能检测表格 | [TableBank Latex](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) |
|模型名称|模型简介|下载地址|label_map|
| --- | --- | --- | --- |
| ppyolov2_r50vd_dcn_365e_publaynet | PubLayNet 数据集训练的版面分析模型,可以划分**文字、标题、表格、图片以及列表**5类区域 | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) / [训练模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet_pretrained.pdparams) |{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}|
| ppyolov2_r50vd_dcn_365e_tableBank_word | TableBank Word 数据集训练的版面分析模型,只能检测表格 | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | {0:"Table"}|
| ppyolov2_r50vd_dcn_365e_tableBank_latex | TableBank Latex 数据集训练的版面分析模型,只能检测表格 | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | {0:"Table"}|
## 2. OCR和表格识别模型
......
......@@ -100,7 +100,9 @@ dict 里各个字段说明如下
| output | excel和识别结果保存的地址 | ./output/table |
| table_max_len | 表格结构模型预测时,图像的长边resize尺度 | 488 |
| table_model_dir | 表格结构模型 inference 模型地址 | None |
| table_char_type | 表格结构模型所用字典地址 | ../ppocr/utils/dict/table_structure_dict.txt |
| table_char_dict_path | 表格结构模型所用字典地址 | ../ppocr/utils/dict/table_structure_dict.txt |
| layout_path_model | 版面分析模型模型地址,可以为在线地址或者本地地址,当为本地地址时,需要指定 layout_label_map, 命令行模式下可通过--layout_label_map='{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}' 指定 | lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config |
| layout_label_map | 版面分析模型模型label映射字典 | None |
| model_name_or_path | VQA SER模型地址 | None |
| max_seq_length | VQA SER模型最大支持token长度 | 512 |
| label_map_path | VQA SER 标签文件地址 | ./vqa/labels/labels_ser.txt |
......
......@@ -52,7 +52,7 @@ The following figure shows the result, with different colored detection boxes re
| threshold | threshold of prediction score | 0.5 | \ |
| input_shape | picture size of reshape | [3,640,640] | \ |
| batch_size | testing batch size | 1 | \ |
| label_map | category mapping table | None | Setting config_ path, it can be none, and the label is automatically obtained according to the dataset name_ map |
| label_map | category mapping table | None | Setting config_ path, it can be none, and the label is automatically obtained according to the dataset name_ map, You need to specify it manually when setting model_path |
| enforce_cpu | whether to use CPU | False | False to use GPU, and True to force the use of CPU |
| enforce_mkldnn | whether mkldnn acceleration is enabled in CPU prediction | True | \ |
| thread_num | the number of CPU threads | 10 | \ |
......
......@@ -52,7 +52,7 @@ show_img.show()
| threshold | 预测得分的阈值 | 0.5 | \ |
| input_shape | reshape之后图片尺寸 | [3,640,640] | \ |
| batch_size | 测试batch size | 1 | \ |
| label_map | 类别映射表 | None | 设置config_path时,可以为None,根据数据集名称自动获取label_map |
| label_map | 类别映射表 | None | 设置config_path时,可以为None,根据数据集名称自动获取label_map,设置model_path时需要手动指定 |
| enforce_cpu | 代码是否使用CPU运行 | False | 设置为False表示使用GPU,True表示强制使用CPU |
| enforce_mkldnn | CPU预测中是否开启MKLDNN加速 | True | \ |
| thread_num | 设置CPU线程数 | 10 | \ |
......
......@@ -58,6 +58,7 @@ class OCRSystem(object):
self.table_layout = lp.PaddleDetectionLayoutModel(
config_path=config_path,
model_path=model_path,
label_map=args.layout_label_map,
threshold=0.5,
enable_mkldnn=args.enable_mkldnn,
enforce_cpu=not args.use_gpu,
......
......@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import ast
from PIL import Image
import numpy as np
from tools.infer.utility import draw_ocr_box_txt, init_args as infer_args
......@@ -34,7 +35,11 @@ def init_args():
"--layout_path_model",
type=str,
default="lp://PubLayNet/ppyolov2_r50vd_dcn_365e_publaynet/config")
parser.add_argument(
"--layout_label_map",
type=ast.literal_eval,
default=None,
help='label map according to ppstructure/layout/README_ch.md')
# params for ser
parser.add_argument("--model_name_or_path", type=str)
parser.add_argument("--max_seq_length", type=int, default=512)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment