wangsen / paddle_dbnet · Commits

Commit aa59fca5 — authored Apr 28, 2022 by Leif
Merge remote-tracking branch 'origin/dygraph' into dygraph
Parents: 12d15752, f01f24c7

Changes: 208 files in the commit; this page shows 20 changed files with 1163 additions and 35 deletions (+1163 −35).
Changed files on this page:

- ppocr/data/imaug/ssl_img_aug.py (+60 −0)
- ppocr/data/simple_dataset.py (+7 −3)
- ppocr/losses/__init__.py (+2 −1)
- ppocr/losses/basic_loss.py (+2 −2)
- ppocr/losses/combined_loss.py (+2 −0)
- ppocr/losses/distillation_loss.py (+55 −3)
- ppocr/losses/rec_multi_loss.py (+58 −0)
- ppocr/losses/rec_sar_loss.py (+2 −1)
- ppocr/metrics/rec_metric.py (+9 −3)
- ppocr/modeling/architectures/base_model.py (+5 −1)
- ppocr/modeling/architectures/distillation_model.py (+2 −2)
- ppocr/modeling/backbones/__init__.py (+3 −1)
- ppocr/modeling/backbones/rec_mv1_enhance.py (+11 −4)
- ppocr/modeling/backbones/rec_svtrnet.py (+597 −0)
- ppocr/modeling/heads/__init__.py (+3 −1)
- ppocr/modeling/heads/det_db_head.py (+8 −7)
- ppocr/modeling/heads/rec_multi_head.py (+73 −0)
- ppocr/modeling/heads/rec_sar_head.py (+11 −3)
- ppocr/modeling/necks/__init__.py (+3 −3)
- ppocr/modeling/necks/db_fpn.py (+250 −0)
ppocr/data/imaug/ssl_img_aug.py (new file, +60 −0)

```python
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import cv2
import numpy as np
import random
from PIL import Image

from .rec_img_aug import resize_norm_img


class SSLRotateResize(object):
    def __init__(self,
                 image_shape,
                 padding=False,
                 select_all=True,
                 mode="train",
                 **kwargs):
        self.image_shape = image_shape
        self.padding = padding
        self.select_all = select_all
        self.mode = mode

    def __call__(self, data):
        img = data["image"]

        data["image_r90"] = cv2.rotate(img, cv2.ROTATE_90_CLOCKWISE)
        data["image_r180"] = cv2.rotate(data["image_r90"],
                                        cv2.ROTATE_90_CLOCKWISE)
        data["image_r270"] = cv2.rotate(data["image_r180"],
                                        cv2.ROTATE_90_CLOCKWISE)

        images = []
        for key in ["image", "image_r90", "image_r180", "image_r270"]:
            images.append(
                resize_norm_img(
                    data.pop(key),
                    image_shape=self.image_shape,
                    padding=self.padding)[0])
        data["image"] = np.stack(images, axis=0)
        data["label"] = np.array(list(range(4)))
        if not self.select_all:
            data["image"] = data["image"][0::2]  # just choose 0 and 180
            data["label"] = data["label"][0:2]  # label needs to be continuous
        if self.mode == "test":
            data["image"] = data["image"][0]
            data["label"] = data["label"][0]
        return data
```
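To make the new transform's contract concrete, here is a minimal smoke-test sketch; the `image_shape` value and the random input image are illustrative assumptions, not values from this commit, and it assumes the PaddleOCR source tree is importable.

```python
# Minimal sketch (assumptions: PaddleOCR on sys.path; image_shape is illustrative).
import numpy as np
from ppocr.data.imaug.ssl_img_aug import SSLRotateResize

transform = SSLRotateResize(
    image_shape=[3, 48, 320], select_all=True, mode="train")
data = {"image": (np.random.rand(48, 320, 3) * 255).astype(np.uint8)}
data = transform(data)

# Train mode stacks the 0/90/180/270 rotations along a new leading axis and
# labels them 0..3, turning rotation prediction into a 4-way SSL task.
print(data["image"].shape)  # (4, 3, 48, 320)
print(data["label"])        # [0 1 2 3]
```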
ppocr/data/simple_dataset.py (+7 −3)

```diff
@@ -49,7 +49,8 @@ class SimpleDataSet(Dataset):
         if self.mode == "train" and self.do_shuffle:
             self.shuffle_data_random()
         self.ops = create_operators(dataset_config['transforms'], global_config)
+        self.ext_op_transform_idx = dataset_config.get("ext_op_transform_idx", 2)
         self.need_reset = True in [x < 1 for x in ratio_list]

     def get_image_info_list(self, file_list, ratio_list):
@@ -87,7 +88,7 @@ class SimpleDataSet(Dataset):
             if hasattr(op, 'ext_data_num'):
                 ext_data_num = getattr(op, 'ext_data_num')
                 break
-        load_data_ops = self.ops[:2]
+        load_data_ops = self.ops[:self.ext_op_transform_idx]
         ext_data = []

         while len(ext_data) < ext_data_num:
@@ -108,8 +109,11 @@ class SimpleDataSet(Dataset):
             data['image'] = img
             data = transform(data, load_data_ops)

-            if data is None or data['polys'].shape[1] != 4:
+            if data is None:
                 continue
+            if 'polys' in data.keys():
+                if data['polys'].shape[1] != 4:
+                    continue
             ext_data.append(data)
         return ext_data
```
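For context, the new `ext_op_transform_idx` key tells the dataset how many leading transforms count as pure data-loading ops when collecting extra samples (`ext_data`). A hedged sketch of where it sits in the dataset config; all keys other than `ext_op_transform_idx` are illustrative:

```python
# Illustrative dataset_config fragment (dict form of the YAML config).
# Only "ext_op_transform_idx" is the key introduced by this commit; with the
# default of 2, ext_data samples are produced by self.ops[:2] as before.
dataset_config = {
    "transforms": [
        {"DecodeImage": {"img_mode": "BGR", "channel_first": False}},
        {"DetLabelEncode": None},
        # ...augmentation ops that may request ext_data follow here...
    ],
    "ext_op_transform_idx": 2,
}
```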
ppocr/losses/__init__.py (+2 −1)

```diff
@@ -34,6 +34,7 @@ from .rec_nrtr_loss import NRTRLoss
 from .rec_sar_loss import SARLoss
 from .rec_aster_loss import AsterLoss
 from .rec_pren_loss import PRENLoss
+from .rec_multi_loss import MultiLoss

 # cls loss
 from .cls_loss import ClsLoss
@@ -60,7 +61,7 @@ def build_loss(config):
         'DBLoss', 'PSELoss', 'EASTLoss', 'SASTLoss', 'FCELoss', 'CTCLoss',
         'ClsLoss', 'AttentionLoss', 'SRNLoss', 'PGLoss', 'CombinedLoss',
         'NRTRLoss', 'TableAttentionLoss', 'SARLoss', 'AsterLoss', 'SDMGRLoss',
-        'VQASerTokenLayoutLMLoss', 'LossFromOutput', 'PRENLoss'
+        'VQASerTokenLayoutLMLoss', 'LossFromOutput', 'PRENLoss', 'MultiLoss'
     ]
     config = copy.deepcopy(config)
     module_name = config.pop('name')
```
ppocr/losses/basic_loss.py (+2 −2)

```diff
@@ -106,8 +106,8 @@ class DMLLoss(nn.Layer):

     def forward(self, out1, out2):
         if self.act is not None:
-            out1 = self.act(out1)
-            out2 = self.act(out2)
+            out1 = self.act(out1) + 1e-10
+            out2 = self.act(out2) + 1e-10
         if self.use_log:
             # for recognition distillation, log is needed for feature map
             log_out1 = paddle.log(out1)
```
ppocr/losses/combined_loss.py (+2 −0)

```diff
@@ -18,8 +18,10 @@ import paddle.nn as nn
 from .rec_ctc_loss import CTCLoss
 from .center_loss import CenterLoss
 from .ace_loss import ACELoss
+from .rec_sar_loss import SARLoss

 from .distillation_loss import DistillationCTCLoss
+from .distillation_loss import DistillationSARLoss
 from .distillation_loss import DistillationDMLLoss
 from .distillation_loss import DistillationDistanceLoss, DistillationDBLoss, DistillationDilaDBLoss
```
ppocr/losses/distillation_loss.py (+55 −3)

```diff
@@ -18,6 +18,7 @@ import numpy as np
 import cv2

 from .rec_ctc_loss import CTCLoss
+from .rec_sar_loss import SARLoss
 from .basic_loss import DMLLoss
 from .basic_loss import DistanceLoss
 from .det_db_loss import DBLoss
@@ -46,11 +47,15 @@ class DistillationDMLLoss(DMLLoss):
                  act=None,
                  use_log=False,
                  key=None,
+                 multi_head=False,
+                 dis_head='ctc',
                  maps_name=None,
                  name="dml"):
         super().__init__(act=act, use_log=use_log)
         assert isinstance(model_name_pairs, list)
         self.key = key
+        self.multi_head = multi_head
+        self.dis_head = dis_head
         self.model_name_pairs = self._check_model_name_pairs(model_name_pairs)
         self.name = name
         self.maps_name = self._check_maps_name(maps_name)
@@ -97,7 +102,11 @@ class DistillationDMLLoss(DMLLoss):
                 out2 = out2[self.key]

             if self.maps_name is None:
-                loss = super().forward(out1, out2)
+                if self.multi_head:
+                    loss = super().forward(out1[self.dis_head],
+                                           out2[self.dis_head])
+                else:
+                    loss = super().forward(out1, out2)
                 if isinstance(loss, dict):
                     for key in loss:
                         loss_dict["{}_{}_{}_{}".format(key, pair[0], pair[1],
@@ -123,11 +132,16 @@ class DistillationDMLLoss(DMLLoss):


 class DistillationCTCLoss(CTCLoss):
-    def __init__(self, model_name_list=[], key=None, name="loss_ctc"):
+    def __init__(self,
+                 model_name_list=[],
+                 key=None,
+                 multi_head=False,
+                 name="loss_ctc"):
         super().__init__()
         self.model_name_list = model_name_list
         self.key = key
         self.name = name
+        self.multi_head = multi_head

     def forward(self, predicts, batch):
         loss_dict = dict()
@@ -135,7 +149,45 @@ class DistillationCTCLoss(CTCLoss):
             out = predicts[model_name]
             if self.key is not None:
                 out = out[self.key]
-            loss = super().forward(out, batch)
+            if self.multi_head:
+                assert 'ctc' in out, 'multi head has multi out'
+                loss = super().forward(out['ctc'], batch[:2] + batch[3:])
+            else:
+                loss = super().forward(out, batch)
             if isinstance(loss, dict):
                 for key in loss:
                     loss_dict["{}_{}_{}".format(self.name, model_name,
                                                 idx)] = loss[key]
             else:
                 loss_dict["{}_{}".format(self.name, model_name)] = loss
         return loss_dict


+class DistillationSARLoss(SARLoss):
+    def __init__(self,
+                 model_name_list=[],
+                 key=None,
+                 multi_head=False,
+                 name="loss_sar",
+                 **kwargs):
+        ignore_index = kwargs.get('ignore_index', 92)
+        super().__init__(ignore_index=ignore_index)
+        self.model_name_list = model_name_list
+        self.key = key
+        self.name = name
+        self.multi_head = multi_head
+
+    def forward(self, predicts, batch):
+        loss_dict = dict()
+        for idx, model_name in enumerate(self.model_name_list):
+            out = predicts[model_name]
+            if self.key is not None:
+                out = out[self.key]
+            if self.multi_head:
+                assert 'sar' in out, 'multi head has multi out'
+                loss = super().forward(out['sar'], batch[:1] + batch[2:])
+            else:
+                loss = super().forward(out, batch)
+            if isinstance(loss, dict):
+                for key in loss:
+                    loss_dict["{}_{}_{}".format(self.name, model_name,
```

(The hunk is truncated here on the page.)
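The batch re-slicing in the two losses above is easy to misread, so here is a small self-contained illustration of what `batch[:2] + batch[3:]` and `batch[:1] + batch[2:]` select, given the batch layout documented in rec_multi_loss.py:

```python
# The multi-head batch layout is [image, label_ctc, label_sar, length, valid_ratio].
batch = ["image", "label_ctc", "label_sar", "length", "valid_ratio"]

ctc_batch = batch[:2] + batch[3:]  # drops label_sar for the CTC branch
sar_batch = batch[:1] + batch[2:]  # drops label_ctc for the SAR branch
print(ctc_batch)  # ['image', 'label_ctc', 'length', 'valid_ratio']
print(sar_batch)  # ['image', 'label_sar', 'length', 'valid_ratio']
```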
ppocr/losses/rec_multi_loss.py (new file, +58 −0)

```python
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
from paddle import nn

from .rec_ctc_loss import CTCLoss
from .rec_sar_loss import SARLoss


class MultiLoss(nn.Layer):
    def __init__(self, **kwargs):
        super().__init__()
        self.loss_funcs = {}
        self.loss_list = kwargs.pop('loss_config_list')
        self.weight_1 = kwargs.get('weight_1', 1.0)
        self.weight_2 = kwargs.get('weight_2', 1.0)
        self.gtc_loss = kwargs.get('gtc_loss', 'sar')
        for loss_info in self.loss_list:
            for name, param in loss_info.items():
                if param is not None:
                    kwargs.update(param)
                loss = eval(name)(**kwargs)
                self.loss_funcs[name] = loss

    def forward(self, predicts, batch):
        self.total_loss = {}
        total_loss = 0.0
        # batch [image, label_ctc, label_sar, length, valid_ratio]
        for name, loss_func in self.loss_funcs.items():
            if name == 'CTCLoss':
                loss = loss_func(predicts['ctc'],
                                 batch[:2] + batch[3:])['loss'] * self.weight_1
            elif name == 'SARLoss':
                loss = loss_func(predicts['sar'],
                                 batch[:1] + batch[2:])['loss'] * self.weight_2
            else:
                raise NotImplementedError(
                    '{} is not supported in MultiLoss yet'.format(name))
            self.total_loss[name] = loss
            total_loss += loss
        self.total_loss['loss'] = total_loss
        return self.total_loss
```
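A hedged usage sketch for the new loss; the config shape mirrors `loss_config_list` as consumed above, but the exact values are illustrative:

```python
# Sketch (assumes PaddleOCR importable). MultiLoss instantiates each entry of
# loss_config_list via eval(name)(**kwargs), so only CTCLoss/SARLoss are valid.
from ppocr.losses.rec_multi_loss import MultiLoss

loss_fn = MultiLoss(
    loss_config_list=[{"CTCLoss": None}, {"SARLoss": None}],
    weight_1=1.0,  # scale of the CTC term
    weight_2=1.0)  # scale of the SAR term
# loss_fn(predicts, batch) expects predicts['ctc'] and predicts['sar'] plus a
# batch of [image, label_ctc, label_sar, length, valid_ratio]; it returns a
# dict whose 'loss' entry is the weighted sum.
```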
ppocr/losses/rec_sar_loss.py (+2 −1)

```diff
@@ -9,8 +9,9 @@ from paddle import nn
 class SARLoss(nn.Layer):
     def __init__(self, **kwargs):
         super(SARLoss, self).__init__()
+        ignore_index = kwargs.get('ignore_index', 92)  # 6626
         self.loss_func = paddle.nn.loss.CrossEntropyLoss(
-            reduction="mean", ignore_index=92)
+            reduction="mean", ignore_index=ignore_index)

     def forward(self, predicts, batch):
         predict = predicts[:, :
```
ppocr/metrics/rec_metric.py (+9 −3)

```diff
@@ -17,9 +17,14 @@ import string

 class RecMetric(object):
-    def __init__(self, main_indicator='acc', is_filter=False, **kwargs):
+    def __init__(self,
+                 main_indicator='acc',
+                 is_filter=False,
+                 ignore_space=True,
+                 **kwargs):
         self.main_indicator = main_indicator
         self.is_filter = is_filter
+        self.ignore_space = ignore_space
         self.eps = 1e-5
         self.reset()
@@ -34,8 +39,9 @@ class RecMetric(object):
         all_num = 0
         norm_edit_dis = 0.0
         for (pred, pred_conf), (target, _) in zip(preds, labels):
-            pred = pred.replace(" ", "")
-            target = target.replace(" ", "")
+            if self.ignore_space:
+                pred = pred.replace(" ", "")
+                target = target.replace(" ", "")
             if self.is_filter:
                 pred = self._normalize_text(pred)
                 target = self._normalize_text(target)
```
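The effect of the new flag in plain Python — with `ignore_space=True` (the default, matching the old behavior) spaces are stripped before comparison, and turning it off makes them count:

```python
pred, target = "a b", "ab"

for ignore_space in (True, False):
    p, t = pred, target
    if ignore_space:
        p = p.replace(" ", "")
        t = t.replace(" ", "")
    print(ignore_space, p == t)
# True True   -> old default behavior, spaces ignored
# False False -> spaces now affect accuracy and edit distance
```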
ppocr/modeling/architectures/base_model.py (+5 −1)

```diff
@@ -83,7 +83,11 @@ class BaseModel(nn.Layer):
             y["neck_out"] = x
         if self.use_head:
             x = self.head(x, targets=data)
-        if isinstance(x, dict):
+        # for multi head, save ctc neck out for udml
+        if isinstance(x, dict) and 'ctc_neck' in x.keys():
+            y["neck_out"] = x["ctc_neck"]
+            y["head_out"] = x
+        elif isinstance(x, dict):
             y.update(x)
         else:
             y["head_out"] = x
```
ppocr/modeling/architectures/distillation_model.py (+2 −2)

```diff
@@ -53,8 +53,8 @@ class DistillationModel(nn.Layer):
             self.model_list.append(self.add_sublayer(key, model))
             self.model_name_list.append(key)

-    def forward(self, x):
+    def forward(self, x, data=None):
         result_dict = dict()
         for idx, model_name in enumerate(self.model_name_list):
-            result_dict[model_name] = self.model_list[idx](x)
+            result_dict[model_name] = self.model_list[idx](x, data)
         return result_dict
```
ppocr/modeling/backbones/__init__.py (+3 −1)

```diff
@@ -31,9 +31,11 @@ def build_backbone(config, model_type):
         from .rec_resnet_aster import ResNet_ASTER
         from .rec_micronet import MicroNet
         from .rec_efficientb3_pren import EfficientNetb3_PREN
+        from .rec_svtrnet import SVTRNet
         support_dict = [
             'MobileNetV1Enhance', 'MobileNetV3', 'ResNet', 'ResNetFPN', 'MTB',
-            "ResNet31", "ResNet_ASTER", 'MicroNet', 'EfficientNetb3_PREN'
+            "ResNet31", "ResNet_ASTER", 'MicroNet', 'EfficientNetb3_PREN',
+            'SVTRNet'
         ]
     elif model_type == "e2e":
         from .e2e_resnet_vd_pg import ResNet
```
ppocr/modeling/backbones/rec_mv1_enhance.py (+11 −4)

```diff
@@ -103,7 +103,12 @@ class DepthwiseSeparable(nn.Layer):

 class MobileNetV1Enhance(nn.Layer):
-    def __init__(self, in_channels=3, scale=0.5, **kwargs):
+    def __init__(self,
+                 in_channels=3,
+                 scale=0.5,
+                 last_conv_stride=1,
+                 last_pool_type='max',
+                 **kwargs):
         super().__init__()
         self.scale = scale
         self.block_list = []
@@ -200,7 +205,7 @@ class MobileNetV1Enhance(nn.Layer):
             num_filters1=1024,
             num_filters2=1024,
             num_groups=1024,
-            stride=1,
+            stride=last_conv_stride,
             dw_size=5,
             padding=2,
             use_se=True,
@@ -208,8 +213,10 @@ class MobileNetV1Enhance(nn.Layer):
         self.block_list.append(conv6)
         self.block_list = nn.Sequential(*self.block_list)

-        self.pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)
+        if last_pool_type == 'avg':
+            self.pool = nn.AvgPool2D(kernel_size=2, stride=2, padding=0)
+        else:
+            self.pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)
         self.out_channels = int(1024 * scale)

     def forward(self, inputs):
```
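A hedged instantiation sketch showing the two new knobs; the argument values below are illustrative (e.g. reducing the last stride keeps a wider feature map for recognition), not values taken from this diff:

```python
# Sketch (assumes paddle + PaddleOCR importable; values are assumptions).
import paddle
from ppocr.modeling.backbones.rec_mv1_enhance import MobileNetV1Enhance

backbone = MobileNetV1Enhance(
    in_channels=3,
    scale=0.5,
    last_conv_stride=1,    # keep resolution in the final depthwise block
    last_pool_type='avg')  # average pooling instead of the old fixed max pool
feats = backbone(paddle.randn([1, 3, 32, 320]))
print(feats.shape)
```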
ppocr/modeling/backbones/rec_svtrnet.py (new file, +597 −0)

```python
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import Callable
from paddle import ParamAttr
from paddle.nn.initializer import KaimingNormal
import numpy as np
import paddle
import paddle.nn as nn
from paddle.nn.initializer import TruncatedNormal, Constant, Normal

trunc_normal_ = TruncatedNormal(std=.02)
normal_ = Normal
zeros_ = Constant(value=0.)
ones_ = Constant(value=1.)


def drop_path(x, drop_prob=0., training=False):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...
    """
    if drop_prob == 0. or not training:
        return x
    keep_prob = paddle.to_tensor(1 - drop_prob)
    shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)
    random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)
    random_tensor = paddle.floor(random_tensor)  # binarize
    output = x.divide(keep_prob) * random_tensor
    return output


class ConvBNLayer(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=3,
                 stride=1,
                 padding=0,
                 bias_attr=False,
                 groups=1,
                 act=nn.GELU):
        super().__init__()
        self.conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            weight_attr=paddle.ParamAttr(
                initializer=nn.initializer.KaimingUniform()),
            bias_attr=bias_attr)
        self.norm = nn.BatchNorm2D(out_channels)
        self.act = act()

    def forward(self, inputs):
        out = self.conv(inputs)
        out = self.norm(out)
        out = self.act(out)
        return out


class DropPath(nn.Layer):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    """

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)


class Identity(nn.Layer):
    def __init__(self):
        super(Identity, self).__init__()

    def forward(self, input):
        return input


class Mlp(nn.Layer):
    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer=nn.GELU,
                 drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class ConvMixer(nn.Layer):
    def __init__(
            self,
            dim,
            num_heads=8,
            HW=[8, 25],
            local_k=[3, 3], ):
        super().__init__()
        self.HW = HW
        self.dim = dim
        self.local_mixer = nn.Conv2D(
            dim,
            dim,
            local_k,
            1, [local_k[0] // 2, local_k[1] // 2],
            groups=num_heads,
            weight_attr=ParamAttr(initializer=KaimingNormal()))

    def forward(self, x):
        h = self.HW[0]
        w = self.HW[1]
        x = x.transpose([0, 2, 1]).reshape([0, self.dim, h, w])
        x = self.local_mixer(x)
        x = x.flatten(2).transpose([0, 2, 1])
        return x


class Attention(nn.Layer):
    def __init__(self,
                 dim,
                 num_heads=8,
                 mixer='Global',
                 HW=[8, 25],
                 local_k=[7, 11],
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.HW = HW
        if HW is not None:
            H = HW[0]
            W = HW[1]
            self.N = H * W
            self.C = dim
        if mixer == 'Local' and HW is not None:
            hk = local_k[0]
            wk = local_k[1]
            mask = np.ones([H * W, H * W])
            for h in range(H):
                for w in range(W):
                    for kh in range(-(hk // 2), (hk // 2) + 1):
                        for kw in range(-(wk // 2), (wk // 2) + 1):
                            if H > (h + kh) >= 0 and W > (w + kw) >= 0:
                                mask[h * W + w][(h + kh) * W + (w + kw)] = 0
            mask_paddle = paddle.to_tensor(mask, dtype='float32')
            mask_inf = paddle.full([H * W, H * W], '-inf', dtype='float32')
            mask = paddle.where(mask_paddle < 1, mask_paddle, mask_inf)
            self.mask = mask.unsqueeze([0, 1])
        self.mixer = mixer

    def forward(self, x):
        if self.HW is not None:
            N = self.N
            C = self.C
        else:
            _, N, C = x.shape
        qkv = self.qkv(x).reshape((0, N, 3, self.num_heads, C //
                                   self.num_heads)).transpose((2, 0, 3, 1, 4))
        q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]

        attn = (q.matmul(k.transpose((0, 1, 3, 2))))
        if self.mixer == 'Local':
            attn += self.mask
        attn = nn.functional.softmax(attn, axis=-1)
        attn = self.attn_drop(attn)

        x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((0, N, C))
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class Block(nn.Layer):
    def __init__(self,
                 dim,
                 num_heads,
                 mixer='Global',
                 local_mixer=[7, 11],
                 HW=[8, 25],
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer='nn.LayerNorm',
                 epsilon=1e-6,
                 prenorm=True):
        super().__init__()
        if isinstance(norm_layer, str):
            self.norm1 = eval(norm_layer)(dim, epsilon=epsilon)
        elif isinstance(norm_layer, Callable):
            self.norm1 = norm_layer(dim)
        else:
            raise TypeError(
                "The norm_layer must be str or paddle.nn.layer.Layer class")
        if mixer == 'Global' or mixer == 'Local':
            self.mixer = Attention(
                dim,
                num_heads=num_heads,
                mixer=mixer,
                HW=HW,
                local_k=local_mixer,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                attn_drop=attn_drop,
                proj_drop=drop)
        elif mixer == 'Conv':
            self.mixer = ConvMixer(
                dim, num_heads=num_heads, HW=HW, local_k=local_mixer)
        else:
            raise TypeError("The mixer must be one of [Global, Local, Conv]")
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
        if isinstance(norm_layer, str):
            self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)
        elif isinstance(norm_layer, Callable):
            self.norm2 = norm_layer(dim)
        else:
            raise TypeError(
                "The norm_layer must be str or paddle.nn.layer.Layer class")
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp_ratio = mlp_ratio
        self.mlp = Mlp(in_features=dim,
                       hidden_features=mlp_hidden_dim,
                       act_layer=act_layer,
                       drop=drop)
        self.prenorm = prenorm

    def forward(self, x):
        if self.prenorm:
            x = self.norm1(x + self.drop_path(self.mixer(x)))
            x = self.norm2(x + self.drop_path(self.mlp(x)))
        else:
            x = x + self.drop_path(self.mixer(self.norm1(x)))
            x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x


class PatchEmbed(nn.Layer):
    """ Image to Patch Embedding
    """

    def __init__(self,
                 img_size=[32, 100],
                 in_channels=3,
                 embed_dim=768,
                 sub_num=2):
        super().__init__()
        num_patches = (img_size[1] // (2 ** sub_num)) * \
                      (img_size[0] // (2 ** sub_num))
        self.img_size = img_size
        self.num_patches = num_patches
        self.embed_dim = embed_dim
        self.norm = None
        if sub_num == 2:
            self.proj = nn.Sequential(
                ConvBNLayer(
                    in_channels=in_channels,
                    out_channels=embed_dim // 2,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    act=nn.GELU,
                    bias_attr=None),
                ConvBNLayer(
                    in_channels=embed_dim // 2,
                    out_channels=embed_dim,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    act=nn.GELU,
                    bias_attr=None))
        if sub_num == 3:
            self.proj = nn.Sequential(
                ConvBNLayer(
                    in_channels=in_channels,
                    out_channels=embed_dim // 4,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    act=nn.GELU,
                    bias_attr=None),
                ConvBNLayer(
                    in_channels=embed_dim // 4,
                    out_channels=embed_dim // 2,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    act=nn.GELU,
                    bias_attr=None),
                ConvBNLayer(
                    in_channels=embed_dim // 2,
                    out_channels=embed_dim,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    act=nn.GELU,
                    bias_attr=None))

    def forward(self, x):
        B, C, H, W = x.shape
        assert H == self.img_size[0] and W == self.img_size[1], \
            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
        x = self.proj(x).flatten(2).transpose((0, 2, 1))
        return x


class SubSample(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 types='Pool',
                 stride=[2, 1],
                 sub_norm='nn.LayerNorm',
                 act=None):
        super().__init__()
        self.types = types
        if types == 'Pool':
            self.avgpool = nn.AvgPool2D(
                kernel_size=[3, 5], stride=stride, padding=[1, 2])
            self.maxpool = nn.MaxPool2D(
                kernel_size=[3, 5], stride=stride, padding=[1, 2])
            self.proj = nn.Linear(in_channels, out_channels)
        else:
            self.conv = nn.Conv2D(
                in_channels,
                out_channels,
                kernel_size=3,
                stride=stride,
                padding=1,
                weight_attr=ParamAttr(initializer=KaimingNormal()))
        self.norm = eval(sub_norm)(out_channels)
        if act is not None:
            self.act = act()
        else:
            self.act = None

    def forward(self, x):
        if self.types == 'Pool':
            x1 = self.avgpool(x)
            x2 = self.maxpool(x)
            x = (x1 + x2) * 0.5
            out = self.proj(x.flatten(2).transpose((0, 2, 1)))
        else:
            x = self.conv(x)
            out = x.flatten(2).transpose((0, 2, 1))
        out = self.norm(out)
        if self.act is not None:
            out = self.act(out)
        return out


class SVTRNet(nn.Layer):
    def __init__(
            self,
            img_size=[32, 100],
            in_channels=3,
            embed_dim=[64, 128, 256],
            depth=[3, 6, 3],
            num_heads=[2, 4, 8],
            mixer=['Local'] * 6 + ['Global'] * 6,  # Local atten, Global atten, Conv
            local_mixer=[[7, 11], [7, 11], [7, 11]],
            patch_merging='Conv',  # Conv, Pool, None
            mlp_ratio=4,
            qkv_bias=True,
            qk_scale=None,
            drop_rate=0.,
            last_drop=0.1,
            attn_drop_rate=0.,
            drop_path_rate=0.1,
            norm_layer='nn.LayerNorm',
            sub_norm='nn.LayerNorm',
            epsilon=1e-6,
            out_channels=192,
            out_char_num=25,
            block_unit='Block',
            act='nn.GELU',
            last_stage=True,
            sub_num=2,
            prenorm=True,
            use_lenhead=False,
            **kwargs):
        super().__init__()
        self.img_size = img_size
        self.embed_dim = embed_dim
        self.out_channels = out_channels
        self.prenorm = prenorm
        patch_merging = None if patch_merging != 'Conv' and patch_merging != 'Pool' else patch_merging
        self.patch_embed = PatchEmbed(
            img_size=img_size,
            in_channels=in_channels,
            embed_dim=embed_dim[0],
            sub_num=sub_num)
        num_patches = self.patch_embed.num_patches
        self.HW = [img_size[0] // (2**sub_num), img_size[1] // (2**sub_num)]
        self.pos_embed = self.create_parameter(
            shape=[1, num_patches, embed_dim[0]], default_initializer=zeros_)
        self.add_parameter("pos_embed", self.pos_embed)
        self.pos_drop = nn.Dropout(p=drop_rate)
        Block_unit = eval(block_unit)

        dpr = np.linspace(0, drop_path_rate, sum(depth))
        self.blocks1 = nn.LayerList([
            Block_unit(
                dim=embed_dim[0],
                num_heads=num_heads[0],
                mixer=mixer[0:depth[0]][i],
                HW=self.HW,
                local_mixer=local_mixer[0],
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                act_layer=eval(act),
                attn_drop=attn_drop_rate,
                drop_path=dpr[0:depth[0]][i],
                norm_layer=norm_layer,
                epsilon=epsilon,
                prenorm=prenorm) for i in range(depth[0])
        ])
        if patch_merging is not None:
            self.sub_sample1 = SubSample(
                embed_dim[0],
                embed_dim[1],
                sub_norm=sub_norm,
                stride=[2, 1],
                types=patch_merging)
            HW = [self.HW[0] // 2, self.HW[1]]
        else:
            HW = self.HW
        self.patch_merging = patch_merging
        self.blocks2 = nn.LayerList([
            Block_unit(
                dim=embed_dim[1],
                num_heads=num_heads[1],
                mixer=mixer[depth[0]:depth[0] + depth[1]][i],
                HW=HW,
                local_mixer=local_mixer[1],
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                act_layer=eval(act),
                attn_drop=attn_drop_rate,
                drop_path=dpr[depth[0]:depth[0] + depth[1]][i],
                norm_layer=norm_layer,
                epsilon=epsilon,
                prenorm=prenorm) for i in range(depth[1])
        ])
        if patch_merging is not None:
            self.sub_sample2 = SubSample(
                embed_dim[1],
                embed_dim[2],
                sub_norm=sub_norm,
                stride=[2, 1],
                types=patch_merging)
            HW = [self.HW[0] // 4, self.HW[1]]
        else:
            HW = self.HW
        self.blocks3 = nn.LayerList([
            Block_unit(
                dim=embed_dim[2],
                num_heads=num_heads[2],
                mixer=mixer[depth[0] + depth[1]:][i],
                HW=HW,
                local_mixer=local_mixer[2],
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                act_layer=eval(act),
                attn_drop=attn_drop_rate,
                drop_path=dpr[depth[0] + depth[1]:][i],
                norm_layer=norm_layer,
                epsilon=epsilon,
                prenorm=prenorm) for i in range(depth[2])
        ])
        self.last_stage = last_stage
        if last_stage:
            self.avg_pool = nn.AdaptiveAvgPool2D([1, out_char_num])
            self.last_conv = nn.Conv2D(
                in_channels=embed_dim[2],
                out_channels=self.out_channels,
                kernel_size=1,
                stride=1,
                padding=0,
                bias_attr=False)
            self.hardswish = nn.Hardswish()
            self.dropout = nn.Dropout(p=last_drop, mode="downscale_in_infer")
        if not prenorm:
            self.norm = eval(norm_layer)(embed_dim[-1], epsilon=epsilon)
        self.use_lenhead = use_lenhead
        if use_lenhead:
            self.len_conv = nn.Linear(embed_dim[2], self.out_channels)
            self.hardswish_len = nn.Hardswish()
            self.dropout_len = nn.Dropout(
                p=last_drop, mode="downscale_in_infer")

        trunc_normal_(self.pos_embed)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight)
            if isinstance(m, nn.Linear) and m.bias is not None:
                zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            zeros_(m.bias)
            ones_(m.weight)

    def forward_features(self, x):
        x = self.patch_embed(x)
        x = x + self.pos_embed
        x = self.pos_drop(x)
        for blk in self.blocks1:
            x = blk(x)
        if self.patch_merging is not None:
            x = self.sub_sample1(
                x.transpose([0, 2, 1]).reshape(
                    [0, self.embed_dim[0], self.HW[0], self.HW[1]]))
        for blk in self.blocks2:
            x = blk(x)
        if self.patch_merging is not None:
            x = self.sub_sample2(
                x.transpose([0, 2, 1]).reshape(
                    [0, self.embed_dim[1], self.HW[0] // 2, self.HW[1]]))
        for blk in self.blocks3:
            x = blk(x)
        if not self.prenorm:
            x = self.norm(x)
        return x

    def forward(self, x):
        x = self.forward_features(x)
        if self.use_lenhead:
            len_x = self.len_conv(x.mean(1))
            len_x = self.dropout_len(self.hardswish_len(len_x))
        if self.last_stage:
            if self.patch_merging is not None:
                h = self.HW[0] // 4
            else:
                h = self.HW[0]
            x = self.avg_pool(
                x.transpose([0, 2, 1]).reshape(
                    [0, self.embed_dim[2], h, self.HW[1]]))
            x = self.last_conv(x)
            x = self.hardswish(x)
            x = self.dropout(x)
        if self.use_lenhead:
            return x, len_x
        return x
```
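A hedged smoke-test for the new backbone; with the default `img_size=[32, 100]`, `out_channels=192`, and `out_char_num=25`, the output shape below follows directly from the code above:

```python
# Sketch (assumes paddle + PaddleOCR importable).
import paddle
from ppocr.modeling.backbones.rec_svtrnet import SVTRNet

model = SVTRNet()  # default config: 3 stages, embed_dim [64, 128, 256]
x = paddle.randn([1, 3, 32, 100])
y = model(x)
print(y.shape)  # [1, 192, 1, 25]: last_stage pools to 1 x out_char_num and
                # projects embed_dim[2] -> out_channels with a 1x1 conv
```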
ppocr/modeling/heads/__init__.py (+3 −1)

```diff
@@ -32,6 +32,7 @@ def build_head(config):
     from .rec_sar_head import SARHead
     from .rec_aster_head import AsterHead
     from .rec_pren_head import PRENHead
+    from .rec_multi_head import MultiHead

     # cls head
     from .cls_head import ClsHead
@@ -44,7 +45,8 @@ def build_head(config):
     support_dict = [
         'DBHead', 'PSEHead', 'FCEHead', 'EASTHead', 'SASTHead', 'CTCHead',
         'ClsHead', 'AttentionHead', 'SRNHead', 'PGHead', 'Transformer',
-        'TableAttentionHead', 'SARHead', 'AsterHead', 'SDMGRHead', 'PRENHead'
+        'TableAttentionHead', 'SARHead', 'AsterHead', 'SDMGRHead', 'PRENHead',
+        'MultiHead'
     ]

     #table head
```
ppocr/modeling/heads/det_db_head.py (+8 −7)

```diff
@@ -31,13 +31,14 @@ def get_bias_attr(k):

 class Head(nn.Layer):
-    def __init__(self, in_channels, name_list):
+    def __init__(self, in_channels, name_list, kernel_list=[3, 2, 2], **kwargs):
         super(Head, self).__init__()

         self.conv1 = nn.Conv2D(
             in_channels=in_channels,
             out_channels=in_channels // 4,
-            kernel_size=3,
-            padding=1,
+            kernel_size=kernel_list[0],
+            padding=int(kernel_list[0] // 2),
             weight_attr=ParamAttr(),
             bias_attr=False)
         self.conv_bn1 = nn.BatchNorm(
@@ -50,7 +51,7 @@ class Head(nn.Layer):
         self.conv2 = nn.Conv2DTranspose(
             in_channels=in_channels // 4,
             out_channels=in_channels // 4,
-            kernel_size=2,
+            kernel_size=kernel_list[1],
             stride=2,
             weight_attr=ParamAttr(
                 initializer=paddle.nn.initializer.KaimingUniform()),
@@ -65,7 +66,7 @@ class Head(nn.Layer):
         self.conv3 = nn.Conv2DTranspose(
             in_channels=in_channels // 4,
             out_channels=1,
-            kernel_size=2,
+            kernel_size=kernel_list[2],
             stride=2,
             weight_attr=ParamAttr(
                 initializer=paddle.nn.initializer.KaimingUniform()),
@@ -100,8 +101,8 @@ class DBHead(nn.Layer):
             'conv2d_57', 'batch_norm_49', 'conv2d_transpose_2',
             'batch_norm_50', 'conv2d_transpose_3', 'thresh'
         ]
-        self.binarize = Head(in_channels, binarize_name_list)
-        self.thresh = Head(in_channels, thresh_name_list)
+        self.binarize = Head(in_channels, binarize_name_list, **kwargs)
+        self.thresh = Head(in_channels, thresh_name_list, **kwargs)

     def step_function(self, x, y):
         return paddle.reciprocal(1 + paddle.exp(-self.k * (x - y)))
```
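Since `DBHead` now forwards `**kwargs` into `Head`, a config can override the kernel schedule; a hedged one-liner where the `kernel_list` value is an illustrative assumption:

```python
# Sketch (assumes PaddleOCR importable); kernel_list=[7, 2, 2] enlarges the
# first conv to 7x7 while keeping both transposed convs at 2x2.
from ppocr.modeling.heads.det_db_head import DBHead

head = DBHead(in_channels=96, kernel_list=[7, 2, 2])
```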
ppocr/modeling/heads/rec_multi_head.py (new file, +73 −0)

```python
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math
import paddle
from paddle import ParamAttr
import paddle.nn as nn
import paddle.nn.functional as F

from ppocr.modeling.necks.rnn import Im2Seq, EncoderWithRNN, EncoderWithFC, SequenceEncoder, EncoderWithSVTR
from .rec_ctc_head import CTCHead
from .rec_sar_head import SARHead


class MultiHead(nn.Layer):
    def __init__(self, in_channels, out_channels_list, **kwargs):
        super().__init__()
        self.head_list = kwargs.pop('head_list')
        self.gtc_head = 'sar'
        assert len(self.head_list) >= 2
        for idx, head_name in enumerate(self.head_list):
            name = list(head_name)[0]
            if name == 'SARHead':
                # sar head
                sar_args = self.head_list[idx][name]
                self.sar_head = eval(name)(in_channels=in_channels, \
                    out_channels=out_channels_list['SARLabelDecode'], **sar_args)
            elif name == 'CTCHead':
                # ctc neck
                self.encoder_reshape = Im2Seq(in_channels)
                neck_args = self.head_list[idx][name]['Neck']
                encoder_type = neck_args.pop('name')
                self.encoder = encoder_type
                self.ctc_encoder = SequenceEncoder(in_channels=in_channels, \
                    encoder_type=encoder_type, **neck_args)
                # ctc head
                head_args = self.head_list[idx][name]['Head']
                self.ctc_head = eval(name)(in_channels=self.ctc_encoder.out_channels, \
                    out_channels=out_channels_list['CTCLabelDecode'], **head_args)
            else:
                raise NotImplementedError(
                    '{} is not supported in MultiHead yet'.format(name))

    def forward(self, x, targets=None):
        ctc_encoder = self.ctc_encoder(x)
        ctc_out = self.ctc_head(ctc_encoder, targets)
        head_out = dict()
        head_out['ctc'] = ctc_out
        head_out['ctc_neck'] = ctc_encoder
        # eval mode
        if not self.training:
            return ctc_out
        if self.gtc_head == 'sar':
            sar_out = self.sar_head(x, targets[1:])
            head_out['sar'] = sar_out
            return head_out
        else:
            return head_out
```
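A hedged sketch of the `head_list` structure MultiHead expects; the neck and head hyper-parameters below are illustrative, and only the overall shape (a CTCHead entry with Neck/Head sub-configs plus a SARHead entry) is dictated by the code above:

```python
# Illustrative head_list; MultiHead(in_channels, out_channels_list, **cfg)
# builds a CTC branch (SequenceEncoder neck + CTCHead) and a SAR branch.
cfg = {
    "head_list": [
        {"CTCHead": {
            "Neck": {"name": "svtr", "dims": 64, "depth": 2, "hidden_dims": 120},
            "Head": {"fc_decay": 1e-5},
        }},
        {"SARHead": {"enc_dim": 512, "max_text_length": 25}},
    ],
}
# out_channels_list must map 'CTCLabelDecode' and 'SARLabelDecode' to the
# respective vocabulary sizes; at inference only the CTC output is returned.
```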
ppocr/modeling/heads/rec_sar_head.py (+11 −3)

```diff
@@ -349,7 +349,10 @@ class ParallelSARDecoder(BaseDecoder):

 class SARHead(nn.Layer):
     def __init__(self,
+                 in_channels,
                  out_channels,
+                 enc_dim=512,
+                 max_text_length=30,
                  enc_bi_rnn=False,
                  enc_drop_rnn=0.1,
                  enc_gru=False,
@@ -358,14 +361,17 @@ class SARHead(nn.Layer):
                  dec_gru=False,
                  d_k=512,
                  pred_dropout=0.1,
-                 max_text_length=30,
                  pred_concat=True,
                  **kwargs):
         super(SARHead, self).__init__()

         # encoder module
         self.encoder = SAREncoder(
-            enc_bi_rnn=enc_bi_rnn, enc_drop_rnn=enc_drop_rnn, enc_gru=enc_gru)
+            enc_bi_rnn=enc_bi_rnn,
+            enc_drop_rnn=enc_drop_rnn,
+            enc_gru=enc_gru,
+            d_model=in_channels,
+            d_enc=enc_dim)

         # decoder module
         self.decoder = ParallelSARDecoder(
@@ -374,6 +380,8 @@ class SARHead(nn.Layer):
             dec_bi_rnn=dec_bi_rnn,
             dec_drop_rnn=dec_drop_rnn,
             dec_gru=dec_gru,
+            d_model=in_channels,
+            d_enc=enc_dim,
             d_k=d_k,
             pred_dropout=pred_dropout,
             max_text_length=max_text_length,
@@ -390,7 +398,7 @@ class SARHead(nn.Layer):
             label = paddle.to_tensor(label, dtype='int64')
             final_out = self.decoder(
                 feat, holistic_feat, label, img_metas=targets)
-        if not self.training:
+        else:
             final_out = self.decoder(
                 feat,
                 holistic_feat,
```
ppocr/modeling/necks/__init__.py (+3 −3)

```diff
@@ -16,7 +16,7 @@ __all__ = ['build_neck']


 def build_neck(config):
-    from .db_fpn import DBFPN
+    from .db_fpn import DBFPN, RSEFPN, LKPAN
     from .east_fpn import EASTFPN
     from .sast_fpn import SASTFPN
     from .rnn import SequenceEncoder
@@ -26,8 +26,8 @@ def build_neck(config):
     from .fce_fpn import FCEFPN
     from .pren_fpn import PRENFPN
     support_dict = [
-        'FPN', 'FCEFPN', 'DBFPN', 'EASTFPN', 'SASTFPN', 'SequenceEncoder',
-        'PGFPN', 'TableFPN', 'PRENFPN'
+        'FPN', 'FCEFPN', 'LKPAN', 'DBFPN', 'RSEFPN', 'EASTFPN', 'SASTFPN',
+        'SequenceEncoder', 'PGFPN', 'TableFPN', 'PRENFPN'
     ]

     module_name = config.pop('name')
```
ppocr/modeling/necks/db_fpn.py (+250 −0)

```diff
@@ -20,6 +20,88 @@ import paddle
 from paddle import nn
 import paddle.nn.functional as F
 from paddle import ParamAttr
+import os
+import sys
+
+__dir__ = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(__dir__)
+sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../../..')))
+
+from ppocr.modeling.backbones.det_mobilenet_v3 import SEModule
+
+
+class DSConv(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 padding,
+                 stride=1,
+                 groups=None,
+                 if_act=True,
+                 act="relu",
+                 **kwargs):
+        super(DSConv, self).__init__()
+        if groups == None:
+            groups = in_channels
+        self.if_act = if_act
+        self.act = act
+        self.conv1 = nn.Conv2D(
+            in_channels=in_channels,
+            out_channels=in_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            bias_attr=False)
+
+        self.bn1 = nn.BatchNorm(num_channels=in_channels, act=None)
+
+        self.conv2 = nn.Conv2D(
+            in_channels=in_channels,
+            out_channels=int(in_channels * 4),
+            kernel_size=1,
+            stride=1,
+            bias_attr=False)
+
+        self.bn2 = nn.BatchNorm(num_channels=int(in_channels * 4), act=None)
+
+        self.conv3 = nn.Conv2D(
+            in_channels=int(in_channels * 4),
+            out_channels=out_channels,
+            kernel_size=1,
+            stride=1,
+            bias_attr=False)
+        self._c = [in_channels, out_channels]
+        if in_channels != out_channels:
+            self.conv_end = nn.Conv2D(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=1,
+                stride=1,
+                bias_attr=False)
+
+    def forward(self, inputs):
+        x = self.conv1(inputs)
+        x = self.bn1(x)
+
+        x = self.conv2(x)
+        x = self.bn2(x)
+        if self.if_act:
+            if self.act == "relu":
+                x = F.relu(x)
+            elif self.act == "hardswish":
+                x = F.hardswish(x)
+            else:
+                print("The activation function({}) is selected incorrectly.".
+                      format(self.act))
+                exit()
+
+        x = self.conv3(x)
+        if self._c[0] != self._c[1]:
+            x = x + self.conv_end(inputs)
+        return x
+

 class DBFPN(nn.Layer):
@@ -106,3 +188,171 @@ class DBFPN(nn.Layer):
         fuse = paddle.concat([p5, p4, p3, p2], axis=1)
         return fuse
+
+
+class RSELayer(nn.Layer):
+    def __init__(self, in_channels, out_channels, kernel_size, shortcut=True):
+        super(RSELayer, self).__init__()
+        weight_attr = paddle.nn.initializer.KaimingUniform()
+        self.out_channels = out_channels
+        self.in_conv = nn.Conv2D(
+            in_channels=in_channels,
+            out_channels=self.out_channels,
+            kernel_size=kernel_size,
+            padding=int(kernel_size // 2),
+            weight_attr=ParamAttr(initializer=weight_attr),
+            bias_attr=False)
+        self.se_block = SEModule(self.out_channels)
+        self.shortcut = shortcut
+
+    def forward(self, ins):
+        x = self.in_conv(ins)
+        if self.shortcut:
+            out = x + self.se_block(x)
+        else:
+            out = self.se_block(x)
+        return out
+
+
+class RSEFPN(nn.Layer):
+    def __init__(self, in_channels, out_channels, shortcut=True, **kwargs):
+        super(RSEFPN, self).__init__()
+        self.out_channels = out_channels
+        self.ins_conv = nn.LayerList()
+        self.inp_conv = nn.LayerList()
+
+        for i in range(len(in_channels)):
+            self.ins_conv.append(
+                RSELayer(
+                    in_channels[i],
+                    out_channels,
+                    kernel_size=1,
+                    shortcut=shortcut))
+            self.inp_conv.append(
+                RSELayer(
+                    out_channels,
+                    out_channels // 4,
+                    kernel_size=3,
+                    shortcut=shortcut))
+
+    def forward(self, x):
+        c2, c3, c4, c5 = x
+
+        in5 = self.ins_conv[3](c5)
+        in4 = self.ins_conv[2](c4)
+        in3 = self.ins_conv[1](c3)
+        in2 = self.ins_conv[0](c2)
+
+        out4 = in4 + F.upsample(
+            in5, scale_factor=2, mode="nearest", align_mode=1)  # 1/16
+        out3 = in3 + F.upsample(
+            out4, scale_factor=2, mode="nearest", align_mode=1)  # 1/8
+        out2 = in2 + F.upsample(
+            out3, scale_factor=2, mode="nearest", align_mode=1)  # 1/4
+
+        p5 = self.inp_conv[3](in5)
+        p4 = self.inp_conv[2](out4)
+        p3 = self.inp_conv[1](out3)
+        p2 = self.inp_conv[0](out2)
+
+        p5 = F.upsample(p5, scale_factor=8, mode="nearest", align_mode=1)
+        p4 = F.upsample(p4, scale_factor=4, mode="nearest", align_mode=1)
+        p3 = F.upsample(p3, scale_factor=2, mode="nearest", align_mode=1)
+
+        fuse = paddle.concat([p5, p4, p3, p2], axis=1)
+        return fuse
+
+
+class LKPAN(nn.Layer):
+    def __init__(self, in_channels, out_channels, mode='large', **kwargs):
+        super(LKPAN, self).__init__()
+        self.out_channels = out_channels
+        weight_attr = paddle.nn.initializer.KaimingUniform()
+
+        self.ins_conv = nn.LayerList()
+        self.inp_conv = nn.LayerList()
+        # pan head
+        self.pan_head_conv = nn.LayerList()
+        self.pan_lat_conv = nn.LayerList()
+
+        if mode.lower() == 'lite':
+            p_layer = DSConv
+        elif mode.lower() == 'large':
+            p_layer = nn.Conv2D
+        else:
+            raise ValueError(
+                "mode can only be one of ['lite', 'large'], but received {}".
+                format(mode))
+
+        for i in range(len(in_channels)):
+            self.ins_conv.append(
+                nn.Conv2D(
+                    in_channels=in_channels[i],
+                    out_channels=self.out_channels,
+                    kernel_size=1,
+                    weight_attr=ParamAttr(initializer=weight_attr),
+                    bias_attr=False))
+
+            self.inp_conv.append(
+                p_layer(
+                    in_channels=self.out_channels,
+                    out_channels=self.out_channels // 4,
+                    kernel_size=9,
+                    padding=4,
+                    weight_attr=ParamAttr(initializer=weight_attr),
+                    bias_attr=False))
+
+            if i > 0:
+                self.pan_head_conv.append(
+                    nn.Conv2D(
+                        in_channels=self.out_channels // 4,
+                        out_channels=self.out_channels // 4,
+                        kernel_size=3,
+                        padding=1,
+                        stride=2,
+                        weight_attr=ParamAttr(initializer=weight_attr),
+                        bias_attr=False))
+            self.pan_lat_conv.append(
+                p_layer(
+                    in_channels=self.out_channels // 4,
+                    out_channels=self.out_channels // 4,
+                    kernel_size=9,
+                    padding=4,
+                    weight_attr=ParamAttr(initializer=weight_attr),
+                    bias_attr=False))
+
+    def forward(self, x):
+        c2, c3, c4, c5 = x
+
+        in5 = self.ins_conv[3](c5)
+        in4 = self.ins_conv[2](c4)
+        in3 = self.ins_conv[1](c3)
+        in2 = self.ins_conv[0](c2)
+
+        out4 = in4 + F.upsample(
+            in5, scale_factor=2, mode="nearest", align_mode=1)  # 1/16
+        out3 = in3 + F.upsample(
+            out4, scale_factor=2, mode="nearest", align_mode=1)  # 1/8
+        out2 = in2 + F.upsample(
+            out3, scale_factor=2, mode="nearest", align_mode=1)  # 1/4
+
+        f5 = self.inp_conv[3](in5)
+        f4 = self.inp_conv[2](out4)
+        f3 = self.inp_conv[1](out3)
+        f2 = self.inp_conv[0](out2)
+
+        pan3 = f3 + self.pan_head_conv[0](f2)
+        pan4 = f4 + self.pan_head_conv[1](pan3)
+        pan5 = f5 + self.pan_head_conv[2](pan4)
+
+        p2 = self.pan_lat_conv[0](f2)
+        p3 = self.pan_lat_conv[1](pan3)
+        p4 = self.pan_lat_conv[2](pan4)
+        p5 = self.pan_lat_conv[3](pan5)
+
+        p5 = F.upsample(p5, scale_factor=8, mode="nearest", align_mode=1)
+        p4 = F.upsample(p4, scale_factor=4, mode="nearest", align_mode=1)
+        p3 = F.upsample(p3, scale_factor=2, mode="nearest", align_mode=1)
+
+        fuse = paddle.concat([p5, p4, p3, p2], axis=1)
+        return fuse
```
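A hedged wiring sketch for the new necks against a 4-level backbone output; the channel counts are illustrative (roughly MobileNetV3-style detector features), not values from this diff:

```python
# Sketch (assumes paddle + PaddleOCR importable; channel counts are assumptions).
import paddle
from ppocr.modeling.necks.db_fpn import RSEFPN, LKPAN

feats = [
    paddle.randn([1, 16, 160, 160]),  # c2, 1/4 scale
    paddle.randn([1, 24, 80, 80]),    # c3, 1/8
    paddle.randn([1, 56, 40, 40]),    # c4, 1/16
    paddle.randn([1, 480, 20, 20]),   # c5, 1/32
]
neck = RSEFPN(in_channels=[16, 24, 56, 480], out_channels=96, shortcut=True)
fuse = neck(feats)
print(fuse.shape)  # [1, 96, 160, 160]: four 1/4-scale maps of 96 // 4 channels
                   # each, concatenated; LKPAN takes the same arguments plus
                   # mode='large' or mode='lite' (the latter uses DSConv).
```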