Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
paddle_dbnet
Commits
1f76f449
Commit
1f76f449
authored
Mar 08, 2021
by
Jethong
Browse files
Add PGNet
parent
1a087990
Changes
30
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2423 additions
and
13 deletions
+2423
-13
configs/det/det_r50_vd_sast_icdar15.yml
configs/det/det_r50_vd_sast_icdar15.yml
+3
-2
configs/e2e/e2e_r50_vd_pg.yml
configs/e2e/e2e_r50_vd_pg.yml
+122
-0
ppocr/data/__init__.py
ppocr/data/__init__.py
+3
-1
ppocr/data/imaug/__init__.py
ppocr/data/imaug/__init__.py
+1
-0
ppocr/data/imaug/label_ops.py
ppocr/data/imaug/label_ops.py
+19
-0
ppocr/data/imaug/operators.py
ppocr/data/imaug/operators.py
+71
-0
ppocr/data/imaug/pg_process.py
ppocr/data/imaug/pg_process.py
+921
-0
ppocr/data/pgnet_dataset.py
ppocr/data/pgnet_dataset.py
+164
-0
ppocr/losses/__init__.py
ppocr/losses/__init__.py
+3
-2
ppocr/losses/e2e_pg_loss.py
ppocr/losses/e2e_pg_loss.py
+219
-0
ppocr/metrics/__init__.py
ppocr/metrics/__init__.py
+2
-1
ppocr/metrics/e2e_metric.py
ppocr/metrics/e2e_metric.py
+87
-0
ppocr/metrics/eval_det_iou.py
ppocr/metrics/eval_det_iou.py
+4
-3
ppocr/modeling/backbones/__init__.py
ppocr/modeling/backbones/__init__.py
+3
-0
ppocr/modeling/backbones/e2e_resnet_vd_pg.py
ppocr/modeling/backbones/e2e_resnet_vd_pg.py
+267
-0
ppocr/modeling/heads/__init__.py
ppocr/modeling/heads/__init__.py
+3
-2
ppocr/modeling/heads/e2e_pg_head.py
ppocr/modeling/heads/e2e_pg_head.py
+249
-0
ppocr/modeling/necks/__init__.py
ppocr/modeling/necks/__init__.py
+3
-1
ppocr/modeling/necks/pg_fpn.py
ppocr/modeling/necks/pg_fpn.py
+277
-0
ppocr/postprocess/__init__.py
ppocr/postprocess/__init__.py
+2
-1
No files found.
configs/det/det_r50_vd_sast_icdar15.yml
View file @
1f76f449
...
@@ -20,6 +20,7 @@ Global:
...
@@ -20,6 +20,7 @@ Global:
infer_img
:
infer_img
:
save_res_path
:
./output/sast_r50_vd_ic15/predicts_sast.txt
save_res_path
:
./output/sast_r50_vd_ic15/predicts_sast.txt
Architecture
:
Architecture
:
model_type
:
det
model_type
:
det
algorithm
:
SAST
algorithm
:
SAST
...
...
configs/e2e/e2e_r50_vd_pg.yml
0 → 100644
View file @
1f76f449
Global
:
use_gpu
:
False
epoch_num
:
600
log_smooth_window
:
20
print_batch_step
:
2
save_model_dir
:
./output/pg_r50_vd_tt/
save_epoch_step
:
1
# evaluation is run every 1000 iterations, starting from iteration 0 (see eval_batch_step below)
eval_batch_step
:
[
0
,
1000
]
# if pretrained_model is saved in static mode, load_static_weights must set to True
load_static_weights
:
False
cal_metric_during_train
:
False
pretrained_model
:
checkpoints
:
save_inference_dir
:
use_visualdl
:
False
infer_img
:
save_res_path
:
./output/pg_r50_vd_tt/predicts_pg.txt
Architecture
:
model_type
:
e2e
algorithm
:
PG
Transform
:
Backbone
:
name
:
ResNet
layers
:
50
Neck
:
name
:
PGFPN
model_name
:
large
Head
:
name
:
PGHead
model_name
:
large
Loss
:
name
:
PGLoss
#Optimizer:
# name: Adam
# beta1: 0.9
# beta2: 0.999
# lr:
# name: Cosine
# learning_rate: 0.001
# warmup_epoch: 1
# regularizer:
# name: 'L2'
# factor: 0
Optimizer
:
name
:
RMSProp
lr
:
name
:
Piecewise
learning_rate
:
0.001
decay_epochs
:
[
40
,
80
,
120
,
160
,
200
]
values
:
[
0.001
,
0.00033
,
0.0001
,
0.000033
,
0.00001
]
regularizer
:
name
:
'
L2'
factor
:
0.00005
PostProcess
:
name
:
PGPostProcess
score_thresh
:
0.8
cover_thresh
:
0.1
nms_thresh
:
0.2
Metric
:
name
:
E2EMetric
main_indicator
:
hmean
Train
:
dataset
:
name
:
PGDateSet
label_file_list
:
ratio_list
:
data_format
:
textnet
# textnet/partvgg
Lexicon_Table
:
[
'
0'
,
'
1'
,
'
2'
,
'
3'
,
'
4'
,
'
5'
,
'
6'
,
'
7'
,
'
8'
,
'
9'
,
'
A'
,
'
B'
,
'
C'
,
'
D'
,
'
E'
,
'
F'
,
'
G'
,
'
H'
,
'
I'
,
'
J'
,
'
K'
,
'
L'
,
'
M'
,
'
N'
,
'
O'
,
'
P'
,
'
Q'
,
'
R'
,
'
S'
,
'
T'
,
'
U'
,
'
V'
,
'
W'
,
'
X'
,
'
Y'
,
'
Z'
]
transforms
:
-
DecodeImage
:
# load image
img_mode
:
BGR
channel_first
:
False
-
PGProcessTrain
:
batch_size
:
14
data_format
:
icdar
tcl_len
:
64
min_crop_size
:
24
min_text_size
:
4
max_text_size
:
512
-
KeepKeys
:
keep_keys
:
[
'
images'
,
'
tcl_maps'
,
'
tcl_label_maps'
,
'
border_maps'
,
'
direction_maps'
,
'
training_masks'
,
'
label_list'
,
'
pos_list'
,
'
pos_mask'
]
# dataloader will return list in this order
loader
:
shuffle
:
True
drop_last
:
True
batch_size_per_card
:
1
num_workers
:
8
Eval
:
dataset
:
name
:
PGDateSet
data_dir
:
./train_data/
label_file_list
:
transforms
:
-
DecodeImage
:
# load image
img_mode
:
BGR
channel_first
:
False
-
E2ELabelEncode
:
label_list
:
[
'
0'
,
'
1'
,
'
2'
,
'
3'
,
'
4'
,
'
5'
,
'
6'
,
'
7'
,
'
8'
,
'
9'
,
'
A'
,
'
B'
,
'
C'
,
'
D'
,
'
E'
,
'
F'
,
'
G'
,
'
H'
,
'
I'
,
'
J'
,
'
K'
,
'
L'
,
'
M'
,
'
N'
,
'
O'
,
'
P'
,
'
Q'
,
'
R'
,
'
S'
,
'
T'
,
'
U'
,
'
V'
,
'
W'
,
'
X'
,
'
Y'
,
'
Z'
]
-
E2EResizeForTest
:
valid_set
:
totaltext
max_side_len
:
768
-
NormalizeImage
:
scale
:
1./255.
mean
:
[
0.485
,
0.456
,
0.406
]
std
:
[
0.229
,
0.224
,
0.225
]
order
:
'
hwc'
-
ToCHWImage
:
-
KeepKeys
:
keep_keys
:
[
'
image'
,
'
shape'
,
'
polys'
,
'
strs'
,
'
tags'
]
loader
:
shuffle
:
False
drop_last
:
False
batch_size_per_card
:
1
# must be 1
num_workers
:
2
\ No newline at end of file
ppocr/data/__init__.py
View file @
1f76f449
...
@@ -34,6 +34,7 @@ import paddle.distributed as dist
...
@@ -34,6 +34,7 @@ import paddle.distributed as dist
from
ppocr.data.imaug
import
transform
,
create_operators
from
ppocr.data.imaug
import
transform
,
create_operators
from
ppocr.data.simple_dataset
import
SimpleDataSet
from
ppocr.data.simple_dataset
import
SimpleDataSet
from
ppocr.data.lmdb_dataset
import
LMDBDataSet
from
ppocr.data.lmdb_dataset
import
LMDBDataSet
from
ppocr.data.pgnet_dataset
import
PGDateSet
__all__
=
[
'build_dataloader'
,
'transform'
,
'create_operators'
]
__all__
=
[
'build_dataloader'
,
'transform'
,
'create_operators'
]
...
@@ -54,7 +55,8 @@ signal.signal(signal.SIGTERM, term_mp)
...
@@ -54,7 +55,8 @@ signal.signal(signal.SIGTERM, term_mp)
def
build_dataloader
(
config
,
mode
,
device
,
logger
,
seed
=
None
):
def
build_dataloader
(
config
,
mode
,
device
,
logger
,
seed
=
None
):
config
=
copy
.
deepcopy
(
config
)
config
=
copy
.
deepcopy
(
config
)
support_dict
=
[
'SimpleDataSet'
,
'LMDBDataSet'
]
support_dict
=
[
'SimpleDataSet'
,
'LMDBDateSet'
,
'PGDateSet'
]
module_name
=
config
[
mode
][
'dataset'
][
'name'
]
module_name
=
config
[
mode
][
'dataset'
][
'name'
]
assert
module_name
in
support_dict
,
Exception
(
assert
module_name
in
support_dict
,
Exception
(
'DataSet only support {}'
.
format
(
support_dict
))
'DataSet only support {}'
.
format
(
support_dict
))
...
...
ppocr/data/imaug/__init__.py
View file @
1f76f449
...
@@ -28,6 +28,7 @@ from .label_ops import *
...
@@ -28,6 +28,7 @@ from .label_ops import *
from
.east_process
import
*
from
.east_process
import
*
from
.sast_process
import
*
from
.sast_process
import
*
from
.pg_process
import
*
def
transform
(
data
,
ops
=
None
):
def
transform
(
data
,
ops
=
None
):
...
...
ppocr/data/imaug/label_ops.py
View file @
1f76f449
...
@@ -34,6 +34,25 @@ class ClsLabelEncode(object):
...
@@ -34,6 +34,25 @@ class ClsLabelEncode(object):
return
data
return
data
class E2ELabelEncode(object):
    """Encode end-to-end text labels as fixed-length index sequences.

    Each string in ``data['strs']`` is upper-cased, mapped to indices in
    ``label_list`` (characters missing from the table are silently dropped),
    and right-padded with ``PAD_INDEX`` up to ``MAX_LEN``. The list of
    sequences replaces ``data['strs']`` as a numpy array.
    """

    # Padding index; equals the size of the 36-char [0-9A-Z] lexicon used by PGNet.
    PAD_INDEX = 36
    # Fixed output sequence length.
    # NOTE(review): a text with more than MAX_LEN in-lexicon chars would produce a
    # longer-than-MAX_LEN row (negative pad count) — presumably inputs are shorter;
    # TODO confirm against the dataset.
    MAX_LEN = 50

    def __init__(self, label_list, **kwargs):
        # label_list: ordered character table; a char's label is its position here.
        self.label_list = label_list

    def __call__(self, data):
        """Replace data['strs'] (list of str) with an (n, MAX_LEN) int array."""
        # Fix: the original also initialized a throwaway `temp_text` list here.
        encoded = []
        for text in data['strs']:
            indices = [
                self.label_list.index(c) for c in text.upper()
                if c in self.label_list
            ]
            indices = indices + [self.PAD_INDEX] * (self.MAX_LEN - len(indices))
            encoded.append(indices)
        data['strs'] = np.array(encoded)
        return data
class
DetLabelEncode
(
object
):
class
DetLabelEncode
(
object
):
def
__init__
(
self
,
**
kwargs
):
def
__init__
(
self
,
**
kwargs
):
pass
pass
...
...
ppocr/data/imaug/operators.py
View file @
1f76f449
...
@@ -223,3 +223,74 @@ class DetResizeForTest(object):
...
@@ -223,3 +223,74 @@ class DetResizeForTest(object):
ratio_w
=
resize_w
/
float
(
w
)
ratio_w
=
resize_w
/
float
(
w
)
return
img
,
[
ratio_h
,
ratio_w
]
return
img
,
[
ratio_h
,
ratio_w
]
class E2EResizeForTest(object):
    """Resize an image for end-to-end inference so both sides are multiples of 128.

    Two strategies are selected by ``valid_set``: the Total-Text variant scales
    by a fixed 1.25 (capped by ``max_side_len`` on the height), while the
    generic variant scales the longer side down to ``max_side_len``.
    """

    def __init__(self, **kwargs):
        super(E2EResizeForTest, self).__init__()
        # Both keys are required; a missing one raises KeyError at construction.
        self.max_side_len = kwargs['max_side_len']
        self.valid_set = kwargs['valid_set']

    def __call__(self, data):
        """Resize data['image'] and record [src_h, src_w, ratio_h, ratio_w] in data['shape']."""
        img = data['image']
        src_h, src_w, _ = img.shape
        if self.valid_set == 'totaltext':
            im_resized, [ratio_h, ratio_w] = self.resize_image_for_totaltext(
                img, max_side_len=self.max_side_len)
        else:
            im_resized, (ratio_h, ratio_w) = self.resize_image(
                img, max_side_len=self.max_side_len)
        data['image'] = im_resized
        data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w])
        return data

    def resize_image_for_totaltext(self, im, max_side_len=512):
        """Resize with a fixed 1.25 upscale, capped so h*1.25 <= max_side_len.

        NOTE(review): when h * 1.25 <= max_side_len the fixed 1.25 ratio is used
        as-is, so the width may still exceed max_side_len — presumably intended
        for Total-Text image sizes; confirm before reusing elsewhere.
        Both sides are then rounded UP to a multiple of 128.
        """
        h, w, _ = im.shape
        resize_w = w
        resize_h = h
        ratio = 1.25
        if h * ratio > max_side_len:
            ratio = float(max_side_len) / resize_h
        resize_h = int(resize_h * ratio)
        resize_w = int(resize_w * ratio)

        max_stride = 128
        # Round up to the next multiple of the network stride.
        resize_h = (resize_h + max_stride - 1) // max_stride * max_stride
        resize_w = (resize_w + max_stride - 1) // max_stride * max_stride
        im = cv2.resize(im, (int(resize_w), int(resize_h)))
        ratio_h = resize_h / float(h)
        ratio_w = resize_w / float(w)
        return im, (ratio_h, ratio_w)

    def resize_image(self, im, max_side_len=512):
        """
        resize image to a size multiple of max_stride which is required by the network
        :param im: the resized image
        :param max_side_len: limit of max image size to avoid out of memory in gpu
        :return: the resized image and the resize ratio
        """
        h, w, _ = im.shape

        resize_w = w
        resize_h = h

        # Fix the longer side to max_side_len.
        if resize_h > resize_w:
            ratio = float(max_side_len) / resize_h
        else:
            ratio = float(max_side_len) / resize_w

        resize_h = int(resize_h * ratio)
        resize_w = int(resize_w * ratio)

        max_stride = 128
        # Round up to the next multiple of the network stride.
        resize_h = (resize_h + max_stride - 1) // max_stride * max_stride
        resize_w = (resize_w + max_stride - 1) // max_stride * max_stride
        im = cv2.resize(im, (int(resize_w), int(resize_h)))
        ratio_h = resize_h / float(h)
        ratio_w = resize_w / float(w)

        return im, (ratio_h, ratio_w)
ppocr/data/imaug/pg_process.py
0 → 100644
View file @
1f76f449
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
math
import
cv2
import
numpy
as
np
import
os
__all__
=
[
'PGProcessTrain'
]
class
PGProcessTrain
(
object
):
def
__init__
(
self
,
batch_size
=
14
,
data_format
=
'icdar'
,
tcl_len
=
64
,
min_crop_size
=
24
,
min_text_size
=
10
,
max_text_size
=
512
,
**
kwargs
):
self
.
batch_size
=
batch_size
self
.
data_format
=
data_format
self
.
tcl_len
=
tcl_len
self
.
min_crop_size
=
min_crop_size
self
.
min_text_size
=
min_text_size
self
.
max_text_size
=
max_text_size
self
.
Lexicon_Table
=
[
'0'
,
'1'
,
'2'
,
'3'
,
'4'
,
'5'
,
'6'
,
'7'
,
'8'
,
'9'
,
'A'
,
'B'
,
'C'
,
'D'
,
'E'
,
'F'
,
'G'
,
'H'
,
'I'
,
'J'
,
'K'
,
'L'
,
'M'
,
'N'
,
'O'
,
'P'
,
'Q'
,
'R'
,
'S'
,
'T'
,
'U'
,
'V'
,
'W'
,
'X'
,
'Y'
,
'Z'
]
self
.
img_id
=
0
def quad_area(self, poly):
    """Signed area of a 4-point polygon.

    Uses the trapezoid form of the shoelace formula; the sign encodes the
    winding direction of the points.
    """
    total = 0.0
    for i in range(4):
        nxt = (i + 1) % 4
        total += (poly[nxt][0] - poly[i][0]) * (poly[nxt][1] + poly[i][1])
    return total / 2.
def gen_quad_from_poly(self, poly):
    """Generate the min-area quad enclosing ``poly``.

    Fits a rotated rectangle with cv2.minAreaRect, then rotates its four
    corners so corner 0 is closest (in summed distance over four anchor
    points of the poly) to the poly's own starting corner.

    Fixes vs. original: removed a vestigial ``if True:`` wrapper and the
    unused ``center_point`` local.
    """
    point_num = poly.shape[0]
    min_area_quad = np.zeros((4, 2), dtype=np.float32)
    # (center (x,y), (width, height), angle of rotation)
    rect = cv2.minAreaRect(poly.astype(np.int32))
    box = np.array(cv2.boxPoints(rect))

    first_point_idx = 0
    min_dist = 1e4
    for i in range(4):
        # Compare box corners against the poly's four "anchor" vertices:
        # first point, end of top edge, start of bottom edge, last point.
        dist = np.linalg.norm(box[(i + 0) % 4] - poly[0]) + \
               np.linalg.norm(box[(i + 1) % 4] - poly[point_num // 2 - 1]) + \
               np.linalg.norm(box[(i + 2) % 4] - poly[point_num // 2]) + \
               np.linalg.norm(box[(i + 3) % 4] - poly[-1])
        if dist < min_dist:
            min_dist = dist
            first_point_idx = i
    for i in range(4):
        min_area_quad[i] = box[(first_point_idx + i) % 4]

    return min_area_quad
def check_and_validate_polys(self, polys, tags, xxx_todo_changeme):
    """
    check so that the text poly is in the same direction,
    and also filter some invalid polygons

    :param polys: (n, points, 2) array of text polygons
    :param tags: per-polygon ignore flags (True = ignore)
    :param xxx_todo_changeme: (h, w) image size tuple
        NOTE(review): parameter name is a 2to3 conversion artifact; renaming
        it would break keyword callers, so it is kept as-is.
    :return: (validated_polys, validated_tags, hv_tags) as numpy arrays;
        hv_tag is 1 for roughly-horizontal text, 0 for vertical
        (width*2 < height).
    """
    (h, w) = xxx_todo_changeme
    if polys.shape[0] == 0:
        return polys, np.array([]), np.array([])
    # Clamp all coordinates into the image bounds.
    polys[:, :, 0] = np.clip(polys[:, :, 0], 0, w - 1)
    polys[:, :, 1] = np.clip(polys[:, :, 1], 0, h - 1)

    validated_polys = []
    validated_tags = []
    hv_tags = []
    for poly, tag in zip(polys, tags):
        quad = self.gen_quad_from_poly(poly)
        p_area = self.quad_area(quad)
        # Degenerate (near-zero area) polygons are dropped entirely.
        if abs(p_area) < 1:
            print('invalid poly')
            continue
        if p_area > 0:
            # Positive area here means the points run in the wrong direction;
            # mark as ignored and reverse the ordering.
            if tag == False:
                print('poly in wrong direction')
                tag = True  # reversed cases should be ignored
            # NOTE(review): this reversal index hard-codes 16-point polygons —
            # TODO confirm all inputs have 16 points.
            poly = poly[(0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
                         1), :]
            quad = quad[(0, 3, 2, 1), :]

        # Average top+bottom edge length vs. average left+right edge length.
        len_w = np.linalg.norm(quad[0] - quad[1]) + np.linalg.norm(quad[3] -
                                                                   quad[2])
        len_h = np.linalg.norm(quad[0] - quad[3]) + np.linalg.norm(quad[1] -
                                                                   quad[2])
        hv_tag = 1
        if len_w * 2.0 < len_h:
            hv_tag = 0

        validated_polys.append(poly)
        validated_tags.append(tag)
        hv_tags.append(hv_tag)
    return np.array(validated_polys), np.array(validated_tags), np.array(
        hv_tags)
def crop_area(self,
              im,
              polys,
              tags,
              hv_tags,
              txts,
              crop_background=False,
              max_tries=25):
    """
    make random crop from the input image
    :param im: input image (h, w, c)
    :param polys: text polygons, shape [b, 4, 2]
    :param tags: per-polygon ignore flags
    :param hv_tags: per-polygon horizontal/vertical flags
    :param txts: per-polygon text strings
    :param crop_background: if True, allow returning a crop containing no text
    :param max_tries: number of random attempts (50 -> 25)
    :return: (im, polys, tags, hv_tags, txts), cropped and filtered; the
        original inputs are returned unchanged when no valid crop is found.
    """
    h, w, _ = im.shape
    # Pad the occupancy arrays so crop edges may fall slightly outside the image.
    pad_h = h // 10
    pad_w = w // 10
    h_array = np.zeros((h + pad_h * 2), dtype=np.int32)
    w_array = np.zeros((w + pad_w * 2), dtype=np.int32)
    # Mark rows/columns covered by any text polygon.
    for poly in polys:
        poly = np.round(poly, decimals=0).astype(np.int32)
        minx = np.min(poly[:, 0])
        maxx = np.max(poly[:, 0])
        w_array[minx + pad_w:maxx + pad_w] = 1
        miny = np.min(poly[:, 1])
        maxy = np.max(poly[:, 1])
        h_array[miny + pad_h:maxy + pad_h] = 1
    # ensure the cropped area does not cross a text region
    h_axis = np.where(h_array == 0)[0]
    w_axis = np.where(w_array == 0)[0]
    if len(h_axis) == 0 or len(w_axis) == 0:
        # Text covers every row or every column; cropping is impossible.
        return im, polys, tags, hv_tags, txts
    for i in range(max_tries):
        # Pick two free columns / two free rows as candidate crop bounds.
        xx = np.random.choice(w_axis, size=2)
        xmin = np.min(xx) - pad_w
        xmax = np.max(xx) - pad_w
        xmin = np.clip(xmin, 0, w - 1)
        xmax = np.clip(xmax, 0, w - 1)
        yy = np.random.choice(h_axis, size=2)
        ymin = np.min(yy) - pad_h
        ymax = np.max(yy) - pad_h
        ymin = np.clip(ymin, 0, h - 1)
        ymax = np.clip(ymax, 0, h - 1)
        if xmax - xmin < self.min_crop_size or \
                ymax - ymin < self.min_crop_size:
            continue
        if polys.shape[0] != 0:
            # A polygon is kept only if all four of its points fall inside the crop.
            poly_axis_in_area = (polys[:, :, 0] >= xmin) & (polys[:, :, 0] <= xmax) \
                                & (polys[:, :, 1] >= ymin) & (polys[:, :, 1] <= ymax)
            selected_polys = np.where(
                np.sum(poly_axis_in_area, axis=1) == 4)[0]
        else:
            selected_polys = []
        if len(selected_polys) == 0:
            # no text in this area
            if crop_background:
                # NOTE: selected_polys is empty here, so txts becomes [].
                txts_tmp = []
                for selected_poly in selected_polys:
                    txts_tmp.append(txts[selected_poly])
                txts = txts_tmp
                return im[ymin:ymax + 1, xmin:xmax + 1, :], \
                       polys[selected_polys], tags[selected_polys], hv_tags[selected_polys], txts
            else:
                continue
        im = im[ymin:ymax + 1, xmin:xmax + 1, :]
        polys = polys[selected_polys]
        tags = tags[selected_polys]
        hv_tags = hv_tags[selected_polys]
        txts_tmp = []
        for selected_poly in selected_polys:
            txts_tmp.append(txts[selected_poly])
        txts = txts_tmp
        # Shift polygon coordinates into the crop's frame.
        polys[:, :, 0] -= xmin
        polys[:, :, 1] -= ymin
        return im, polys, tags, hv_tags, txts
    # All attempts failed: return the inputs untouched.
    return im, polys, tags, hv_tags, txts
def fit_and_gather_tcl_points_v2(self,
                                 min_area_quad,
                                 poly,
                                 max_h,
                                 max_w,
                                 fixed_point_num=64,
                                 img_id=0,
                                 reference_height=3):
    """
    Find the center point of poly as key_points, then fit and gather.

    Rasterizes the polygon's center line, orders its pixels along the text
    direction derived from min_area_quad, subsamples to at most
    fixed_point_num points, optionally jitters them vertically, and pads to
    self.tcl_len rows of (img_id, y, x) with a matching validity mask.

    :return: (pos_l, pos_m) — int32 (tcl_len, 3) point list and float32
        (tcl_len, 1) mask (1.0 for valid rows).
    """
    key_point_xys = []
    point_num = poly.shape[0]
    # Center line: midpoints of each top/bottom point pair.
    for idx in range(point_num // 2):
        center_point = (poly[idx] + poly[point_num - 1 - idx]) / 2.0
        key_point_xys.append(center_point)

    # Rasterize the center polyline to recover all covered pixels.
    tmp_image = np.zeros(
        shape=(
            max_h,
            max_w, ), dtype='float32')
    cv2.polylines(tmp_image, [np.array(key_point_xys).astype('int32')],
                  False, 1.0)
    ys, xs = np.where(tmp_image > 0)
    xy_text = np.array(list(zip(xs, ys)), dtype='float32')

    # left_center_pt = np.array(key_point_xys[0]).reshape(1, 2)
    # right_center_pt = np.array(key_point_xys[-1]).reshape(1, 2)
    # NOTE(review): these use a DIFFERENCE of quad corners, not a midpoint
    # (sum/2) — looks suspicious but is kept as-is; only the direction of
    # (right - left) matters for the ordering below. TODO confirm.
    left_center_pt = ((min_area_quad[0] - min_area_quad[1]) /
                      2.0).reshape(1, 2)
    right_center_pt = ((min_area_quad[1] - min_area_quad[2]) /
                       2.0).reshape(1, 2)
    proj_unit_vec = (right_center_pt - left_center_pt) / (
        np.linalg.norm(right_center_pt - left_center_pt) + 1e-6)

    proj_unit_vec_tile = np.tile(proj_unit_vec,
                                 (xy_text.shape[0], 1))  # (n, 2)
    left_center_pt_tile = np.tile(left_center_pt,
                                  (xy_text.shape[0], 1))  # (n, 2)
    xy_text_to_left_center = xy_text - left_center_pt_tile
    # Sort pixels by their projection onto the text direction.
    proj_value = np.sum(xy_text_to_left_center * proj_unit_vec_tile, axis=1)
    xy_text = xy_text[np.argsort(proj_value)]

    # convert to np and keep the num of points not greater than fixed_point_num
    pos_info = np.array(xy_text).reshape(-1, 2)[:, ::-1]  # xy -> yx
    point_num = len(pos_info)
    if point_num > fixed_point_num:
        # Evenly subsample down to fixed_point_num points.
        keep_ids = [
            int((point_num * 1.0 / fixed_point_num) * x)
            for x in range(fixed_point_num)
        ]
        pos_info = pos_info[keep_ids, :]

    keep = int(min(len(pos_info), fixed_point_num))
    # Random vertical jitter (augmentation) for sufficiently tall text.
    if np.random.rand() < 0.2 and reference_height >= 3:
        dl = (np.random.rand(keep) - 0.5) * reference_height * 0.3
        random_float = np.array([1, 0]).reshape([1, 2]) * dl.reshape(
            [keep, 1])
        pos_info += random_float
        pos_info[:, 0] = np.clip(pos_info[:, 0], 0, max_h - 1)
        pos_info[:, 1] = np.clip(pos_info[:, 1], 0, max_w - 1)

    # padding to fixed length
    pos_l = np.zeros((self.tcl_len, 3), dtype=np.int32)
    pos_l[:, 0] = np.ones((self.tcl_len, )) * img_id
    pos_m = np.zeros((self.tcl_len, 1), dtype=np.float32)
    pos_l[:keep, 1:] = np.round(pos_info).astype(np.int32)
    pos_m[:keep] = 1.0
    return pos_l, pos_m
def generate_direction_map(self, poly_quads, n_char, direction_map):
    """Paint per-pixel text-direction labels for one text instance.

    For each quad of the polygon, fills it in ``direction_map`` with
    (dx, dy, 1/avg_height) where (dx, dy) is the quad's reading-direction
    vector scaled to the average per-character width.
    """
    width_list = []
    height_list = []
    for quad in poly_quads:
        # Average of top and bottom edge lengths / left and right edge lengths.
        quad_w = (np.linalg.norm(quad[0] - quad[1]) +
                  np.linalg.norm(quad[2] - quad[3])) / 2.0
        quad_h = (np.linalg.norm(quad[0] - quad[3]) +
                  np.linalg.norm(quad[2] - quad[1])) / 2.0
        width_list.append(quad_w)
        height_list.append(quad_h)
    # Per-character width across the whole instance (floored at 1.0).
    norm_width = max(sum(width_list) / n_char, 1.0)
    average_height = max(sum(height_list) / len(height_list), 1.0)

    for quad in poly_quads:
        # Vector from the quad's left edge midpoint to its right edge midpoint.
        direct_vector_full = (
            (quad[1] + quad[2]) - (quad[0] + quad[3])) / 2.0
        direct_vector = direct_vector_full / (
            np.linalg.norm(direct_vector_full) + 1e-6) * norm_width
        direction_label = tuple(
            map(float,
                [direct_vector[0], direct_vector[1], 1.0 / average_height]))
        cv2.fillPoly(direction_map,
                     quad.round().astype(np.int32)[np.newaxis, :, :],
                     direction_label)
    return direction_map
def calculate_average_height(self, poly_quads):
    """Mean quad height over all quads, floored at 1.0.

    A quad's height is the average of its left edge (p0-p3) and right
    edge (p2-p1) lengths.
    """
    heights = [
        (np.linalg.norm(quad[0] - quad[3]) +
         np.linalg.norm(quad[2] - quad[1])) / 2.0
        for quad in poly_quads
    ]
    return max(sum(heights) / len(heights), 1.0)
def encode(self, text):
    """Map each character of ``text`` to its label via ``self.dict``.

    Characters absent from the dict are skipped; returns None when nothing
    remains.

    NOTE(review): ``self.dict`` is never assigned in ``__init__`` — calling
    this would raise AttributeError. The only call site in this file is
    commented out, so this method appears to be dead code.
    """
    text_list = []
    for char in text:
        if char not in self.dict:
            continue
        text_list.append([self.dict[char]])
    if len(text_list) == 0:
        return None
    return text_list
def generate_tcl_ctc_label(self,
                           h,
                           w,
                           polys,
                           tags,
                           text_strs,
                           ds_ratio,
                           tcl_ratio=0.3,
                           shrink_ratio_of_width=0.15):
    """Generate all PGNet training maps for one image.

    Produces, at the downscaled (h*ds_ratio, w*ds_ratio) resolution:
    score_map (text-center-line mask), score_label_map (instance ids),
    tbo_map (border offsets, 5 channels), direction_map (3 channels),
    training_mask, plus per-instance gathered point lists
    (pos_list/pos_mask), label index lists, and the instance-id -> label
    mapping used for refinement.
    """
    # Full-resolution TCL mask; later downscaled to smooth the center lines.
    score_map_big = np.zeros(
        (
            h,
            w, ), dtype=np.float32)
    # From here on h, w and polys are in the downscaled frame.
    h, w = int(h * ds_ratio), int(w * ds_ratio)
    polys = polys * ds_ratio

    score_map = np.zeros(
        (
            h,
            w, ), dtype=np.float32)
    score_label_map = np.zeros(
        (
            h,
            w, ), dtype=np.float32)
    tbo_map = np.zeros((h, w, 5), dtype=np.float32)
    training_mask = np.ones(
        (
            h,
            w, ), dtype=np.float32)
    # Default direction label is (0, 0, 1).
    direction_map = np.ones((h, w, 3)) * np.array([0, 0, 1]).reshape(
        [1, 1, 3]).astype(np.float32)

    label_idx = 0
    score_label_map_text_label_list = []
    pos_list, pos_mask, label_list = [], [], []
    for poly_idx, poly_tag in enumerate(zip(polys, tags)):
        poly = poly_tag[0]
        tag = poly_tag[1]

        # generate min_area_quad
        min_area_quad, center_point = self.gen_min_area_quad_from_poly(poly)
        min_area_quad_h = 0.5 * (
            np.linalg.norm(min_area_quad[0] - min_area_quad[3]) +
            np.linalg.norm(min_area_quad[1] - min_area_quad[2]))
        min_area_quad_w = 0.5 * (
            np.linalg.norm(min_area_quad[0] - min_area_quad[1]) +
            np.linalg.norm(min_area_quad[2] - min_area_quad[3]))

        # Skip instances outside the configured size range (after downscale).
        if min(min_area_quad_h, min_area_quad_w) < self.min_text_size * ds_ratio \
                or min(min_area_quad_h, min_area_quad_w) > self.max_text_size * ds_ratio:
            continue

        if tag:
            # Ignored ("don't care") instance: down-weight it in the training mask.
            cv2.fillPoly(training_mask,
                         poly.astype(np.int32)[np.newaxis, :, :], 0.15)
        else:
            text_label = text_strs[poly_idx]
            text_label = self.prepare_text_label(text_label,
                                                 self.Lexicon_Table)
            text_label_index_list = [[self.Lexicon_Table.index(c_)]
                                     for c_ in text_label
                                     if c_ in self.Lexicon_Table]
            if len(text_label_index_list) < 1:
                continue

            tcl_poly = self.poly2tcl(poly, tcl_ratio)
            tcl_quads = self.poly2quads(tcl_poly)
            poly_quads = self.poly2quads(poly)

            # stcl map: shrink the TCL band along its width.
            stcl_quads, quad_index = self.shrink_poly_along_width(
                tcl_quads,
                shrink_ratio_of_width=shrink_ratio_of_width,
                expand_height_ratio=1.0 / tcl_ratio)

            # generate tcl map
            cv2.fillPoly(score_map,
                         np.round(stcl_quads).astype(np.int32), 1.0)
            cv2.fillPoly(score_map_big,
                         np.round(stcl_quads / ds_ratio).astype(np.int32),
                         1.0)

            # generate tbo map: per shrunken quad, fill border offsets.
            for idx, quad in enumerate(stcl_quads):
                quad_mask = np.zeros((h, w), dtype=np.float32)
                quad_mask = cv2.fillPoly(
                    quad_mask,
                    np.round(quad[np.newaxis, :, :]).astype(np.int32), 1.0)
                tbo_map = self.gen_quad_tbo(poly_quads[quad_index[idx]],
                                            quad_mask, tbo_map)

            # score label map and score_label_map_text_label_list for refine
            if label_idx == 0:
                # Index 0 is reserved for "background" with a padding label.
                text_pos_list_ = [[len(self.Lexicon_Table)], ]
                score_label_map_text_label_list.append(text_pos_list_)

            label_idx += 1
            cv2.fillPoly(score_label_map,
                         np.round(poly_quads).astype(np.int32), label_idx)
            score_label_map_text_label_list.append(text_label_index_list)

            # direction info, fix-me
            n_char = len(text_label_index_list)
            direction_map = self.generate_direction_map(
                poly_quads, n_char, direction_map)

            # pos info
            average_shrink_height = self.calculate_average_height(
                stcl_quads)
            pos_l, pos_m = self.fit_and_gather_tcl_points_v2(
                min_area_quad,
                poly,
                max_h=h,
                max_w=w,
                fixed_point_num=64,
                img_id=self.img_id,
                reference_height=average_shrink_height)

            label_l = text_label_index_list
            # NOTE(review): single-character instances are dropped here even
            # though they were already painted into the maps above.
            if len(text_label_index_list) < 2:
                continue

            pos_list.append(pos_l)
            pos_mask.append(pos_m)
            label_list.append(label_l)

    # use big score_map for smooth tcl lines
    score_map_big_resized = cv2.resize(
        score_map_big, dsize=None, fx=ds_ratio, fy=ds_ratio)
    score_map = np.array(score_map_big_resized > 1e-3, dtype='float32')

    return score_map, score_label_map, tbo_map, direction_map, training_mask, \
        pos_list, pos_mask, label_list, score_label_map_text_label_list
def adjust_point(self, poly):
    """Rotate the point order of ``poly`` so it starts at a consistent corner.

    For 4-point polys: if the p1-p2/p3-p0 sides are much longer (1.5x) than
    the p0-p1/p2-p3 sides, shift the start by one. For longer polys: if the
    turn angle at p1 exceeds 70 degrees, shift the start by one.
    """
    n = poly.shape[0]
    if n == 4:
        side = [np.linalg.norm(poly[i] - poly[(i + 1) % 4]) for i in range(4)]
        if (side[0] + side[2]) * 1.5 < (side[1] + side[3]):
            poly = poly[[1, 2, 3, 0], :]
    elif n > 4:
        v01 = poly[0] - poly[1]
        v12 = poly[1] - poly[2]
        denom = np.linalg.norm(v01) * np.linalg.norm(v12) + 1e-6
        # Round before arccos to dodge tiny out-of-domain values.
        theta = np.arccos(np.round(np.dot(v01, v12) / denom, decimals=4))
        if abs(theta) > (70 / 180 * math.pi):
            shifted = list(range(1, n)) + [0]
            poly = poly[np.array(shifted), :]
    return poly
def gen_min_area_quad_from_poly(self, poly):
    """Generate the min-area quad for ``poly`` plus its center point.

    4-point polys are returned unchanged with their centroid; longer polys
    are fitted with cv2.minAreaRect and the box corners are rotated so
    corner 0 best matches the poly's own anchor vertices.
    """
    point_num = poly.shape[0]
    min_area_quad = np.zeros((4, 2), dtype=np.float32)
    if point_num == 4:
        min_area_quad = poly
        center_point = np.sum(poly, axis=0) / 4
    else:
        # (center (x,y), (width, height), angle of rotation)
        rect = cv2.minAreaRect(poly.astype(np.int32))
        center_point = rect[0]
        box = np.array(cv2.boxPoints(rect))

        # Anchor vertices: first point, end of top edge, start of bottom
        # edge, last point.
        anchors = [
            poly[0], poly[point_num // 2 - 1], poly[point_num // 2],
            poly[-1]
        ]
        first_point_idx = 0
        min_dist = 1e4
        for i in range(4):
            dist = sum(
                np.linalg.norm(box[(i + k) % 4] - anchors[k])
                for k in range(4))
            if dist < min_dist:
                min_dist = dist
                first_point_idx = i

        for i in range(4):
            min_area_quad[i] = box[(first_point_idx + i) % 4]

    return min_area_quad, center_point
def shrink_quad_along_width(self,
                            quad,
                            begin_width_ratio=0.,
                            end_width_ratio=1.):
    """Cut a clockwise quad down to the [begin, end] width fraction.

    The two new side points are interpolated along the top edge (p0->p1)
    and the bottom edge (p3->p2); the result keeps clockwise order.
    """
    fractions = np.array(
        [[begin_width_ratio], [end_width_ratio]], dtype=np.float32)
    top_pts = quad[0] + (quad[1] - quad[0]) * fractions
    bottom_pts = quad[3] + (quad[2] - quad[3]) * fractions
    # new top-left, top-right, bottom-right, bottom-left
    return np.array([top_pts[0], top_pts[1], bottom_pts[1], bottom_pts[0]])
def shrink_poly_along_width(self,
                            quads,
                            shrink_ratio_of_width,
                            expand_height_ratio=1.0):
    """
    shrink poly with given length.

    Trims ``shrink_length`` off both ends of a chain of quads, measured
    along the accumulated top-edge length; returns the trimmed quad list
    plus the indices of the original quads that survived.
    """
    upper_edge_list = []

    def get_cut_info(edge_len_list, cut_len):
        # Walk the edges until cut_len is consumed; return the edge index
        # and the fraction of that edge where the cut lands.
        for idx, edge_len in enumerate(edge_len_list):
            cut_len -= edge_len
            if cut_len <= 0.000001:
                ratio = (cut_len + edge_len_list[idx]) / edge_len_list[idx]
                return idx, ratio

    for quad in quads:
        upper_edge_len = np.linalg.norm(quad[0] - quad[1])
        upper_edge_list.append(upper_edge_len)

    # length of left edge and right edge.
    left_length = np.linalg.norm(quads[0][0] - quads[0][
        3]) * expand_height_ratio
    right_length = np.linalg.norm(quads[-1][1] - quads[-1][
        2]) * expand_height_ratio

    # Never trim more than the shorter side edge or the total top length.
    shrink_length = min(left_length, right_length,
                        sum(upper_edge_list)) * shrink_ratio_of_width
    # shrinking length
    upper_len_left = shrink_length
    upper_len_right = sum(upper_edge_list) - shrink_length
    left_idx, left_ratio = get_cut_info(upper_edge_list, upper_len_left)
    left_quad = self.shrink_quad_along_width(
        quads[left_idx], begin_width_ratio=left_ratio, end_width_ratio=1)
    right_idx, right_ratio = get_cut_info(upper_edge_list, upper_len_right)
    right_quad = self.shrink_quad_along_width(
        quads[right_idx], begin_width_ratio=0, end_width_ratio=right_ratio)

    out_quad_list = []
    if left_idx == right_idx:
        # Both cuts fall inside the same quad: merge them into one.
        out_quad_list.append(
            [left_quad[0], right_quad[1], right_quad[2], left_quad[3]])
    else:
        out_quad_list.append(left_quad)
        for idx in range(left_idx + 1, right_idx):
            out_quad_list.append(quads[idx])
        out_quad_list.append(right_quad)

    return np.array(out_quad_list), list(range(left_idx, right_idx + 1))
def prepare_text_label(self, label_str, Lexicon_Table):
    """Normalize a text label for the given lexicon.

    The 36-entry lexicon is the case-insensitive [0-9A-Z] table, so labels
    are upper-cased for it; any other lexicon keeps the label unchanged.
    """
    return label_str.upper() if len(Lexicon_Table) == 36 else label_str
def vector_angle(self, A, B):
    """Angle of vector AB w.r.t. the positive x-axis (arctan2(dy, dx))."""
    dy = B[1] - A[1]
    dx = B[0] - A[0]
    return np.arctan2(dy, dx)
def theta_line_cross_point(self, theta, point):
    """Line through ``point`` at angle ``theta`` as [a, b, c] with ax+by+c=0."""
    x, y = point
    sin_t = np.sin(theta)
    cos_t = np.cos(theta)
    return [sin_t, -cos_t, cos_t * y - sin_t * x]
def line_cross_two_point(self, A, B):
    """Line through points A and B in ax + by + c = 0 form."""
    angle_ab = self.vector_angle(A, B)
    return self.theta_line_cross_point(angle_ab, A)
def average_angle(self, poly):
    """
    Calculate the average angle between left and right edge in given poly.

    The left edge is p3->p0 and the right edge p2->p1 (both bottom-to-top
    for a clockwise quad); their angles are simply averaged.
    """
    p0, p1, p2, p3 = poly
    angle30 = self.vector_angle(p3, p0)
    angle21 = self.vector_angle(p2, p1)
    return (angle30 + angle21) / 2
def line_cross_point(self, line1, line2):
    """Intersection of two lines given as [a, b, c] coefficients of ax+by+c=0.

    Parallel lines have no intersection; a warning is printed and [0, 0] is
    returned as a best-effort fallback.
    """
    a1, b1, c1 = line1
    a2, b2, c2 = line2
    det = a1 * b2 - a2 * b1
    if det == 0:
        print('Cross point does not exist')
        return np.array([0, 0], dtype=np.float32)
    x = (b1 * c2 - b2 * c1) / det
    y = (a2 * c1 - a1 * c2) / det
    return np.array([x, y], dtype=np.float32)
def quad2tcl(self, poly, ratio):
    """Center band of a clockwise quad: the middle ``ratio`` of its height.

    Returns a 4-point clockwise quad whose left/right sides lie on the
    original left/right edges at height fractions 0.5 +/- ratio/2.
    """
    band = np.array(
        [[0.5 - ratio / 2], [0.5 + ratio / 2]], dtype=np.float32)
    left = poly[0] + (poly[3] - poly[0]) * band    # on the left edge
    right = poly[1] + (poly[2] - poly[1]) * band   # on the right edge
    return np.array([left[0], right[0], right[1], left[1]])
def poly2tcl(self, poly, ratio):
    """Center-line polygon: shrink each top/bottom point pair toward
    mid-height, keeping the middle ``ratio`` of the pair's span.

    The input is ordered top points first then bottom points reversed, so
    poly[idx] pairs with poly[n-1-idx].
    """
    band = np.array(
        [[0.5 - ratio / 2], [0.5 + ratio / 2]], dtype=np.float32)
    tcl_poly = np.zeros_like(poly)
    n = poly.shape[0]
    for idx in range(n // 2):
        pair = poly[idx] + (poly[n - 1 - idx] - poly[idx]) * band
        tcl_poly[idx] = pair[0]
        tcl_poly[n - 1 - idx] = pair[1]
    return tcl_poly
def gen_quad_tbo(self, quad, tcl_mask, tbo_map):
    """
    Fill tbo_map (text border offset map) for one quad.

    For every pixel inside tcl_mask, channels 0-3 receive the (dy, dx)
    offsets from the pixel to the quad's upper and lower edges measured
    along the quad's average side direction; channel 4 receives a
    normalization weight based on the quad's short side.
    """
    # upper and lower line function: ax + by + c = 0;
    up_line = self.line_cross_two_point(quad[0], quad[1])
    lower_line = self.line_cross_two_point(quad[3], quad[2])
    # Mean of the two side lengths approximates the quad's height/width.
    quad_h = 0.5 * (np.linalg.norm(quad[0] - quad[3]) +
                    np.linalg.norm(quad[1] - quad[2]))
    quad_w = 0.5 * (np.linalg.norm(quad[0] - quad[1]) +
                    np.linalg.norm(quad[2] - quad[3]))
    # average angle of left and right line.
    angle = self.average_angle(quad)
    # argwhere yields (row, col) == (y, x) coordinates of mask pixels.
    xy_in_poly = np.argwhere(tcl_mask == 1)
    for y, x in xy_in_poly:
        point = (x, y)
        # Line through this pixel along the quad's average side direction.
        line = self.theta_line_cross_point(angle, point)
        cross_point_upper = self.line_cross_point(up_line, line)
        cross_point_lower = self.line_cross_point(lower_line, line)
        ##FIX, offset reverse
        upper_offset_x, upper_offset_y = cross_point_upper - point
        lower_offset_x, lower_offset_y = cross_point_lower - point
        # Channels are stored (y, x) order: 0/1 upper, 2/3 lower offsets.
        tbo_map[y, x, 0] = upper_offset_y
        tbo_map[y, x, 1] = upper_offset_x
        tbo_map[y, x, 2] = lower_offset_y
        tbo_map[y, x, 3] = lower_offset_x
        # Weight inversely proportional to the short side (clamped to >= 1).
        tbo_map[y, x, 4] = 1.0 / max(min(quad_h, quad_w), 1.0) * 2
    return tbo_map
def poly2quads(self, poly):
    """
    Split a clockwise 2N-point poly into N-1 clockwise quads by pairing
    point i on the top edge with point 2N-1-i on the bottom edge; each
    pair of adjacent point pairs forms one quad.
    """
    n_pts = poly.shape[0]
    # Each entry is [top_point, bottom_point] for one column of the poly.
    pairs = [[poly[i], poly[n_pts - 1 - i]] for i in range(n_pts // 2)]
    quads = []
    for i in range(n_pts // 2 - 1):
        # Two neighbouring pairs -> 4 points; reindex to clockwise order.
        quads.append(
            np.array(pairs)[[i, i + 1]].reshape(4, 2)[[0, 2, 3, 1]])
    return np.array(quads)
def rotate_im_poly(self, im, text_polys):
    """
    Randomly rotate the image and its text polys by 90 or 270 degrees.

    NOTE(review): despite the original comment mentioning 180 degrees,
    the rotation count is always 1 or 3, so only 90/270 occur.
    Returns the rotated image and the rotated polys as float32 (N, 4, 2).
    """
    im_w, im_h = im.shape[1], im.shape[0]
    dst_im = im.copy()
    dst_polys = []
    # Coin flip: one np.rot90 (90 deg CCW) or three (270 deg CCW).
    rand_degree_ratio = np.random.rand()
    rand_degree_cnt = 1
    if rand_degree_ratio > 0.5:
        rand_degree_cnt = 3
    for i in range(rand_degree_cnt):
        dst_im = np.rot90(dst_im)
    # Negative degrees: polys are rotated with the matching convention
    # for (x, y) image coordinates.
    rot_degree = -90 * rand_degree_cnt
    rot_angle = rot_degree * math.pi / 180.0
    n_poly = text_polys.shape[0]
    # Rotate each vertex about the old image center, then translate to
    # the new image center (the rotated image has swapped dimensions).
    cx, cy = 0.5 * im_w, 0.5 * im_h
    ncx, ncy = 0.5 * dst_im.shape[1], 0.5 * dst_im.shape[0]
    for i in range(n_poly):
        wordBB = text_polys[i]
        poly = []
        for j in range(4):
            # 16->4
            sx, sy = wordBB[j][0], wordBB[j][1]
            # Standard 2D rotation about (cx, cy), re-centered at (ncx, ncy).
            dx = math.cos(rot_angle) * (sx - cx) - math.sin(rot_angle) * (
                sy - cy) + ncx
            dy = math.sin(rot_angle) * (sx - cx) + math.cos(rot_angle) * (
                sy - cy) + ncy
            poly.append([dx, dy])
        dst_polys.append(poly)
    return dst_im, np.array(dst_polys, dtype=np.float32)
def __call__(self, data):
    """
    Full PGNet training-sample pipeline: validate polys, apply random
    aspect/scale/blur/brightness augmentation, crop a text region, pad
    to 512x512, build TCL/border/direction label maps, pad the position
    and label lists to fixed sizes, and normalize the image.

    Returns the enriched *data* dict, or None to reject the sample
    (no valid polys, too small, all-ignored, or too many text lines).
    """
    input_size = 512
    im = data['image']
    text_polys = data['polys']
    text_tags = data['tags']
    text_strs = data['strs']
    h, w, _ = im.shape
    text_polys, text_tags, hv_tags = self.check_and_validate_polys(
        text_polys, text_tags, (h, w))
    if text_polys.shape[0] <= 0:
        return None
    # set aspect ratio and keep area fix
    asp_scales = np.arange(1.0, 1.55, 0.1)
    asp_scale = np.random.choice(asp_scales)
    if np.random.rand() < 0.5:
        asp_scale = 1.0 / asp_scale
    # sqrt so that fx * fy == 1, i.e. the area is (roughly) preserved.
    asp_scale = math.sqrt(asp_scale)
    asp_wx = asp_scale
    asp_hy = 1.0 / asp_scale
    im = cv2.resize(im, dsize=None, fx=asp_wx, fy=asp_hy)
    text_polys[:, :, 0] *= asp_wx
    text_polys[:, :, 1] *= asp_hy
    h, w, _ = im.shape
    # Cap the longest side at 2048 px before cropping.
    if max(h, w) > 2048:
        rd_scale = 2048.0 / max(h, w)
        im = cv2.resize(im, dsize=None, fx=rd_scale, fy=rd_scale)
        text_polys *= rd_scale
    h, w, _ = im.shape
    if min(h, w) < 16:
        return None
    # no background
    im, text_polys, text_tags, hv_tags, text_strs = self.crop_area(
        im,
        text_polys,
        text_tags,
        hv_tags,
        text_strs,
        crop_background=False)
    if text_polys.shape[0] == 0:
        return None
    # # continue for all ignore case
    if np.sum((text_tags * 1.0)) >= text_tags.size:
        return None
    new_h, new_w, _ = im.shape
    if (new_h is None) or (new_w is None):
        return None
    # resize image
    std_ratio = float(input_size) / max(new_w, new_h)
    # 1.0 is repeated so the identity scale is sampled more often.
    rand_scales = np.array(
        [0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0, 1.0, 1.0, 1.0, 1.0])
    rz_scale = std_ratio * np.random.choice(rand_scales)
    im = cv2.resize(im, dsize=None, fx=rz_scale, fy=rz_scale)
    text_polys[:, :, 0] *= rz_scale
    text_polys[:, :, 1] *= rz_scale
    # add gaussian blur
    if np.random.rand() < 0.1 * 0.5:
        # Random odd kernel size in {1, 3, 5}.
        ks = np.random.permutation(5)[0] + 1
        ks = int(ks / 2) * 2 + 1
        im = cv2.GaussianBlur(im, ksize=(ks, ks), sigmaX=0, sigmaY=0)
    # add brighter
    if np.random.rand() < 0.1 * 0.5:
        im = im * (1.0 + np.random.rand() * 0.5)
        im = np.clip(im, 0.0, 255.0)
    # add darker
    if np.random.rand() < 0.1 * 0.5:
        im = im * (1.0 - np.random.rand() * 0.5)
        im = np.clip(im, 0.0, 255.0)
    # Padding the im to [input_size, input_size]
    new_h, new_w, _ = im.shape
    if min(new_w, new_h) < input_size * 0.5:
        return None
    # Pad with the per-channel ImageNet means (BGR layout: ch 2 = R).
    im_padded = np.ones((input_size, input_size, 3), dtype=np.float32)
    im_padded[:, :, 2] = 0.485 * 255
    im_padded[:, :, 1] = 0.456 * 255
    im_padded[:, :, 0] = 0.406 * 255
    # Random the start position
    del_h = input_size - new_h
    del_w = input_size - new_w
    sh, sw = 0, 0
    if del_h > 1:
        sh = int(np.random.rand() * del_h)
    if del_w > 1:
        sw = int(np.random.rand() * del_w)
    # Padding
    im_padded[sh:sh + new_h, sw:sw + new_w, :] = im.copy()
    text_polys[:, :, 0] += sw
    text_polys[:, :, 1] += sh
    # Build all label maps; 0.25 is the TCL shrink ratio.
    score_map, score_label_map, border_map, direction_map, training_mask, \
        pos_list, pos_mask, label_list, score_label_map_text_label = \
        self.generate_tcl_ctc_label(input_size, input_size, text_polys,
                                    text_tags, text_strs, 0.25)
    if len(label_list) <= 0:  # eliminate negative samples
        return None
    # Padding templates: 36 is the blank/padding index of the charset.
    pos_list_temp = np.zeros([64, 3])
    pos_mask_temp = np.zeros([64, 1])
    label_list_temp = np.zeros([50, 1]) + 36
    # Truncate or pad every text label to exactly 50 characters.
    for i, label in enumerate(label_list):
        n = len(label)
        if n > 50:
            label_list[i] = label[:50]
            continue
        while n < 50:
            label.append([36])
            n += 1
    for i in range(len(label_list)):
        label_list[i] = np.array(label_list[i])
    # At most 30 text instances per image; pad up to 30 with templates.
    if len(pos_list) <= 0 or len(pos_list) > 30:
        return None
    for __ in range(30 - len(pos_list), 0, -1):
        pos_list.append(pos_list_temp)
        pos_mask.append(pos_mask_temp)
        label_list.append(label_list_temp)
    # Round-robin image id within the batch (used as the batch index in
    # pos_list entries downstream).
    if self.img_id == self.batch_size - 1:
        self.img_id = 0
    else:
        self.img_id += 1
    # ImageNet mean/std normalization, per channel (BGR layout).
    im_padded[:, :, 2] -= 0.485 * 255
    im_padded[:, :, 1] -= 0.456 * 255
    im_padded[:, :, 0] -= 0.406 * 255
    im_padded[:, :, 2] /= (255.0 * 0.229)
    im_padded[:, :, 1] /= (255.0 * 0.224)
    im_padded[:, :, 0] /= (255.0 * 0.225)
    # HWC -> CHW, then reverse channels (BGR -> RGB).
    im_padded = im_padded.transpose((2, 0, 1))
    images = im_padded[::-1, :, :]
    tcl_maps = score_map[np.newaxis, :, :]
    tcl_label_maps = score_label_map[np.newaxis, :, :]
    border_maps = border_map.transpose((2, 0, 1))
    direction_maps = direction_map.transpose((2, 0, 1))
    training_masks = training_mask[np.newaxis, :, :]
    pos_list = np.array(pos_list)
    pos_mask = np.array(pos_mask)
    label_list = np.array(label_list)
    data['images'] = images
    data['tcl_maps'] = tcl_maps
    data['tcl_label_maps'] = tcl_label_maps
    data['border_maps'] = border_maps
    data['direction_maps'] = direction_maps
    data['training_masks'] = training_masks
    data['label_list'] = label_list
    data['pos_list'] = pos_list
    data['pos_mask'] = pos_mask
    return data
ppocr/data/pgnet_dataset.py
0 → 100644
View file @
1f76f449
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
numpy
as
np
import
os
from
paddle.io
import
Dataset
from
.imaug
import
transform
,
create_operators
import
random
class PGDateSet(Dataset):
    """
    Dataset for PGNet end-to-end text spotting.

    Supports two on-disk layouts selected by ``data_format``:
    - 'icdar':  each data source is a directory with 'rgb/' images and
      'poly/' per-image txt annotations (polygon coords TAB text).
    - 'textnet': each data source is a label file whose lines hold an
      image stem plus 9 fields per box (8 coords + transcription).
    """

    def __init__(self, config, mode, logger):
        """
        Args:
            config: full config dict; reads config['Global'] plus
                config[mode]['dataset'] / config[mode]['loader'].
            mode: section key, e.g. 'Train' or 'Eval'; shuffling is
                applied only when mode.lower() == 'train'.
            logger: logger used for progress / error reporting.
        """
        super(PGDateSet, self).__init__()
        self.logger = logger
        global_config = config['Global']
        dataset_config = config[mode]['dataset']
        loader_config = config[mode]['loader']
        label_file_list = dataset_config.pop('label_file_list')
        data_source_num = len(label_file_list)
        ratio_list = dataset_config.get("ratio_list", [1.0])
        if isinstance(ratio_list, (float, int)):
            # A scalar ratio applies to every data source.
            ratio_list = [float(ratio_list)] * int(data_source_num)
        self.data_format = dataset_config.get('data_format', 'icdar')
        assert len(
            ratio_list
        ) == data_source_num, "The length of ratio_list should be the same as the file_list."
        # self.data_dir = dataset_config['data_dir']
        self.do_shuffle = loader_config['shuffle']

        logger.info("Initialize indexs of datasets:%s" % label_file_list)
        self.data_lines = self.get_image_info_list(label_file_list,
                                                   ratio_list,
                                                   self.data_format)
        self.data_idx_order_list = list(range(len(self.data_lines)))
        if mode.lower() == "train":
            self.shuffle_data_random()
        self.ops = create_operators(dataset_config['transforms'],
                                    global_config)

    def shuffle_data_random(self):
        """Shuffle the sample list in place when shuffling is enabled."""
        if self.do_shuffle:
            random.shuffle(self.data_lines)
        return

    def extract_polys(self, poly_txt_path):
        """
        Read text_polys, txt_tags, txts from the given txt file.

        Each line is "x1,y1,...,xn,yn<TAB>transcription"; a '###'
        transcription marks the region as ignored (tag True).
        """
        text_polys, txt_tags, txts = [], [], []

        with open(poly_txt_path) as f:
            for line in f.readlines():
                poly_str, txt = line.strip().split('\t')
                poly = map(float, poly_str.split(','))
                text_polys.append(
                    np.array(
                        list(poly), dtype=np.float32).reshape(-1, 2))
                txts.append(txt)
                if txt == '###':
                    txt_tags.append(True)
                else:
                    txt_tags.append(False)

        # NOTE: dtype=bool replaces the deprecated np.bool alias
        # (removed in NumPy 1.24); behavior is identical.
        return np.array(list(map(np.array, text_polys))), \
            np.array(txt_tags, dtype=bool), txts

    def extract_info_textnet(self, im_fn, img_dir=''):
        """
        Extract information from a line in textnet format.

        Returns (img_path, word boxes (N, 4, 2) float32, ignore tags,
        transcriptions); img_path is '' when no image file is found.
        """
        info_list = im_fn.split('\t')
        img_path = ''
        for ext in ['.jpg', '.png', '.jpeg', '.JPG']:
            if os.path.exists(os.path.join(img_dir, info_list[0] + ext)):
                img_path = os.path.join(img_dir, info_list[0] + ext)
                break
        if img_path == '':
            print('Image {0} NOT found in {1}, and it will be ignored.'.
                  format(info_list[0], img_dir))

        # Nine fields per box: 8 polygon coordinates + 1 transcription.
        nBox = (len(info_list) - 1) // 9
        wordBBs, txts, txt_tags = [], [], []
        for n in range(0, nBox):
            wordBB = list(map(float, info_list[n * 9 + 1:(n + 1) * 9]))
            txt = info_list[(n + 1) * 9]
            wordBBs.append([[wordBB[0], wordBB[1]], [wordBB[2], wordBB[3]],
                            [wordBB[4], wordBB[5]], [wordBB[6], wordBB[7]]])
            txts.append(txt)
            if txt == '###':
                txt_tags.append(True)
            else:
                txt_tags.append(False)
        return img_path, np.array(
            wordBBs, dtype=np.float32), txt_tags, txts

    def get_image_info_list(self, file_list, ratio_list,
                            data_format='textnet'):
        """
        Build the (data_source, sample) list for all data sources,
        subsampling each source by its ratio_list entry.
        """
        if isinstance(file_list, str):
            file_list = [file_list]
        data_lines = []
        for idx, data_source in enumerate(file_list):
            image_files = []
            if data_format == 'icdar':
                image_files = [(data_source, x) for x in
                               os.listdir(os.path.join(data_source, 'rgb'))
                               if x.split('.')[-1] in
                               ['jpg', 'png', 'jpeg', 'JPG']]
            elif data_format == 'textnet':
                with open(data_source) as f:
                    image_files = [(data_source, x.strip())
                                   for x in f.readlines()]
            else:
                print("Unrecognized data format...")
                exit(-1)
            # Randomly keep ratio_list[idx] of this source's samples.
            image_files = random.sample(
                image_files, round(len(image_files) * ratio_list[idx]))
            data_lines.extend(image_files)
        return data_lines

    def __getitem__(self, idx):
        """
        Load and transform one sample; on any failure, log the error and
        retry with a uniformly random index instead of raising.
        """
        file_idx = self.data_idx_order_list[idx]
        data_path, data_line = self.data_lines[file_idx]
        try:
            if self.data_format == 'icdar':
                im_path = os.path.join(data_path, 'rgb', data_line)
                poly_path = os.path.join(data_path, 'poly',
                                         data_line.split('.')[0] + '.txt')
                text_polys, text_tags, text_strs = self.extract_polys(
                    poly_path)
            else:
                image_dir = os.path.join(
                    os.path.dirname(data_path), 'image')
                im_path, text_polys, text_tags, text_strs = \
                    self.extract_info_textnet(data_line, image_dir)
            data = {
                'img_path': im_path,
                'polys': text_polys,
                'tags': text_tags,
                'strs': text_strs
            }
            with open(data['img_path'], 'rb') as f:
                img = f.read()
                data['image'] = img
            outs = transform(data, self.ops)
        except Exception as e:
            self.logger.error(
                "When parsing line {}, error happened with msg: {}".format(
                    self.data_idx_order_list[idx], e))
            outs = None
        if outs is None:
            return self.__getitem__(np.random.randint(self.__len__()))
        return outs

    def __len__(self):
        return len(self.data_idx_order_list)
ppocr/losses/__init__.py
View file @
1f76f449
...
@@ -29,10 +29,11 @@ def build_loss(config):
...
@@ -29,10 +29,11 @@ def build_loss(config):
# cls loss
# cls loss
from
.cls_loss
import
ClsLoss
from
.cls_loss
import
ClsLoss
# e2e loss
from
.e2e_pg_loss
import
PGLoss
support_dict
=
[
support_dict
=
[
'DBLoss'
,
'EASTLoss'
,
'SASTLoss'
,
'CTCLoss'
,
'ClsLoss'
,
'AttentionLoss'
,
'DBLoss'
,
'EASTLoss'
,
'SASTLoss'
,
'CTCLoss'
,
'ClsLoss'
,
'AttentionLoss'
,
'SRNLoss'
'SRNLoss'
,
'PGLoss'
]
]
config
=
copy
.
deepcopy
(
config
)
config
=
copy
.
deepcopy
(
config
)
module_name
=
config
.
pop
(
'name'
)
module_name
=
config
.
pop
(
'name'
)
...
...
ppocr/losses/e2e_pg_loss.py
0 → 100644
View file @
1f76f449
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
from
paddle
import
nn
import
paddle
import
numpy
as
np
import
copy
from
.det_basic_loss
import
BalanceLoss
,
MaskL1Loss
,
DiceLoss
class PGLoss(nn.Layer):
    """
    PGNet end-to-end loss: dice loss for the text center-line score map,
    smooth-L1 losses for the border and direction maps, and a CTC loss
    for character recognition along the sampled center-line positions.
    (The previous docstring mentioning "Differentiable Binarization (DB)
    Loss" was a copy-paste leftover.)
    """

    def __init__(self, alpha=5, beta=10, eps=1e-6, **kwargs):
        # NOTE(review): alpha/beta are stored but not read anywhere in
        # this class; the final weighting in forward() is hard-coded.
        super(PGLoss, self).__init__()
        self.alpha = alpha
        self.beta = beta
        self.dice_loss = DiceLoss(eps=eps)

    def org_tcl_rois(self, batch_size, pos_lists, pos_masks, label_lists):
        """
        Re-bucket TCL ROIs per device and force exactly tcl_bs (64)
        entries per bucket by repeating the first entries or randomly
        dropping surplus ones.
        """
        pos_lists_, pos_masks_, label_lists_ = [], [], []
        img_bs = batch_size
        tcl_bs = 64
        # With img_bs == batch_size this is always 1 (single bucket).
        ngpu = int(batch_size / img_bs)
        # pos_lists[i][0, 0] carries the image id of ROI i.
        img_ids = np.array(pos_lists, dtype=np.int32)[:, 0, 0].copy()
        pos_lists_split, pos_masks_split, label_lists_split = [], [], []
        for i in range(ngpu):
            pos_lists_split.append([])
            pos_masks_split.append([])
            label_lists_split.append([])

        for i in range(img_ids.shape[0]):
            img_id = img_ids[i]
            gpu_id = int(img_id / img_bs)
            img_id = img_id % img_bs
            pos_list = pos_lists[i].copy()
            # Rewrite the batch index so it is local to the bucket.
            pos_list[:, 0] = img_id
            pos_lists_split[gpu_id].append(pos_list)
            pos_masks_split[gpu_id].append(pos_masks[i].copy())
            label_lists_split[gpu_id].append(copy.deepcopy(label_lists[i]))
        # repeat or delete
        for i in range(ngpu):
            vp_len = len(pos_lists_split[i])
            if vp_len <= tcl_bs:
                # Pad the bucket up to tcl_bs by repeating early entries.
                for j in range(0, tcl_bs - vp_len):
                    pos_list = pos_lists_split[i][j].copy()
                    pos_lists_split[i].append(pos_list)
                    pos_mask = pos_masks_split[i][j].copy()
                    pos_masks_split[i].append(pos_mask)
                    label_list = copy.deepcopy(label_lists_split[i][j])
                    label_lists_split[i].append(label_list)
            else:
                # Randomly drop surplus entries until tcl_bs remain.
                for j in range(0, vp_len - tcl_bs):
                    c_len = len(pos_lists_split[i])
                    pop_id = np.random.permutation(c_len)[0]
                    pos_lists_split[i].pop(pop_id)
                    pos_masks_split[i].pop(pop_id)
                    label_lists_split[i].pop(pop_id)
        # merge
        for i in range(ngpu):
            pos_lists_.extend(pos_lists_split[i])
            pos_masks_.extend(pos_masks_split[i])
            label_lists_.extend(label_lists_split[i])
        return pos_lists_, pos_masks_, label_lists_

    def pre_process(self, label_list, pos_list, pos_mask):
        """
        Convert the padded per-image ROI tensors into a fixed batch of 64
        TCL ROIs plus the per-ROI CTC label lengths.

        Returns (pos_list, pos_mask, label_list, label) as paddle tensors;
        *label* holds the number of real characters (index != 36) per ROI.
        """
        label_list = label_list.numpy()
        b, h, w, c = label_list.shape
        pos_list = pos_list.numpy()
        pos_mask = pos_mask.numpy()
        pos_list_t = []
        pos_mask_t = []
        label_list_t = []
        for i in range(b):
            # 30 is the fixed per-image ROI padding (see data pipeline).
            for j in range(30):
                if pos_mask[i, j].any():
                    pos_list_t.append(pos_list[i][j])
                    pos_mask_t.append(pos_mask[i][j])
                    label_list_t.append(label_list[i][j])
        pos_list, pos_mask, label_list = self.org_tcl_rois(
            b, pos_list_t, pos_mask_t, label_list_t)
        label = []
        tt = [l.tolist() for l in label_list]
        for i in range(64):
            k = 0
            for j in range(50):
                # 36 is the padding index: count chars up to first pad.
                if tt[i][j][0] != 36:
                    k += 1
                else:
                    break
            label.append(k)
        label = paddle.to_tensor(label)
        label = paddle.cast(label, dtype='int64')
        pos_list = paddle.to_tensor(pos_list)
        pos_mask = paddle.to_tensor(pos_mask)
        label_list = paddle.squeeze(paddle.to_tensor(label_list), axis=2)
        label_list = paddle.cast(label_list, dtype='int32')
        return pos_list, pos_mask, label_list, label

    def border_loss(self, f_border, l_border, l_score, l_mask):
        """
        Smooth-L1 loss on the 4-channel border offset map, weighted by
        the ground-truth norm channel and masked to TCL pixels.
        """
        # Last channel of l_border is a per-pixel normalization weight.
        l_border_split, l_border_norm = paddle.tensor.split(
            l_border, num_or_sections=[4, 1], axis=1)
        f_border_split = f_border
        b, c, h, w = l_border_norm.shape
        l_border_norm_split = paddle.expand(
            x=l_border_norm, shape=[b, 4 * c, h, w])
        b, c, h, w = l_score.shape
        l_border_score = paddle.expand(x=l_score, shape=[b, 4 * c, h, w])
        b, c, h, w = l_mask.shape
        l_border_mask = paddle.expand(x=l_mask, shape=[b, 4 * c, h, w])
        border_diff = l_border_split - f_border_split
        abs_border_diff = paddle.abs(border_diff)
        # Smooth-L1: quadratic below 1.0, linear above.
        border_sign = abs_border_diff < 1.0
        border_sign = paddle.cast(border_sign, dtype='float32')
        border_sign.stop_gradient = True
        border_in_loss = 0.5 * abs_border_diff * abs_border_diff * border_sign + \
            (abs_border_diff - 0.5) * (1.0 - border_sign)
        border_out_loss = l_border_norm_split * border_in_loss
        border_loss = paddle.sum(border_out_loss * l_border_score * l_border_mask) / \
            (paddle.sum(l_border_score * l_border_mask) + 1e-5)
        return border_loss

    def direction_loss(self, f_direction, l_direction, l_score, l_mask):
        """
        Smooth-L1 loss on the 2-channel direction map; same structure as
        border_loss but with 2 offset channels.
        """
        l_direction_split, l_direction_norm = paddle.tensor.split(
            l_direction, num_or_sections=[2, 1], axis=1)
        f_direction_split = f_direction
        b, c, h, w = l_direction_norm.shape
        l_direction_norm_split = paddle.expand(
            x=l_direction_norm, shape=[b, 2 * c, h, w])
        b, c, h, w = l_score.shape
        l_direction_score = paddle.expand(x=l_score, shape=[b, 2 * c, h, w])
        b, c, h, w = l_mask.shape
        l_direction_mask = paddle.expand(x=l_mask, shape=[b, 2 * c, h, w])
        direction_diff = l_direction_split - f_direction_split
        abs_direction_diff = paddle.abs(direction_diff)
        direction_sign = abs_direction_diff < 1.0
        direction_sign = paddle.cast(direction_sign, dtype='float32')
        direction_sign.stop_gradient = True
        direction_in_loss = 0.5 * abs_direction_diff * abs_direction_diff * direction_sign + \
            (abs_direction_diff - 0.5) * (1.0 - direction_sign)
        direction_out_loss = l_direction_norm_split * direction_in_loss
        direction_loss = paddle.sum(direction_out_loss * l_direction_score * l_direction_mask) / \
            (paddle.sum(l_direction_score * l_direction_mask) + 1e-5)
        return direction_loss

    def ctcloss(self, f_char, tcl_pos, tcl_mask, tcl_label, label_t):
        """
        CTC loss over character logits gathered along TCL positions.

        Masked positions are biased strongly towards the blank class
        (+20 on blank, -20 on foreground logits) so they do not affect
        the alignment.
        """
        f_char = paddle.transpose(f_char, [0, 2, 3, 1])
        tcl_pos = paddle.reshape(tcl_pos, [-1, 3])
        tcl_pos = paddle.cast(tcl_pos, dtype=int)
        f_tcl_char = paddle.gather_nd(f_char, tcl_pos)
        f_tcl_char = paddle.reshape(f_tcl_char,
                                    [-1, 64, 37])  # len(Lexicon_Table)+1
        f_tcl_char_fg, f_tcl_char_bg = paddle.split(
            f_tcl_char, [36, 1], axis=2)
        f_tcl_char_bg = f_tcl_char_bg * tcl_mask + (1.0 - tcl_mask) * 20.0
        b, c, l = tcl_mask.shape
        tcl_mask_fg = paddle.expand(x=tcl_mask, shape=[b, c, 36 * l])
        tcl_mask_fg.stop_gradient = True
        f_tcl_char_fg = f_tcl_char_fg * tcl_mask_fg + (1.0 - tcl_mask_fg) * (
            -20.0)
        f_tcl_char_mask = paddle.concat([f_tcl_char_fg, f_tcl_char_bg],
                                        axis=2)
        # (B, T, C) -> (T, B, C) as required by ctc_loss.
        f_tcl_char_ld = paddle.transpose(f_tcl_char_mask, (1, 0, 2))
        N, B, _ = f_tcl_char_ld.shape
        input_lengths = paddle.to_tensor([N] * B, dtype='int64')
        cost = paddle.nn.functional.ctc_loss(
            log_probs=f_tcl_char_ld,
            labels=tcl_label,
            input_lengths=input_lengths,
            label_lengths=label_t,
            blank=36,
            reduction='none')
        cost = cost.mean()
        return cost

    def forward(self, predicts, labels):
        """
        Combine the four sub-losses; CTC is weighted x5.

        Args:
            predicts: (f_score, f_border, f_direction, f_char) tensors.
            labels: 9-tuple produced by the PGNet data pipeline.
        Returns:
            dict with 'loss' and the individual loss terms.
        """
        images, tcl_maps, tcl_label_maps, border_maps, \
            direction_maps, training_masks, label_list, pos_list, pos_mask = labels
        # for all the batch_size
        pos_list, pos_mask, label_list, label_t = self.pre_process(
            label_list, pos_list, pos_mask)

        f_score, f_boder, f_direction, f_char = predicts
        score_loss = self.dice_loss(f_score, tcl_maps, training_masks)
        border_loss = self.border_loss(f_boder, border_maps, tcl_maps,
                                       training_masks)
        direction_loss = self.direction_loss(f_direction, direction_maps,
                                             tcl_maps, training_masks)
        ctc_loss = self.ctcloss(f_char, pos_list, pos_mask, label_list,
                                label_t)
        loss_all = score_loss + border_loss + direction_loss + 5 * ctc_loss

        losses = {
            'loss': loss_all,
            "score_loss": score_loss,
            "border_loss": border_loss,
            "direction_loss": direction_loss,
            "ctc_loss": ctc_loss
        }
        return losses
ppocr/metrics/__init__.py
View file @
1f76f449
...
@@ -26,8 +26,9 @@ def build_metric(config):
...
@@ -26,8 +26,9 @@ def build_metric(config):
from
.det_metric
import
DetMetric
from
.det_metric
import
DetMetric
from
.rec_metric
import
RecMetric
from
.rec_metric
import
RecMetric
from
.cls_metric
import
ClsMetric
from
.cls_metric
import
ClsMetric
from
.e2e_metric
import
E2EMetric
support_dict
=
[
'DetMetric'
,
'RecMetric'
,
'ClsMetric'
]
support_dict
=
[
'DetMetric'
,
'RecMetric'
,
'ClsMetric'
,
'E2EMetric'
]
config
=
copy
.
deepcopy
(
config
)
config
=
copy
.
deepcopy
(
config
)
module_name
=
config
.
pop
(
'name'
)
module_name
=
config
.
pop
(
'name'
)
...
...
ppocr/metrics/e2e_metric.py
0 → 100644
View file @
1f76f449
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
__all__
=
[
'E2EMetric'
]
from
ppocr.utils.e2e_metric.Deteval
import
*
class E2EMetric(object):
    """
    End-to-end text spotting metric; accumulates per-image scores from
    ppocr.utils.e2e_metric.Deteval and combines them in get_metric().
    """

    def __init__(self, main_indicator='f_score_e2e', **kwargs):
        # Character set: digits + uppercase letters; index 36 is padding.
        self.label_list = [
            '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C',
            'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
            'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
        ]
        self.main_indicator = main_indicator
        self.reset()

    def __call__(self, preds, batch, **kwargs):
        '''
        Accumulate one batch.

        batch: a list produced by dataloaders; batch[2] holds gt polygons,
        batch[3] the encoded gt transcriptions (char indices, 36 = pad),
        batch[4] the ignore tags.
        preds: post-processed predictions carrying 'points' and 'strs'.
        '''
        gt_polyons_batch = batch[2]
        temp_gt_strs_batch = batch[3]
        ignore_tags_batch = batch[4]
        gt_strs_batch = []

        # Decode only the first image's labels; indices >= 36 are padding.
        # NOTE(review): only batch element 0 is decoded — presumably this
        # metric is run with batch size 1; verify against the eval loop.
        temp_gt_strs_batch = temp_gt_strs_batch[0].tolist()
        for temp_list in temp_gt_strs_batch:
            t = ""
            for index in temp_list:
                if index < 36:
                    t += self.label_list[index]
            gt_strs_batch.append(t)

        for pred, gt_polyons, gt_strs, ignore_tags in zip(
                preds, gt_polyons_batch, gt_strs_batch, ignore_tags_batch):
            # prepare gt
            # NOTE(review): gt_strs is a single decoded string here, so the
            # zip below pairs polygons with individual characters; and the
            # det branch mixes `pred['points']` with `preds['strs']` —
            # these look inconsistent and should be confirmed/fixed
            # against the caller's actual preds/batch structure.
            gt_info_list = [{
                'points': gt_polyon,
                'text': gt_str,
                'ignore': ignore_tag
            } for gt_polyon, gt_str, ignore_tag in
                            zip(gt_polyons, gt_strs, ignore_tags)]
            # prepare det
            e2e_info_list = [{
                'points': det_polyon,
                'text': pred_str
            } for det_polyon, pred_str in
                             zip(pred['points'], preds['strs'])]
            result = get_socre(gt_info_list, e2e_info_list)
            self.results.append(result)

    def get_metric(self):
        """
        return metrics {
                 'precision': 0,
                 'recall': 0,
                 'hmean': 0
            }
        """
        metircs = combine_results(self.results)
        self.reset()
        return metircs

    def reset(self):
        self.results = []  # clear results
ppocr/metrics/eval_det_iou.py
View file @
1f76f449
...
@@ -200,7 +200,8 @@ class DetectionIoUEvaluator(object):
...
@@ -200,7 +200,8 @@ class DetectionIoUEvaluator(object):
methodPrecision
=
0
if
numGlobalCareDet
==
0
else
float
(
methodPrecision
=
0
if
numGlobalCareDet
==
0
else
float
(
matchedSum
)
/
numGlobalCareDet
matchedSum
)
/
numGlobalCareDet
methodHmean
=
0
if
methodRecall
+
methodPrecision
==
0
else
2
*
\
methodHmean
=
0
if
methodRecall
+
methodPrecision
==
0
else
2
*
\
methodRecall
*
methodPrecision
/
(
methodRecall
+
methodPrecision
)
methodRecall
*
methodPrecision
/
(
methodRecall
+
methodPrecision
)
# print(methodRecall, methodPrecision, methodHmean)
# print(methodRecall, methodPrecision, methodHmean)
# sys.exit(-1)
# sys.exit(-1)
methodMetrics
=
{
methodMetrics
=
{
...
...
ppocr/modeling/backbones/__init__.py
View file @
1f76f449
...
@@ -26,6 +26,9 @@ def build_backbone(config, model_type):
...
@@ -26,6 +26,9 @@ def build_backbone(config, model_type):
from
.rec_resnet_vd
import
ResNet
from
.rec_resnet_vd
import
ResNet
from
.rec_resnet_fpn
import
ResNetFPN
from
.rec_resnet_fpn
import
ResNetFPN
support_dict
=
[
'MobileNetV3'
,
'ResNet'
,
'ResNetFPN'
]
support_dict
=
[
'MobileNetV3'
,
'ResNet'
,
'ResNetFPN'
]
elif
model_type
==
'e2e'
:
from
.e2e_resnet_vd_pg
import
ResNet
support_dict
=
[
'ResNet'
]
else
:
else
:
raise
NotImplementedError
raise
NotImplementedError
...
...
ppocr/modeling/backbones/e2e_resnet_vd_pg.py
0 → 100644
View file @
1f76f449
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
paddle
from
paddle
import
ParamAttr
import
paddle.nn
as
nn
import
paddle.nn.functional
as
F
__all__
=
[
"ResNet"
]
class ConvBNLayer(nn.Layer):
    """
    Conv2D followed by BatchNorm with optional activation.

    Parameter/BN names are derived from *name* so pretrained weights map
    correctly — do not change the naming scheme.
    NOTE(review): the vd-mode average pooling is commented out in
    forward(), so is_vd_mode currently has no effect at runtime.
    """

    def __init__(
            self,
            in_channels,
            out_channels,
            kernel_size,
            stride=1,
            groups=1,
            is_vd_mode=False,
            act=None,
            name=None, ):
        super(ConvBNLayer, self).__init__()

        self.is_vd_mode = is_vd_mode
        # Constructed but unused while the pooling in forward() stays
        # commented out.
        self._pool2d_avg = nn.AvgPool2D(
            kernel_size=2, stride=2, padding=0, ceil_mode=True)
        self._conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=(kernel_size - 1) // 2,  # 'same' padding for odd kernels
            groups=groups,
            weight_attr=ParamAttr(name=name + "_weights"),
            bias_attr=False)
        # Legacy weight-name convention: "conv1" -> "bn_conv1",
        # "res..." -> "bn" + suffix after the first 3 chars.
        if name == "conv1":
            bn_name = "bn_" + name
        else:
            bn_name = "bn" + name[3:]
        self._batch_norm = nn.BatchNorm(
            out_channels,
            act=act,
            param_attr=ParamAttr(name=bn_name + '_scale'),
            bias_attr=ParamAttr(bn_name + '_offset'),
            moving_mean_name=bn_name + '_mean',
            moving_variance_name=bn_name + '_variance')

    def forward(self, inputs):
        # if self.is_vd_mode:
        #     inputs = self._pool2d_avg(inputs)
        y = self._conv(inputs)
        y = self._batch_norm(y)
        return y
class BottleneckBlock(nn.Layer):
    """
    Standard 1x1 -> 3x3 -> 1x1 residual bottleneck (ResNet-50+).

    The output has out_channels * 4 channels; when *shortcut* is False a
    1x1 projection aligns the identity branch.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 shortcut=True,
                 if_first=False,
                 name=None):
        super(BottleneckBlock, self).__init__()
        # 1x1 reduce.
        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            act='relu',
            name=name + "_branch2a")
        # 3x3 spatial conv carries the stride.
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2b")
        # 1x1 expand (x4), no activation before the residual add.
        self.conv2 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels * 4,
            kernel_size=1,
            act=None,
            name=name + "_branch2c")

        if not shortcut:
            # Projection shortcut to match channels/stride.
            self.short = ConvBNLayer(
                in_channels=in_channels,
                out_channels=out_channels * 4,
                kernel_size=1,
                stride=stride,
                is_vd_mode=not if_first,
                name=name + "_branch1")
        self.shortcut = shortcut

    def forward(self, inputs):
        residual = self.conv2(self.conv1(self.conv0(inputs)))
        identity = inputs if self.shortcut else self.short(inputs)
        return F.relu(paddle.add(x=identity, y=residual))
class BasicBlock(nn.Layer):
    """
    Two 3x3 convolutions with a residual connection (ResNet-18/34).

    When *shortcut* is False a 1x1 projection aligns the identity branch.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 shortcut=True,
                 if_first=False,
                 name=None):
        super(BasicBlock, self).__init__()
        self.stride = stride
        # First 3x3 carries the stride.
        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2a")
        # Second 3x3, no activation before the residual add.
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            act=None,
            name=name + "_branch2b")

        if not shortcut:
            self.short = ConvBNLayer(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
                stride=1,
                is_vd_mode=not if_first,
                name=name + "_branch1")
        self.shortcut = shortcut

    def forward(self, inputs):
        residual = self.conv1(self.conv0(inputs))
        identity = inputs if self.shortcut else self.short(inputs)
        return F.relu(paddle.add(x=identity, y=residual))
class ResNet(nn.Layer):
    """
    ResNet-vd backbone for PGNet; forward() returns the input plus every
    stage's feature map (used by the FPN neck).

    NOTE(review): for PGNet this variant uses FIVE stages for layers=50
    (depth [3, 4, 6, 3, 3]) instead of the standard four — presumably
    intentional for the deeper FPN; confirm against the PGNet config.
    NOTE(review): layers=34 also selects the 5-entry depth but takes the
    BasicBlock branch whose num_channels has only 4 entries, which would
    index out of range — only layers=50 appears to be exercised.
    """

    def __init__(self, in_channels=3, layers=50, **kwargs):
        super(ResNet, self).__init__()
        self.layers = layers
        supported_layers = [18, 34, 50, 101, 152, 200]
        assert layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(
                supported_layers, layers)
        if layers == 18:
            depth = [2, 2, 2, 2]
        elif layers == 34 or layers == 50:
            # depth = [3, 4, 6, 3]
            depth = [3, 4, 6, 3, 3]
        elif layers == 101:
            depth = [3, 4, 23, 3]
        elif layers == 152:
            depth = [3, 8, 36, 3]
        elif layers == 200:
            depth = [3, 12, 48, 3]
        # Per-stage input channels (first block of each stage).
        num_channels = [64, 256, 512, 1024,
                        2048] if layers >= 50 else [64, 64, 128, 256]
        num_filters = [64, 128, 256, 512, 512]

        self.conv1_1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=64,
            kernel_size=7,
            stride=2,
            act='relu',
            name="conv1_1")
        self.pool2d_max = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)
        self.stages = []
        # out_channels starts with the raw input (3) and stem (64)
        # because forward() also returns those tensors.
        self.out_channels = [3, 64]
        # num_filters = [64, 128, 256, 512, 512]
        if layers >= 50:
            for block in range(len(depth)):
                block_list = []
                shortcut = False
                for i in range(depth[block]):
                    # Legacy naming scheme kept for pretrained weights:
                    # deep nets name stage-3 blocks "res4b<i>".
                    if layers in [101, 152] and block == 2:
                        if i == 0:
                            conv_name = "res" + str(block + 2) + "a"
                        else:
                            conv_name = "res" + str(block + 2) + "b" + str(i)
                    else:
                        conv_name = "res" + str(block + 2) + chr(97 + i)
                    bottleneck_block = self.add_sublayer(
                        'bb_%d_%d' % (block, i),
                        BottleneckBlock(
                            in_channels=num_channels[block]
                            if i == 0 else num_filters[block] * 4,
                            out_channels=num_filters[block],
                            # First block of each stage (except stage 0)
                            # downsamples.
                            stride=2 if i == 0 and block != 0 else 1,
                            shortcut=shortcut,
                            if_first=block == i == 0,
                            name=conv_name))
                    shortcut = True
                    block_list.append(bottleneck_block)
                self.out_channels.append(num_filters[block] * 4)
                self.stages.append(nn.Sequential(*block_list))
        else:
            for block in range(len(depth)):
                block_list = []
                shortcut = False
                for i in range(depth[block]):
                    conv_name = "res" + str(block + 2) + chr(97 + i)
                    basic_block = self.add_sublayer(
                        'bb_%d_%d' % (block, i),
                        BasicBlock(
                            in_channels=num_channels[block]
                            if i == 0 else num_filters[block],
                            out_channels=num_filters[block],
                            stride=2 if i == 0 and block != 0 else 1,
                            shortcut=shortcut,
                            if_first=block == i == 0,
                            name=conv_name))
                    shortcut = True
                    block_list.append(basic_block)
                self.out_channels.append(num_filters[block])
                self.stages.append(nn.Sequential(*block_list))

    def forward(self, inputs):
        # Collect the input, the stem output, and every stage output so
        # the neck can pick features at multiple resolutions.
        out = [inputs]
        y = self.conv1_1(inputs)
        out.append(y)
        y = self.pool2d_max(y)
        for block in self.stages:
            y = block(y)
            out.append(y)
        return out
ppocr/modeling/heads/__init__.py
View file @
1f76f449
...
@@ -20,6 +20,7 @@ def build_head(config):
...
@@ -20,6 +20,7 @@ def build_head(config):
from
.det_db_head
import
DBHead
from
.det_db_head
import
DBHead
from
.det_east_head
import
EASTHead
from
.det_east_head
import
EASTHead
from
.det_sast_head
import
SASTHead
from
.det_sast_head
import
SASTHead
from
.e2e_pg_head
import
PGHead
# rec head
# rec head
from
.rec_ctc_head
import
CTCHead
from
.rec_ctc_head
import
CTCHead
...
@@ -30,8 +31,8 @@ def build_head(config):
...
@@ -30,8 +31,8 @@ def build_head(config):
from
.cls_head
import
ClsHead
from
.cls_head
import
ClsHead
support_dict
=
[
support_dict
=
[
'DBHead'
,
'EASTHead'
,
'SASTHead'
,
'CTCHead'
,
'ClsHead'
,
'AttentionHead'
,
'DBHead'
,
'EASTHead'
,
'SASTHead'
,
'CTCHead'
,
'ClsHead'
,
'AttentionHead'
,
'SRNHead'
'SRNHead'
,
'PGHead'
]
]
module_name
=
config
.
pop
(
'name'
)
module_name
=
config
.
pop
(
'name'
)
assert
module_name
in
support_dict
,
Exception
(
'head only support {}'
.
format
(
assert
module_name
in
support_dict
,
Exception
(
'head only support {}'
.
format
(
...
...
ppocr/modeling/heads/e2e_pg_head.py
0 → 100644
View file @
1f76f449
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
math
import
paddle
from
paddle
import
nn
import
paddle.nn.functional
as
F
from
paddle
import
ParamAttr
class ConvBNLayer(nn.Layer):
    """Conv2D followed by BatchNorm.

    The activation (if any) is applied *inside* the BatchNorm layer via its
    ``act`` argument; ``forward`` itself just chains conv -> bn.
    NOTE(review): ``if_act`` is stored but never read in ``forward`` —
    presumably kept for interface parity with other layers; confirm before
    removing.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 groups=1,
                 if_act=True,
                 act=None,
                 name=None):
        super(ConvBNLayer, self).__init__()
        self.if_act = if_act
        self.act = act
        # Bias-free conv: the following BatchNorm's offset makes a conv
        # bias redundant.
        self.conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            weight_attr=ParamAttr(name=name + '_weights'),
            bias_attr=False)
        # Parameter names ("bn_<name>_scale" etc.) must stay stable: they
        # are the keys used when loading pretrained checkpoints.
        self.bn = nn.BatchNorm(
            num_channels=out_channels,
            act=act,
            param_attr=ParamAttr(name="bn_" + name + "_scale"),
            bias_attr=ParamAttr(name="bn_" + name + "_offset"),
            moving_mean_name="bn_" + name + "_mean",
            moving_variance_name="bn_" + name + "_variance",
            use_global_stats=False)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)  # activation (if configured) happens here
        return x
class PGHead(nn.Layer):
    """Multi-branch prediction head with four parallel conv towers.

    Given a fused FPN feature map, produces (per the branch names):
      * f_score     — 1 channel, sigmoid-activated text-center score map,
      * f_boder     — 4 channels, border/offset regression map
                      ("boder" spelling kept: it is baked into parameter
                      names and checkpoints),
      * f_direction — 2 channels, direction field,
      * f_char      — 6625 channels, per-pixel character classification
                      logits (6625 is presumably the character-set size —
                      TODO confirm against the dictionary used).
    """

    def __init__(self, in_channels, model_name, **kwargs):
        super(PGHead, self).__init__()
        # model_name is stored but not used in this head's forward pass.
        self.model_name = model_name

        # --- score branch: in -> 64 -> 64 -> 128 -> 1 ---
        self.conv_f_score1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=64,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_score{}".format(1))
        self.conv_f_score2 = ConvBNLayer(
            in_channels=64,
            out_channels=64,
            kernel_size=3,
            stride=1,
            padding=1,
            act='relu',
            name="conv_f_score{}".format(2))
        self.conv_f_score3 = ConvBNLayer(
            in_channels=64,
            out_channels=128,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_score{}".format(3))
        self.conv1 = nn.Conv2D(
            in_channels=128,
            out_channels=1,
            kernel_size=3,
            stride=1,
            padding=1,
            groups=1,
            weight_attr=ParamAttr(name="conv_f_score{}".format(4)),
            bias_attr=False)

        # --- border branch: in -> 64 -> 64 -> 128 -> 4 ---
        self.conv_f_boder1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=64,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_boder{}".format(1))
        self.conv_f_boder2 = ConvBNLayer(
            in_channels=64,
            out_channels=64,
            kernel_size=3,
            stride=1,
            padding=1,
            act='relu',
            name="conv_f_boder{}".format(2))
        self.conv_f_boder3 = ConvBNLayer(
            in_channels=64,
            out_channels=128,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_boder{}".format(3))
        self.conv2 = nn.Conv2D(
            in_channels=128,
            out_channels=4,
            kernel_size=3,
            stride=1,
            padding=1,
            groups=1,
            weight_attr=ParamAttr(name="conv_f_boder{}".format(4)),
            bias_attr=False)

        # --- character branch: in -> 128 -> 128 -> 256 -> 256 -> 256 -> 6625 ---
        self.conv_f_char1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=128,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_char{}".format(1))
        self.conv_f_char2 = ConvBNLayer(
            in_channels=128,
            out_channels=128,
            kernel_size=3,
            stride=1,
            padding=1,
            act='relu',
            name="conv_f_char{}".format(2))
        self.conv_f_char3 = ConvBNLayer(
            in_channels=128,
            out_channels=256,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_char{}".format(3))
        self.conv_f_char4 = ConvBNLayer(
            in_channels=256,
            out_channels=256,
            kernel_size=3,
            stride=1,
            padding=1,
            act='relu',
            name="conv_f_char{}".format(4))
        self.conv_f_char5 = ConvBNLayer(
            in_channels=256,
            out_channels=256,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_char{}".format(5))
        self.conv3 = nn.Conv2D(
            in_channels=256,
            out_channels=6625,
            kernel_size=3,
            stride=1,
            padding=1,
            groups=1,
            weight_attr=ParamAttr(name="conv_f_char{}".format(6)),
            bias_attr=False)

        # --- direction branch: in -> 64 -> 64 -> 128 -> 2 ---
        self.conv_f_direc1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=64,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_direc{}".format(1))
        self.conv_f_direc2 = ConvBNLayer(
            in_channels=64,
            out_channels=64,
            kernel_size=3,
            stride=1,
            padding=1,
            act='relu',
            name="conv_f_direc{}".format(2))
        self.conv_f_direc3 = ConvBNLayer(
            in_channels=64,
            out_channels=128,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_direc{}".format(3))
        self.conv4 = nn.Conv2D(
            in_channels=128,
            out_channels=2,
            kernel_size=3,
            stride=1,
            padding=1,
            groups=1,
            weight_attr=ParamAttr(name="conv_f_direc{}".format(4)),
            bias_attr=False)

    def forward(self, x):
        # score branch — only output passed through a sigmoid here
        f_score = self.conv_f_score1(x)
        f_score = self.conv_f_score2(f_score)
        f_score = self.conv_f_score3(f_score)
        f_score = self.conv1(f_score)
        f_score = F.sigmoid(f_score)
        # border branch (raw logits/regression values, no activation)
        f_boder = self.conv_f_boder1(x)
        f_boder = self.conv_f_boder2(f_boder)
        f_boder = self.conv_f_boder3(f_boder)
        f_boder = self.conv2(f_boder)
        # character-classification branch
        f_char = self.conv_f_char1(x)
        f_char = self.conv_f_char2(f_char)
        f_char = self.conv_f_char3(f_char)
        f_char = self.conv_f_char4(f_char)
        f_char = self.conv_f_char5(f_char)
        f_char = self.conv3(f_char)
        # direction branch
        f_direction = self.conv_f_direc1(x)
        f_direction = self.conv_f_direc2(f_direction)
        f_direction = self.conv_f_direc3(f_direction)
        f_direction = self.conv4(f_direction)
        return f_score, f_boder, f_direction, f_char
ppocr/modeling/necks/__init__.py
View file @
1f76f449
...
@@ -14,12 +14,14 @@
...
@@ -14,12 +14,14 @@
__all__
=
[
'build_neck'
]
__all__
=
[
'build_neck'
]
def
build_neck
(
config
):
def
build_neck
(
config
):
from
.db_fpn
import
DBFPN
from
.db_fpn
import
DBFPN
from
.east_fpn
import
EASTFPN
from
.east_fpn
import
EASTFPN
from
.sast_fpn
import
SASTFPN
from
.sast_fpn
import
SASTFPN
from
.rnn
import
SequenceEncoder
from
.rnn
import
SequenceEncoder
support_dict
=
[
'DBFPN'
,
'EASTFPN'
,
'SASTFPN'
,
'SequenceEncoder'
]
from
.pg_fpn
import
PGFPN
support_dict
=
[
'DBFPN'
,
'EASTFPN'
,
'SASTFPN'
,
'SequenceEncoder'
,
'PGFPN'
]
module_name
=
config
.
pop
(
'name'
)
module_name
=
config
.
pop
(
'name'
)
assert
module_name
in
support_dict
,
Exception
(
'neck only support {}'
.
format
(
assert
module_name
in
support_dict
,
Exception
(
'neck only support {}'
.
format
(
...
...
ppocr/modeling/necks/pg_fpn.py
0 → 100644
View file @
1f76f449
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
paddle
from
paddle
import
nn
import
paddle.nn.functional
as
F
from
paddle
import
ParamAttr
class ConvBNLayer(nn.Layer):
    """Conv2D + BatchNorm with "same" padding ((kernel_size - 1) // 2).

    NOTE(review): the ``is_vd_mode`` average-pool path is currently
    disabled (commented out in ``forward``), so ``is_vd_mode`` and
    ``_pool2d_avg`` are dead weight — confirm whether vd-mode downsampling
    is meant to return before deleting them.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 groups=1,
                 is_vd_mode=False,
                 act=None,
                 name=None):
        super(ConvBNLayer, self).__init__()
        self.is_vd_mode = is_vd_mode
        self._pool2d_avg = nn.AvgPool2D(
            kernel_size=2, stride=2, padding=0, ceil_mode=True)
        # Bias-free conv: the BatchNorm offset takes its place.
        self._conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=(kernel_size - 1) // 2,
            groups=groups,
            weight_attr=ParamAttr(name=name + "_weights"),
            bias_attr=False)
        # BN parameter naming follows the pretrained-ResNet convention:
        # "bn_conv1" for the stem, otherwise "bn" + <name minus "res"/first
        # 3 chars> — these names are checkpoint keys and must not change.
        if name == "conv1":
            bn_name = "bn_" + name
        else:
            bn_name = "bn" + name[3:]
        self._batch_norm = nn.BatchNorm(
            out_channels,
            act=act,
            param_attr=ParamAttr(name=bn_name + '_scale'),
            bias_attr=ParamAttr(bn_name + '_offset'),
            moving_mean_name=bn_name + '_mean',
            moving_variance_name=bn_name + '_variance',
            use_global_stats=False)

    def forward(self, inputs):
        # vd-mode pre-pooling deliberately disabled:
        # if self.is_vd_mode:
        #     inputs = self._pool2d_avg(inputs)
        y = self._conv(inputs)
        y = self._batch_norm(y)  # activation (if configured) applied here
        return y
class DeConvBNLayer(nn.Layer):
    """Transposed conv (2x upsampling by default) + BatchNorm.

    With the default kernel_size=4 / stride=2 / padding=1 the spatial
    size exactly doubles. The activation (if any) is applied inside the
    BatchNorm layer. NOTE(review): ``if_act`` is stored but never used in
    ``forward`` — kept, presumably, for interface symmetry; confirm.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=4,
                 stride=2,
                 padding=1,
                 groups=1,
                 if_act=True,
                 act=None,
                 name=None):
        super(DeConvBNLayer, self).__init__()
        self.if_act = if_act
        self.act = act
        # Bias-free: the following BatchNorm's offset makes a bias redundant.
        self.deconv = nn.Conv2DTranspose(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            weight_attr=ParamAttr(name=name + '_weights'),
            bias_attr=False)
        # Parameter names are checkpoint keys — keep them stable.
        self.bn = nn.BatchNorm(
            num_channels=out_channels,
            act=act,
            param_attr=ParamAttr(name="bn_" + name + "_scale"),
            bias_attr=ParamAttr(name="bn_" + name + "_offset"),
            moving_mean_name="bn_" + name + "_mean",
            moving_variance_name="bn_" + name + "_variance",
            use_global_stats=False)

    def forward(self, x):
        x = self.deconv(x)
        x = self.bn(x)  # activation (if configured) happens here
        return x
class FPN_Up_Fusion(nn.Layer):
    """Top-down FPN branch.

    Takes the three-to-seven deepest backbone maps (``x[2:]``), reduces
    each with a 1x1 lateral conv, then repeatedly upsamples (deconv) and
    add+relu-fuses from coarse to fine, ending in a 128-channel map.

    Channel schedule per stage: 256 -> 256 -> 192 -> 192 -> 128.
    """

    def __init__(self, in_channels):
        super(FPN_Up_Fusion, self).__init__()
        # Process deepest feature first, so reverse the channel list.
        in_channels = in_channels[::-1]
        out_channels = [256, 256, 192, 192, 128]

        # 1x1 lateral convs aligning each backbone map to its stage width.
        self.h0_conv = ConvBNLayer(
            in_channels[0], out_channels[0], 1, 1, act=None, name='conv_h0')
        self.h1_conv = ConvBNLayer(
            in_channels[1], out_channels[1], 1, 1, act=None, name='conv_h1')
        self.h2_conv = ConvBNLayer(
            in_channels[2], out_channels[2], 1, 1, act=None, name='conv_h2')
        self.h3_conv = ConvBNLayer(
            in_channels[3], out_channels[3], 1, 1, act=None, name='conv_h3')
        self.h4_conv = ConvBNLayer(
            in_channels[4], out_channels[4], 1, 1, act=None, name='conv_h4')

        # 2x upsampling deconvs between consecutive stages.
        self.dconv0 = DeConvBNLayer(
            in_channels=out_channels[0],
            out_channels=out_channels[1],
            name="dconv_{}".format(0))
        self.dconv1 = DeConvBNLayer(
            in_channels=out_channels[1],
            out_channels=out_channels[2],
            act=None,
            name="dconv_{}".format(1))
        self.dconv2 = DeConvBNLayer(
            in_channels=out_channels[2],
            out_channels=out_channels[3],
            act=None,
            name="dconv_{}".format(2))
        self.dconv3 = DeConvBNLayer(
            in_channels=out_channels[3],
            out_channels=out_channels[4],
            act=None,
            name="dconv_{}".format(3))

        # 3x3 smoothing convs applied after each add+relu fusion.
        self.conv_g1 = ConvBNLayer(
            in_channels=out_channels[1],
            out_channels=out_channels[1],
            kernel_size=3,
            stride=1,
            act='relu',
            name="conv_g{}".format(1))
        self.conv_g2 = ConvBNLayer(
            in_channels=out_channels[2],
            out_channels=out_channels[2],
            kernel_size=3,
            stride=1,
            act='relu',
            name="conv_g{}".format(2))
        self.conv_g3 = ConvBNLayer(
            in_channels=out_channels[3],
            out_channels=out_channels[3],
            kernel_size=3,
            stride=1,
            act='relu',
            name="conv_g{}".format(3))
        self.conv_g4 = ConvBNLayer(
            in_channels=out_channels[4],
            out_channels=out_channels[4],
            kernel_size=3,
            stride=1,
            act='relu',
            name="conv_g{}".format(4))
        # Final 1x1 projection at the finest stage.
        self.convf = ConvBNLayer(
            in_channels=out_channels[4],
            out_channels=out_channels[4],
            kernel_size=1,
            stride=1,
            act=None,
            name="conv_f{}".format(4))

    def _add_relu(self, x1, x2):
        """Element-wise add followed by ReLU (the FPN fusion op)."""
        x = paddle.add(x=x1, y=x2)
        x = F.relu(x)
        return x

    def forward(self, x):
        # Deepest map first.
        f = x[2:][::-1]
        h0 = self.h0_conv(f[0])
        h1 = self.h1_conv(f[1])
        h2 = self.h2_conv(f[2])
        h3 = self.h3_conv(f[3])
        h4 = self.h4_conv(f[4])
        g0 = self.dconv0(h0)
        # BUGFIX: the original routed every stage through conv_g2/dconv2
        # (a channel mismatch: conv_g2 expects 192 channels but g0/h1 have
        # 256) and ended with self.dconv4, which is never defined
        # (AttributeError), leaving conv_g1/conv_g3/conv_g4/convf unused.
        # Each stage now uses its own conv/deconv pair, matching the
        # declared channel schedule 256 -> 192 -> 192 -> 128.
        g1 = self.dconv1(self.conv_g1(self._add_relu(g0, h1)))
        g2 = self.dconv2(self.conv_g2(self._add_relu(g1, h2)))
        g3 = self.dconv3(self.conv_g3(self._add_relu(g2, h3)))
        g4 = self.convf(self.conv_g4(self._add_relu(g3, h4)))
        return g4
class FPN_Down_Fusion(nn.Layer):
    """Bottom-up FPN branch.

    Takes the three shallowest backbone maps (``x[:3]``), reduces each
    with a 3x3 conv, then repeatedly downsamples (stride-2 conv) and
    add+relu-fuses, ending in a 128-channel map (32 -> 64 -> 128).
    """

    def __init__(self, in_channels):
        super(FPN_Down_Fusion, self).__init__()
        out_channels = [32, 64, 128]
        # 3x3 lateral convs for the three shallow inputs.
        self.h0_conv = ConvBNLayer(
            in_channels[0], out_channels[0], 3, 1, act=None, name='FPN_d1')
        self.h1_conv = ConvBNLayer(
            in_channels[1], out_channels[1], 3, 1, act=None, name='FPN_d2')
        self.h2_conv = ConvBNLayer(
            in_channels[2], out_channels[2], 3, 1, act=None, name='FPN_d3')
        # Stride-2 conv halves the spatial size while doubling channels.
        self.g0_conv = ConvBNLayer(
            out_channels[0], out_channels[1], 3, 2, act=None, name='FPN_d4')
        # smooth (3x3, relu) then downsample (3x3, stride 2)
        self.g1_conv = nn.Sequential(
            ConvBNLayer(
                out_channels[1], out_channels[1], 3, 1, act='relu',
                name='FPN_d5'),
            ConvBNLayer(
                out_channels[1], out_channels[2], 3, 2, act=None,
                name='FPN_d6'))
        # final smooth (3x3, relu) + 1x1 projection, no further downsampling
        self.g2_conv = nn.Sequential(
            ConvBNLayer(
                out_channels[2], out_channels[2], 3, 1, act='relu',
                name='FPN_d7'),
            ConvBNLayer(
                out_channels[2], out_channels[2], 1, 1, act=None,
                name='FPN_d8'))

    def forward(self, x):
        # Shallowest three feature maps only.
        f = x[:3]
        h0 = self.h0_conv(f[0])
        h1 = self.h1_conv(f[1])
        h2 = self.h2_conv(f[2])
        g0 = self.g0_conv(h0)
        # fuse with next scale: add + relu, then conv stack
        g1 = paddle.add(x=g0, y=h1)
        g1 = F.relu(g1)
        g1 = self.g1_conv(g1)
        g2 = paddle.add(x=g1, y=h2)
        g2 = F.relu(g2)
        g2 = self.g2_conv(g2)
        return g2
class PGFPN(nn.Layer):
    """FPN neck combining a bottom-up and a top-down fusion branch.

    Both branches must produce 128-channel maps of the same spatial size;
    their element-wise sum (ReLU-activated) is the single output feature
    map. NOTE(review): ``with_cab`` is stored but never used here —
    confirm whether a context-attention block was planned.
    """

    def __init__(self, in_channels, with_cab=False, **kwargs):
        super(PGFPN, self).__init__()
        self.in_channels = in_channels
        self.with_cab = with_cab
        self.FPN_Down_Fusion = FPN_Down_Fusion(self.in_channels)
        self.FPN_Up_Fusion = FPN_Up_Fusion(self.in_channels)
        # Exposed so the head can be built against this neck's output width.
        self.out_channels = 128

    def forward(self, x):
        # bottom-up branch over the shallow backbone maps
        f_down = self.FPN_Down_Fusion(x)
        # top-down branch over the deep backbone maps
        f_up = self.FPN_Up_Fusion(x)
        # fuse the two branches: add + relu
        f_common = paddle.add(x=f_down, y=f_up)
        f_common = F.relu(f_common)
        return f_common
ppocr/postprocess/__init__.py
View file @
1f76f449
...
@@ -28,10 +28,11 @@ def build_post_process(config, global_config=None):
...
@@ -28,10 +28,11 @@ def build_post_process(config, global_config=None):
from
.sast_postprocess
import
SASTPostProcess
from
.sast_postprocess
import
SASTPostProcess
from
.rec_postprocess
import
CTCLabelDecode
,
AttnLabelDecode
,
SRNLabelDecode
from
.rec_postprocess
import
CTCLabelDecode
,
AttnLabelDecode
,
SRNLabelDecode
from
.cls_postprocess
import
ClsPostProcess
from
.cls_postprocess
import
ClsPostProcess
from
.pg_postprocess
import
PGPostProcess
support_dict
=
[
support_dict
=
[
'DBPostProcess'
,
'EASTPostProcess'
,
'SASTPostProcess'
,
'CTCLabelDecode'
,
'DBPostProcess'
,
'EASTPostProcess'
,
'SASTPostProcess'
,
'CTCLabelDecode'
,
'AttnLabelDecode'
,
'ClsPostProcess'
,
'SRNLabelDecode'
'AttnLabelDecode'
,
'ClsPostProcess'
,
'SRNLabelDecode'
,
'PGPostProcess'
]
]
config
=
copy
.
deepcopy
(
config
)
config
=
copy
.
deepcopy
(
config
)
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment