wangsen / paddle_dbnet · Commits

Commit e40fd431, authored Sep 23, 2021 by Leif
Merge remote-tracking branch 'origin/dygraph' into dygraph
Parents: 6e0cbbe1, 0da240d0

Changes: 71 files in total; showing 20 changed files with 509 additions and 285 deletions (+509 / -285).
ppocr/data/imaug/__init__.py                 +3    -1
ppocr/data/imaug/label_ops.py                +7    -2
ppocr/data/imaug/make_pse_gt.py              +85   -0
ppocr/data/imaug/random_crop_data.py         +36   -28
ppocr/data/imaug/rec_img_aug.py              +24   -2
ppocr/data/simple_dataset.py                 +0    -1
ppocr/losses/__init__.py                     +5    -2
ppocr/losses/basic_loss.py                   +18   -11
ppocr/losses/combined_loss.py                +9    -5
ppocr/losses/det_basic_loss.py               +1    -56
ppocr/losses/det_pse_loss.py                 +145  -0
ppocr/losses/distillation_loss.py            +8    -6
ppocr/metrics/eval_det_iou.py                +0    -11
ppocr/modeling/backbones/rec_nrtr_mtb.py     +5    -3
ppocr/modeling/heads/__init__.py             +4    -2
ppocr/modeling/heads/det_pse_head.py         +35   -0
ppocr/modeling/heads/multiheadAttention.py   +37   -52
ppocr/modeling/heads/rec_nrtr_head.py        +83   -101
ppocr/modeling/heads/rec_sar_head.py         +2    -1
ppocr/modeling/necks/__init__.py             +2    -1
ppocr/data/imaug/__init__.py  (view file @ e40fd431)

@@ -19,11 +19,13 @@ from __future__ import unicode_literals
 from .iaa_augment import IaaAugment
 from .make_border_map import MakeBorderMap
 from .make_shrink_map import MakeShrinkMap
-from .random_crop_data import EastRandomCropData, PSERandomCrop
+from .random_crop_data import EastRandomCropData, RandomCropImgMask
+from .make_pse_gt import MakePseGt
-from .rec_img_aug import RecAug, RecResizeImg, ClsResizeImg, SRNRecResizeImg, NRTRRecResizeImg
+from .rec_img_aug import RecAug, RecResizeImg, ClsResizeImg, SRNRecResizeImg, NRTRRecResizeImg, SARRecResizeImg
 from .randaugment import RandAugment
 from .copy_paste import CopyPaste
 from .ColorJitter import ColorJitter
 from .operators import *
 from .label_ops import *
ppocr/data/imaug/label_ops.py  (view file @ e40fd431)

@@ -174,21 +174,26 @@ class NRTRLabelEncode(BaseRecLabelEncode):
         super(NRTRLabelEncode, self).__init__(
             max_text_length, character_dict_path, character_type,
             use_space_char)

     def __call__(self, data):
         text = data['label']
         text = self.encode(text)
         if text is None:
             return None
+        if len(text) >= self.max_text_len - 1:
+            return None
         data['length'] = np.array(len(text))
         text.insert(0, 2)
         text.append(3)
         text = text + [0] * (self.max_text_len - len(text))
         data['label'] = np.array(text)
         return data

     def add_special_char(self, dict_character):
         dict_character = ['blank', '<unk>', '<s>', '</s>'] + dict_character
         return dict_character


 class CTCLabelEncode(BaseRecLabelEncode):
     """ Convert between text-label and text-index """
ppocr/data/imaug/make_pse_gt.py  (new file, 0 → 100644, view file @ e40fd431)

# -*- coding:utf-8 -*-

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import cv2
import numpy as np
import pyclipper
from shapely.geometry import Polygon

__all__ = ['MakePseGt']


class MakePseGt(object):
    r'''
    Making binary mask from detection data with ICDAR format.
    Typically following the process of class `MakeICDARData`.
    '''

    def __init__(self, kernel_num=7, size=640, min_shrink_ratio=0.4, **kwargs):
        self.kernel_num = kernel_num
        self.min_shrink_ratio = min_shrink_ratio
        self.size = size

    def __call__(self, data):
        image = data['image']
        text_polys = data['polys']
        ignore_tags = data['ignore_tags']

        h, w, _ = image.shape
        short_edge = min(h, w)
        if short_edge < self.size:
            # keep short_size >= self.size
            scale = self.size / short_edge
            image = cv2.resize(image, dsize=None, fx=scale, fy=scale)
            text_polys *= scale

        gt_kernels = []
        for i in range(1, self.kernel_num + 1):
            # s1->sn, from big to small
            rate = 1.0 - (1.0 - self.min_shrink_ratio) / (self.kernel_num - 1
                                                          ) * i
            text_kernel, ignore_tags = self.generate_kernel(
                image.shape[0:2], rate, text_polys, ignore_tags)
            gt_kernels.append(text_kernel)

        training_mask = np.ones(image.shape[0:2], dtype='uint8')
        for i in range(text_polys.shape[0]):
            if ignore_tags[i]:
                cv2.fillPoly(training_mask,
                             text_polys[i].astype(np.int32)[np.newaxis, :, :],
                             0)

        gt_kernels = np.array(gt_kernels)
        gt_kernels[gt_kernels > 0] = 1

        data['image'] = image
        data['polys'] = text_polys
        data['gt_kernels'] = gt_kernels[0:]
        data['gt_text'] = gt_kernels[0]
        data['mask'] = training_mask.astype('float32')
        return data

    def generate_kernel(self, img_size, shrink_ratio, text_polys,
                        ignore_tags=None):
        h, w = img_size
        text_kernel = np.zeros((h, w), dtype=np.float32)
        for i, poly in enumerate(text_polys):
            polygon = Polygon(poly)
            distance = polygon.area * (1 - shrink_ratio * shrink_ratio) / (
                polygon.length + 1e-6)
            subject = [tuple(l) for l in poly]
            pco = pyclipper.PyclipperOffset()
            pco.AddPath(subject, pyclipper.JT_ROUND,
                        pyclipper.ET_CLOSEDPOLYGON)
            shrinked = np.array(pco.Execute(-distance))

            if len(shrinked) == 0 or shrinked.size == 0:
                if ignore_tags is not None:
                    ignore_tags[i] = True
                continue
            try:
                shrinked = np.array(shrinked[0]).reshape(-1, 2)
            except:
                if ignore_tags is not None:
                    ignore_tags[i] = True
                continue
            cv2.fillPoly(text_kernel, [shrinked.astype(np.int32)], i + 1)
        return text_kernel, ignore_tags
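
Not part of the commit: a minimal smoke test of how MakePseGt slots into the data pipeline, using a made-up rectangle polygon. It assumes pyclipper and shapely are installed and MakePseGt is imported from the module above.

import numpy as np

gen = MakePseGt(kernel_num=7, size=640, min_shrink_ratio=0.4)
data = {
    'image': np.zeros((640, 640, 3), dtype=np.uint8),
    # one axis-aligned text box; real samples come from MakeICDARData
    'polys': np.array([[[10., 10.], [100., 10.], [100., 40.], [10., 40.]]]),
    'ignore_tags': [False],
}
data = gen(data)
print(data['gt_kernels'].shape)  # (7, 640, 640): kernels s1..s7, big to small
print(data['gt_text'].shape, data['mask'].dtype)  # (640, 640) float32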
ppocr/data/imaug/random_crop_data.py  (view file @ e40fd431)

@@ -164,47 +164,55 @@ class EastRandomCropData(object):
         return data


-class PSERandomCrop(object):
-    def __init__(self, size, **kwargs):
+class RandomCropImgMask(object):
+    def __init__(self, size, main_key, crop_keys, p=3 / 8, **kwargs):
         self.size = size
+        self.main_key = main_key
+        self.crop_keys = crop_keys
+        self.p = p

     def __call__(self, data):
-        imgs = data['imgs']
+        image = data['image']

-        h, w = imgs[0].shape[0:2]
+        h, w = image.shape[0:2]
         th, tw = self.size
         if w == tw and h == th:
-            return imgs
+            return data

-        # the label contains text instances; crop with a probability,
-        # controlled via threshold_label_map
-        if np.max(imgs[2]) > 0 and random.random() > 3 / 8:
-            # top-left corner of the text instances
-            tl = np.min(np.where(imgs[2] > 0), axis=1) - self.size
+        mask = data[self.main_key]
+        if np.max(mask) > 0 and random.random() > self.p:
+            # make sure to crop the text region
+            tl = np.min(np.where(mask > 0), axis=1) - (th, tw)
             tl[tl < 0] = 0
-            # bottom-right corner of the text instances
-            br = np.max(np.where(imgs[2] > 0), axis=1) - self.size
+            br = np.max(np.where(mask > 0), axis=1) - (th, tw)
             br[br < 0] = 0
             # when picking the bottom-right point, keep enough room to crop
             br[0] = min(br[0], h - th)
             br[1] = min(br[1], w - tw)

-            for _ in range(50000):
-                i = random.randint(tl[0], br[0])
-                j = random.randint(tl[1], br[1])
-                # make sure the shrink_label_map still contains text
-                if imgs[1][i:i + th, j:j + tw].sum() <= 0:
-                    continue
-                else:
-                    break
+            i = random.randint(tl[0], br[0]) if tl[0] < br[0] else 0
+            j = random.randint(tl[1], br[1]) if tl[1] < br[1] else 0
         else:
-            i = random.randint(0, h - th)
-            j = random.randint(0, w - tw)
+            i = random.randint(0, h - th) if h - th > 0 else 0
+            j = random.randint(0, w - tw) if w - tw > 0 else 0

+        # return i, j, th, tw
-        for idx in range(len(imgs)):
-            if len(imgs[idx].shape) == 3:
-                imgs[idx] = imgs[idx][i:i + th, j:j + tw, :]
-            else:
-                imgs[idx] = imgs[idx][i:i + th, j:j + tw]
-        data['imgs'] = imgs
+        for k in data:
+            if k in self.crop_keys:
+                if len(data[k].shape) == 3:
+                    if np.argmin(data[k].shape) == 0:
+                        img = data[k][:, i:i + th, j:j + tw]
+                        if img.shape[1] != img.shape[2]:
+                            a = 1
+                    elif np.argmin(data[k].shape) == 2:
+                        img = data[k][i:i + th, j:j + tw, :]
+                        if img.shape[1] != img.shape[0]:
+                            a = 1
+                    else:
+                        img = data[k]
+                else:
+                    img = data[k][i:i + th, j:j + tw]
+                    if img.shape[0] != img.shape[1]:
+                        a = 1
+                data[k] = img
         return data
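
Again not from the diff: a hedged sketch of the renamed op. The main_key/crop_keys values follow the PSE ground-truth fields that MakePseGt produces above; any other key names in the sample dict would be handled the same way.

import numpy as np

crop = RandomCropImgMask(
    size=(640, 640), main_key='gt_text',
    crop_keys=['image', 'gt_text', 'gt_kernels', 'mask'])
sample = {
    'image': np.zeros((800, 800, 3), dtype='float32'),
    'gt_text': np.zeros((800, 800), dtype='float32'),
    'gt_kernels': np.zeros((7, 800, 800), dtype='float32'),
    'mask': np.ones((800, 800), dtype='float32'),
}
sample = crop(sample)
print(sample['image'].shape, sample['gt_kernels'].shape)
# (640, 640, 3) (7, 640, 640): HWC and CHW arrays are both handled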
ppocr/data/imaug/rec_img_aug.py  (view file @ e40fd431)

@@ -44,12 +44,33 @@ class ClsResizeImg(object):
 class NRTRRecResizeImg(object):
-    def __init__(self, image_shape, resize_type, **kwargs):
+    def __init__(self, image_shape, resize_type, padding=False, **kwargs):
         self.image_shape = image_shape
         self.resize_type = resize_type
+        self.padding = padding

     def __call__(self, data):
         img = data['image']
         img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
         image_shape = self.image_shape
+        if self.padding:
+            imgC, imgH, imgW = image_shape
+            # todo: change to 0 and modified image shape
+            h = img.shape[0]
+            w = img.shape[1]
+            ratio = w / float(h)
+            if math.ceil(imgH * ratio) > imgW:
+                resized_w = imgW
+            else:
+                resized_w = int(math.ceil(imgH * ratio))
+            resized_image = cv2.resize(img, (resized_w, imgH))
+            norm_img = np.expand_dims(resized_image, -1)
+            norm_img = norm_img.transpose((2, 0, 1))
+            resized_image = norm_img.astype(np.float32) / 128. - 1.
+            padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
+            padding_im[:, :, 0:resized_w] = resized_image
+            data['image'] = padding_im
+            return data
         if self.resize_type == 'PIL':
             image_pil = Image.fromarray(np.uint8(img))
             img = image_pil.resize(self.image_shape, Image.ANTIALIAS)

@@ -109,7 +130,8 @@ class SARRecResizeImg(object):
     def __call__(self, data):
         img = data['image']
         norm_img, resize_shape, pad_shape, valid_ratio = resize_norm_img_sar(
             img, self.image_shape, self.width_downsample_ratio)
         data['image'] = norm_img
         data['resized_shape'] = resize_shape
         data['pad_shape'] = pad_shape
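
A small sketch (not in the commit) of the new padding=True branch: a narrow grayscale crop is resized to the target height, scaled to [-1, 1], and right-padded with zeros out to imgW. It assumes the module's existing cv2/math/numpy imports.

import numpy as np

op = NRTRRecResizeImg(image_shape=[1, 32, 100], resize_type='PIL', padding=True)
sample = {'image': (np.random.rand(48, 96, 3) * 255).astype(np.uint8)}
out = op(sample)
print(out['image'].shape)  # (1, 32, 100); columns beyond the resized width are zero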
ppocr/data/simple_dataset.py  (view file @ e40fd431)

@@ -15,7 +15,6 @@ import numpy as np
 import os
 import random
 from paddle.io import Dataset
 from .imaug import transform, create_operators
ppocr/losses/__init__.py  (view file @ e40fd431)

@@ -20,6 +20,7 @@ import paddle.nn as nn
 from .det_db_loss import DBLoss
 from .det_east_loss import EASTLoss
 from .det_sast_loss import SASTLoss
+from .det_pse_loss import PSELoss

 # rec loss
 from .rec_ctc_loss import CTCLoss

@@ -42,10 +43,12 @@ from .combined_loss import CombinedLoss
 # table loss
 from .table_att_loss import TableAttentionLoss


 def build_loss(config):
-    support_dict = [
-        'DBLoss', 'EASTLoss', 'SASTLoss', 'CTCLoss', 'ClsLoss', 'AttentionLoss',
-        'SRNLoss', 'PGLoss', 'CombinedLoss', 'NRTRLoss', 'TableAttentionLoss',
-        'SARLoss'
-    ]
+    support_dict = [
+        'DBLoss', 'PSELoss', 'EASTLoss', 'SASTLoss', 'CTCLoss', 'ClsLoss',
+        'AttentionLoss', 'SRNLoss', 'PGLoss', 'CombinedLoss', 'NRTRLoss',
+        'TableAttentionLoss', 'SARLoss'
+    ]
     config = copy.deepcopy(config)
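
With PSELoss registered, the common loss factory can construct it from a config dict. A hedged sketch follows; the parameter values are illustrative, not the commit's defaults.

from ppocr.losses import build_loss

pse_config = {
    'name': 'PSELoss',            # must appear in support_dict above
    'alpha': 0.7,                 # text-loss weight; (1 - alpha) weights the kernels
    'ohem_ratio': 3,
    'kernel_sample_mask': 'pred',
    'reduction': 'mean',
}
criterion = build_loss(pse_config)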
ppocr/losses/basic_loss.py  (view file @ e40fd431)

@@ -56,31 +56,34 @@ class CELoss(nn.Layer):
 class KLJSLoss(object):
     def __init__(self, mode='kl'):
         assert mode in ['kl', 'js', 'KL', 'JS'], \
             "mode can only be one of ['kl', 'js', 'KL', 'JS']"
         self.mode = mode

     def __call__(self, p1, p2, reduction="mean"):
         loss = paddle.multiply(p2, paddle.log((p2 + 1e-5) / (p1 + 1e-5) + 1e-5))

         if self.mode.lower() == "js":
             loss += paddle.multiply(
                 p1, paddle.log((p1 + 1e-5) / (p2 + 1e-5) + 1e-5))
             loss *= 0.5
         if reduction == "mean":
             loss = paddle.mean(loss, axis=[1, 2])
         elif reduction == "none" or reduction is None:
             return loss
         else:
             loss = paddle.sum(loss, axis=[1, 2])
         return loss


 class DMLLoss(nn.Layer):
     """
     DMLLoss
     """

-    def __init__(self, act=None):
+    def __init__(self, act=None, use_log=False):
         super().__init__()
         if act is not None:
             assert act in ["softmax", "sigmoid"]

@@ -91,19 +94,23 @@ class DMLLoss(nn.Layer):
         else:
             self.act = None

+        self.use_log = use_log
         self.jskl_loss = KLJSLoss(mode="js")

     def forward(self, out1, out2):
         if self.act is not None:
             out1 = self.act(out1)
             out2 = self.act(out2)
-        if len(out1.shape) < 2:
+        if self.use_log:
+            # for recognition distillation, log is needed for feature map
             log_out1 = paddle.log(out1)
             log_out2 = paddle.log(out2)
             loss = (F.kl_div(
                 log_out1, out2, reduction='batchmean') + F.kl_div(
                     log_out2, out1, reduction='batchmean')) / 2.0
         else:
+            # for detection distillation log is not needed
             loss = self.jskl_loss(out1, out2)
         return loss
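
Illustrative only: the new use_log flag replaces the old shape-based branch, so the caller now states explicitly whether it wants symmetric KL on log-probabilities (recognition distillation) or the JS-style loss (detection maps). The import path simply follows this file.

import paddle
from ppocr.losses.basic_loss import DMLLoss

dml = DMLLoss(act="softmax", use_log=True)
out1, out2 = paddle.rand([4, 10]), paddle.rand([4, 10])
print(dml(out1, out2))  # scalar symmetric-KL loss between the two softmax outputs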
ppocr/losses/combined_loss.py  (view file @ e40fd431)

@@ -49,11 +49,15 @@ class CombinedLoss(nn.Layer):
             loss = loss_func(input, batch, **kargs)
             if isinstance(loss, paddle.Tensor):
                 loss = {"loss_{}_{}".format(str(loss), idx): loss}
             weight = self.loss_weight[idx]
-            for key in loss.keys():
-                if key == "loss":
-                    loss_all += loss[key] * weight
-                else:
-                    loss_dict["{}_{}".format(key, idx)] = loss[key]
+            loss = {key: loss[key] * weight for key in loss}
+            if "loss" in loss:
+                loss_all += loss["loss"]
+            else:
+                loss_all += paddle.add_n(list(loss.values()))
+            loss_dict.update(loss)
         loss_dict["loss"] = loss_all
         return loss_dict
ppocr/losses/det_basic_loss.py  (view file @ e40fd431)

@@ -75,12 +75,6 @@ class BalanceLoss(nn.Layer):
             mask (variable): masked maps.
             return: (variable) balanced loss
         """
-        # if self.main_loss_type in ['DiceLoss']:
-        #     # For the loss that returns to scalar value, perform ohem on the mask
-        #     mask = ohem_batch(pred, gt, mask, self.negative_ratio)
-        #     loss = self.loss(pred, gt, mask)
-        #     return loss
         positive = gt * mask
         negative = (1 - gt) * mask

@@ -154,52 +148,3 @@ class BCELoss(nn.Layer):
     def forward(self, input, label, mask=None, weight=None, name=None):
         loss = F.binary_cross_entropy(input, label, reduction=self.reduction)
         return loss
\ No newline at end of file
-
-
-def ohem_single(score, gt_text, training_mask, ohem_ratio):
-    pos_num = (int)(np.sum(gt_text > 0.5)) - (int)(
-        np.sum((gt_text > 0.5) & (training_mask <= 0.5)))
-
-    if pos_num == 0:
-        # selected_mask = gt_text.copy() * 0 # may be not good
-        selected_mask = training_mask
-        selected_mask = selected_mask.reshape(
-            1, selected_mask.shape[0], selected_mask.shape[1]).astype('float32')
-        return selected_mask
-
-    neg_num = (int)(np.sum(gt_text <= 0.5))
-    neg_num = (int)(min(pos_num * ohem_ratio, neg_num))
-
-    if neg_num == 0:
-        selected_mask = training_mask
-        selected_mask = selected_mask.reshape(
-            1, selected_mask.shape[0], selected_mask.shape[1]).astype('float32')
-        return selected_mask
-
-    neg_score = score[gt_text <= 0.5]
-    # sort negative-sample scores from high to low
-    neg_score_sorted = np.sort(-neg_score)
-    threshold = -neg_score_sorted[neg_num - 1]
-    # select the mask of high-scoring negatives and of positives
-    selected_mask = ((score >= threshold) |
-                     (gt_text > 0.5)) & (training_mask > 0.5)
-    selected_mask = selected_mask.reshape(
-        1, selected_mask.shape[0], selected_mask.shape[1]).astype('float32')
-    return selected_mask
-
-
-def ohem_batch(scores, gt_texts, training_masks, ohem_ratio):
-    scores = scores.numpy()
-    gt_texts = gt_texts.numpy()
-    training_masks = training_masks.numpy()
-    selected_masks = []
-    for i in range(scores.shape[0]):
-        selected_masks.append(
-            ohem_single(scores[i, :, :], gt_texts[i, :, :],
-                        training_masks[i, :, :], ohem_ratio))
-
-    selected_masks = np.concatenate(selected_masks, 0)
-    selected_masks = paddle.to_tensor(selected_masks)
-
-    return selected_masks
ppocr/losses/det_pse_loss.py  (new file, 0 → 100644, view file @ e40fd431)

# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
from paddle import nn
from paddle.nn import functional as F
import numpy as np
from ppocr.utils.iou import iou


class PSELoss(nn.Layer):
    def __init__(self,
                 alpha,
                 ohem_ratio=3,
                 kernel_sample_mask='pred',
                 reduction='sum',
                 eps=1e-6,
                 **kwargs):
        """Implement PSE Loss.
        """
        super(PSELoss, self).__init__()
        assert reduction in ['sum', 'mean', 'none']
        self.alpha = alpha
        self.ohem_ratio = ohem_ratio
        self.kernel_sample_mask = kernel_sample_mask
        self.reduction = reduction
        self.eps = eps

    def forward(self, outputs, labels):
        predicts = outputs['maps']
        predicts = F.interpolate(predicts, scale_factor=4)

        texts = predicts[:, 0, :, :]
        kernels = predicts[:, 1:, :, :]
        gt_texts, gt_kernels, training_masks = labels[1:]

        # text loss
        selected_masks = self.ohem_batch(texts, gt_texts, training_masks)

        loss_text = self.dice_loss(texts, gt_texts, selected_masks)
        iou_text = iou((texts > 0).astype('int64'),
                       gt_texts,
                       training_masks,
                       reduce=False)
        losses = dict(loss_text=loss_text, iou_text=iou_text)

        # kernel loss
        loss_kernels = []
        if self.kernel_sample_mask == 'gt':
            selected_masks = gt_texts * training_masks
        elif self.kernel_sample_mask == 'pred':
            selected_masks = (
                F.sigmoid(texts) > 0.5).astype('float32') * training_masks

        for i in range(kernels.shape[1]):
            kernel_i = kernels[:, i, :, :]
            gt_kernel_i = gt_kernels[:, i, :, :]
            loss_kernel_i = self.dice_loss(kernel_i, gt_kernel_i,
                                           selected_masks)
            loss_kernels.append(loss_kernel_i)
        loss_kernels = paddle.mean(paddle.stack(loss_kernels, axis=1), axis=1)
        iou_kernel = iou((kernels[:, -1, :, :] > 0).astype('int64'),
                         gt_kernels[:, -1, :, :],
                         training_masks * gt_texts,
                         reduce=False)
        losses.update(dict(loss_kernels=loss_kernels, iou_kernel=iou_kernel))
        loss = self.alpha * loss_text + (1 - self.alpha) * loss_kernels
        losses['loss'] = loss
        if self.reduction == 'sum':
            losses = {x: paddle.sum(v) for x, v in losses.items()}
        elif self.reduction == 'mean':
            losses = {x: paddle.mean(v) for x, v in losses.items()}
        return losses

    def dice_loss(self, input, target, mask):
        input = F.sigmoid(input)

        input = input.reshape([input.shape[0], -1])
        target = target.reshape([target.shape[0], -1])
        mask = mask.reshape([mask.shape[0], -1])

        input = input * mask
        target = target * mask

        a = paddle.sum(input * target, 1)
        b = paddle.sum(input * input, 1) + self.eps
        c = paddle.sum(target * target, 1) + self.eps
        d = (2 * a) / (b + c)
        return 1 - d

    def ohem_single(self, score, gt_text, training_mask, ohem_ratio=3):
        pos_num = int(paddle.sum((gt_text > 0.5).astype('float32'))) - int(
            paddle.sum(
                paddle.logical_and((gt_text > 0.5), (training_mask <= 0.5))
                .astype('float32')))

        if pos_num == 0:
            selected_mask = training_mask
            selected_mask = selected_mask.reshape(
                [1, selected_mask.shape[0], selected_mask.shape[1]]).astype(
                    'float32')
            return selected_mask

        neg_num = int(paddle.sum((gt_text <= 0.5).astype('float32')))
        neg_num = int(min(pos_num * ohem_ratio, neg_num))

        if neg_num == 0:
            selected_mask = training_mask
            selected_mask = selected_mask.view(
                1, selected_mask.shape[0],
                selected_mask.shape[1]).astype('float32')
            return selected_mask

        neg_score = paddle.masked_select(score, gt_text <= 0.5)
        neg_score_sorted = paddle.sort(-neg_score)
        threshold = -neg_score_sorted[neg_num - 1]

        selected_mask = paddle.logical_and(
            paddle.logical_or((score >= threshold), (gt_text > 0.5)),
            (training_mask > 0.5))
        selected_mask = selected_mask.reshape(
            [1, selected_mask.shape[0], selected_mask.shape[1]]).astype(
                'float32')
        return selected_mask

    def ohem_batch(self, scores, gt_texts, training_masks, ohem_ratio=3):
        selected_masks = []
        for i in range(scores.shape[0]):
            selected_masks.append(
                self.ohem_single(scores[i, :, :], gt_texts[i, :, :],
                                 training_masks[i, :, :], ohem_ratio))

        selected_masks = paddle.concat(selected_masks, 0).astype('float32')
        return selected_masks
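
A hedged smoke test for the new loss. It assumes the PSE convention visible above: the head emits kernel_num maps at 1/4 of label resolution (hence scale_factor=4), labels[0] is an image slot the loss ignores, and ppocr.utils.iou is available as imported. Note that the neg_num == 0 branch still calls the torch-style .view(...), which Paddle tensors do not provide; the random inputs below take paths that avoid it.

import paddle

loss_fn = PSELoss(alpha=0.7, reduction='mean')
outputs = {'maps': paddle.rand([2, 7, 160, 160])}  # text map + 6 kernel maps
labels = [
    None,                                                      # image, unused here
    paddle.randint(0, 2, [2, 640, 640]).astype('float32'),     # gt_texts
    paddle.randint(0, 2, [2, 6, 640, 640]).astype('float32'),  # gt_kernels
    paddle.ones([2, 640, 640]),                                # training_masks
]
losses = loss_fn(outputs, labels)
print(losses['loss'], losses['iou_text'])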
ppocr/losses/distillation_loss.py  (view file @ e40fd431)

@@ -44,10 +44,11 @@ class DistillationDMLLoss(DMLLoss):
     def __init__(self,
                  model_name_pairs=[],
                  act=None,
+                 use_log=False,
                  key=None,
                  maps_name=None,
                  name="dml"):
-        super().__init__(act=act)
+        super().__init__(act=act, use_log=use_log)
         assert isinstance(model_name_pairs, list)
         self.key = key
         self.model_name_pairs = self._check_model_name_pairs(model_name_pairs)

@@ -57,7 +58,8 @@ class DistillationDMLLoss(DMLLoss):
     def _check_model_name_pairs(self, model_name_pairs):
         if not isinstance(model_name_pairs, list):
             return []
-        elif isinstance(model_name_pairs[0], list) and isinstance(model_name_pairs[0][0], str):
+        elif isinstance(model_name_pairs[0], list) and isinstance(
+                model_name_pairs[0][0], str):
             return model_name_pairs
         else:
             return [model_name_pairs]

@@ -112,8 +114,8 @@ class DistillationDMLLoss(DMLLoss):
                         loss_dict["{}_{}_{}_{}_{}".format(
                             key, pair[0], pair[1], map_name, idx)] = loss[key]
                 else:
-                    loss_dict["{}_{}_{}".format(self.name, self.maps_name[_c], idx)] = loss
+                    loss_dict["{}_{}_{}".format(self.name, self.maps_name[
+                        _c], idx)] = loss

         loss_dict = _sum_loss(loss_dict)
ppocr/metrics/eval_det_iou.py  (view file @ e40fd431)

@@ -169,21 +169,10 @@ class DetectionIoUEvaluator(object):
         numGlobalCareDet += numDetCare

         perSampleMetrics = {
             'precision': precision,
             'recall': recall,
             'hmean': hmean,
             'pairs': pairs,
             'iouMat': [] if len(detPols) > 100 else iouMat.tolist(),
             'gtPolPoints': gtPolPoints,
             'detPolPoints': detPolPoints,
             'gtCare': numGtCare,
             'detCare': numDetCare,
             'gtDontCare': gtDontCarePolsNum,
             'detDontCare': detDontCarePolsNum,
             'detMatched': detMatched,
             'evaluationLog': evaluationLog
         }
         return perSampleMetrics

     def combine_results(self, results):
ppocr/modeling/backbones/rec_nrtr_mtb.py  (view file @ e40fd431)

@@ -13,6 +13,7 @@
 # limitations under the License.
 from paddle import nn
+import paddle


 class MTB(nn.Layer):

@@ -40,7 +41,8 @@ class MTB(nn.Layer):
         x = self.block(images)
         if self.cnn_num == 2:
             # (b, w, h, c)
-            x = x.transpose([0, 3, 2, 1])
-            x_shape = x.shape
-            x = x.reshape([x_shape[0], x_shape[1], x_shape[2] * x_shape[3]])
+            x = paddle.transpose(x, [0, 3, 2, 1])
+            x_shape = paddle.shape(x)
+            x = paddle.reshape(
+                x, [x_shape[0], x_shape[1], x_shape[2] * x_shape[3]])
         return x
ppocr/modeling/heads/__init__.py  (view file @ e40fd431)

@@ -20,6 +20,7 @@ def build_head(config):
     from .det_db_head import DBHead
     from .det_east_head import EASTHead
     from .det_sast_head import SASTHead
+    from .det_pse_head import PSEHead
     from .e2e_pg_head import PGHead

     # rec head

@@ -32,8 +33,9 @@ def build_head(config):
     # cls head
     from .cls_head import ClsHead
-    support_dict = [
-        'DBHead', 'EASTHead', 'SASTHead', 'CTCHead', 'ClsHead', 'AttentionHead',
-        'SRNHead', 'PGHead', 'Transformer', 'TableAttentionHead', 'SARHead'
-    ]
+    support_dict = [
+        'DBHead', 'PSEHead', 'EASTHead', 'SASTHead', 'CTCHead', 'ClsHead',
+        'AttentionHead', 'SRNHead', 'PGHead', 'Transformer',
+        'TableAttentionHead', 'SARHead'
+    ]

     #table head
ppocr/modeling/heads/det_pse_head.py  (new file, 0 → 100644, view file @ e40fd431)

# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle import nn


class PSEHead(nn.Layer):
    def __init__(self, in_channels, hidden_dim=256, out_channels=7, **kwargs):
        super(PSEHead, self).__init__()
        self.conv1 = nn.Conv2D(
            in_channels, hidden_dim, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2D(hidden_dim)
        self.relu1 = nn.ReLU()

        self.conv2 = nn.Conv2D(
            hidden_dim, out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, x, **kwargs):
        out = self.conv1(x)
        out = self.relu1(self.bn1(out))
        out = self.conv2(out)
        return {'maps': out}
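
For orientation (not in the diff): the head is a 3x3 conv + BN + ReLU followed by a 1x1 conv, so the output keeps the feature-map resolution and carries one channel per kernel.

import paddle

head = PSEHead(in_channels=256, hidden_dim=256, out_channels=7)
feat = paddle.rand([1, 256, 160, 160])  # e.g. a fused FPN feature map
print(head(feat)['maps'].shape)         # [1, 7, 160, 160]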
ppocr/modeling/heads/multiheadAttention.py  (view file @ e40fd431)

@@ -71,8 +71,6 @@ class MultiheadAttention(nn.Layer):
                 value,
                 key_padding_mask=None,
                 incremental_state=None,
-                need_weights=True,
-                static_kv=False,
                 attn_mask=None):
         """
         Inputs of forward function

@@ -88,46 +86,42 @@ class MultiheadAttention(nn.Layer):
             attn_output: [target length, batch size, embed dim]
             attn_output_weights: [batch size, target length, sequence length]
         """
-        tgt_len, bsz, embed_dim = query.shape
-        assert embed_dim == self.embed_dim
-        assert list(query.shape) == [tgt_len, bsz, embed_dim]
-        assert key.shape == value.shape
+        q_shape = paddle.shape(query)
+        src_shape = paddle.shape(key)
         q = self._in_proj_q(query)
         k = self._in_proj_k(key)
         v = self._in_proj_v(value)
         q *= self.scaling
-        q = q.reshape([tgt_len, bsz * self.num_heads,
-                       self.head_dim]).transpose([1, 0, 2])
-        k = k.reshape([-1, bsz * self.num_heads,
-                       self.head_dim]).transpose([1, 0, 2])
-        v = v.reshape([-1, bsz * self.num_heads,
-                       self.head_dim]).transpose([1, 0, 2])
-        src_len = k.shape[1]
+        q = paddle.transpose(
+            paddle.reshape(
+                q, [q_shape[0], q_shape[1], self.num_heads, self.head_dim]),
+            [1, 2, 0, 3])
+        k = paddle.transpose(
+            paddle.reshape(
+                k, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]),
+            [1, 2, 0, 3])
+        v = paddle.transpose(
+            paddle.reshape(
+                v, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]),
+            [1, 2, 0, 3])
         if key_padding_mask is not None:
-            assert key_padding_mask.shape[0] == bsz
-            assert key_padding_mask.shape[1] == src_len
-        attn_output_weights = paddle.bmm(q, k.transpose([0, 2, 1]))
-        assert list(attn_output_weights.shape) == [
-            bsz * self.num_heads, tgt_len, src_len
-        ]
+            assert key_padding_mask.shape[0] == q_shape[1]
+            assert key_padding_mask.shape[1] == src_shape[0]
+        attn_output_weights = paddle.matmul(q,
+                                            paddle.transpose(k, [0, 1, 3, 2]))
         if attn_mask is not None:
-            attn_mask = attn_mask.unsqueeze(0)
+            attn_mask = paddle.unsqueeze(paddle.unsqueeze(attn_mask, 0), 0)
             attn_output_weights += attn_mask
         if key_padding_mask is not None:
-            attn_output_weights = attn_output_weights.reshape(
-                [bsz, self.num_heads, tgt_len, src_len])
-            key = key_padding_mask.unsqueeze(1).unsqueeze(2).astype('float32')
-            y = paddle.full(shape=key.shape, dtype='float32', fill_value='-inf')
+            attn_output_weights = paddle.reshape(
+                attn_output_weights,
+                [q_shape[1], self.num_heads, q_shape[0], src_shape[0]])
+            key = paddle.unsqueeze(paddle.unsqueeze(key_padding_mask, 1), 2)
+            key = paddle.cast(key, 'float32')
+            y = paddle.full(
+                shape=paddle.shape(key), dtype='float32', fill_value='-inf')
             y = paddle.where(key == 0., key, y)
             attn_output_weights += y
-            attn_output_weights = attn_output_weights.reshape(
-                [bsz * self.num_heads, tgt_len, src_len])
         attn_output_weights = F.softmax(
             attn_output_weights.astype('float32'),
             axis=-1,

@@ -136,43 +130,34 @@ class MultiheadAttention(nn.Layer):
         attn_output_weights = F.dropout(
             attn_output_weights, p=self.dropout, training=self.training)
-        attn_output = paddle.bmm(attn_output_weights, v)
-        assert list(attn_output.shape) == [
-            bsz * self.num_heads, tgt_len, self.head_dim
-        ]
-        attn_output = attn_output.transpose([1, 0, 2]).reshape(
-            [tgt_len, bsz, embed_dim])
+        attn_output = paddle.matmul(attn_output_weights, v)
+        attn_output = paddle.reshape(
+            paddle.transpose(attn_output, [2, 0, 1, 3]),
+            [q_shape[0], q_shape[1], self.embed_dim])
         attn_output = self.out_proj(attn_output)
-        if need_weights:
-            # average attention weights over heads
-            attn_output_weights = attn_output_weights.reshape(
-                [bsz, self.num_heads, tgt_len, src_len])
-            attn_output_weights = attn_output_weights.sum(
-                axis=1) / self.num_heads
-        else:
-            attn_output_weights = None
-        return attn_output, attn_output_weights
+        return attn_output

     def _in_proj_q(self, query):
-        query = query.transpose([1, 2, 0])
+        query = paddle.transpose(query, [1, 2, 0])
         query = paddle.unsqueeze(query, axis=2)
         res = self.conv1(query)
         res = paddle.squeeze(res, axis=2)
-        res = res.transpose([2, 0, 1])
+        res = paddle.transpose(res, [2, 0, 1])
         return res

     def _in_proj_k(self, key):
-        key = key.transpose([1, 2, 0])
+        key = paddle.transpose(key, [1, 2, 0])
         key = paddle.unsqueeze(key, axis=2)
         res = self.conv2(key)
         res = paddle.squeeze(res, axis=2)
-        res = res.transpose([2, 0, 1])
+        res = paddle.transpose(res, [2, 0, 1])
         return res

     def _in_proj_v(self, value):
-        value = value.transpose([1, 2, 0])  #(1, 2, 0)
+        value = paddle.transpose(value, [1, 2, 0])  #(1, 2, 0)
         value = paddle.unsqueeze(value, axis=2)
         res = self.conv3(value)
         res = paddle.squeeze(res, axis=2)
-        res = res.transpose([2, 0, 1])
+        res = paddle.transpose(res, [2, 0, 1])
         return res
ppocr/modeling/heads/rec_nrtr_head.py  (view file @ e40fd431)

@@ -61,12 +61,12 @@ class Transformer(nn.Layer):
             custom_decoder=None,
             in_channels=0,
             out_channels=0,
-            dst_vocab_size=99,
             scale_embedding=True):
         super(Transformer, self).__init__()
+        self.out_channels = out_channels + 1
         self.embedding = Embeddings(
             d_model=d_model,
-            vocab=dst_vocab_size,
+            vocab=self.out_channels,
             padding_idx=0,
             scale_embedding=scale_embedding)
         self.positional_encoding = PositionalEncoding(

@@ -96,9 +96,10 @@ class Transformer(nn.Layer):
         self.beam_size = beam_size
         self.d_model = d_model
         self.nhead = nhead
-        self.tgt_word_prj = nn.Linear(d_model, dst_vocab_size, bias_attr=False)
+        self.tgt_word_prj = nn.Linear(
+            d_model, self.out_channels, bias_attr=False)
         w0 = np.random.normal(0.0, d_model**-0.5,
-                              (d_model, dst_vocab_size)).astype(np.float32)
+                              (d_model, self.out_channels)).astype(np.float32)
         self.tgt_word_prj.weight.set_value(w0)
         self.apply(self._init_weights)

@@ -156,46 +157,41 @@ class Transformer(nn.Layer):
         return self.forward_test(src)

     def forward_test(self, src):
-        bs = src.shape[0]
+        bs = paddle.shape(src)[0]
         if self.encoder is not None:
-            src = self.positional_encoding(src.transpose([1, 0, 2]))
+            src = self.positional_encoding(paddle.transpose(src, [1, 0, 2]))
            memory = self.encoder(src)
         else:
-            memory = src.squeeze(2).transpose([2, 0, 1])
+            memory = paddle.transpose(paddle.squeeze(src, 2), [2, 0, 1])
         dec_seq = paddle.full((bs, 1), 2, dtype=paddle.int64)
+        dec_prob = paddle.full((bs, 1), 1., dtype=paddle.float32)
         for len_dec_seq in range(1, 25):
-            src_enc = memory.clone()
-            tgt_key_padding_mask = self.generate_padding_mask(dec_seq)
-            dec_seq_embed = self.embedding(dec_seq).transpose([1, 0, 2])
+            dec_seq_embed = paddle.transpose(self.embedding(dec_seq), [1, 0, 2])
             dec_seq_embed = self.positional_encoding(dec_seq_embed)
-            tgt_mask = self.generate_square_subsequent_mask(
-                dec_seq_embed.shape[0])
+            tgt_mask = self.generate_square_subsequent_mask(
+                paddle.shape(dec_seq_embed)[0])
             output = self.decoder(
                 dec_seq_embed,
-                src_enc,
+                memory,
                 tgt_mask=tgt_mask,
                 memory_mask=None,
-                tgt_key_padding_mask=tgt_key_padding_mask,
+                tgt_key_padding_mask=None,
                 memory_key_padding_mask=None)
-            dec_output = output.transpose([1, 0, 2])
-            dec_output = dec_output[:, -1, :]  # Pick the last step: (bh * bm) * d_h
-            word_prob = F.log_softmax(self.tgt_word_prj(dec_output), axis=1)
-            word_prob = word_prob.reshape([1, bs, -1])
-            preds_idx = word_prob.argmax(axis=2)
+            dec_output = paddle.transpose(output, [1, 0, 2])
+            dec_output = dec_output[:, -1, :]
+            word_prob = F.softmax(self.tgt_word_prj(dec_output), axis=1)
+            preds_idx = paddle.argmax(word_prob, axis=1)
             if paddle.equal_all(
-                    preds_idx[-1],
-                    paddle.full(preds_idx[-1].shape, 3, dtype='int64')):
+                    preds_idx,
+                    paddle.full(paddle.shape(preds_idx), 3, dtype='int64')):
                 break
-            preds_prob = word_prob.max(axis=2)
+            preds_prob = paddle.max(word_prob, axis=1)
             dec_seq = paddle.concat(
-                [dec_seq, preds_idx.reshape([-1, 1])], axis=1)
-        return dec_seq
+                [dec_seq, paddle.reshape(preds_idx, [-1, 1])], axis=1)
+            dec_prob = paddle.concat(
+                [dec_prob, paddle.reshape(preds_prob, [-1, 1])], axis=1)
+        return [dec_seq, dec_prob]

     def forward_beam(self, images):
         ''' Translation work in one batch '''

@@ -211,14 +207,15 @@ class Transformer(nn.Layer):
                                          n_prev_active_inst, n_bm):
             ''' Collect tensor parts associated to active instances. '''
-            _, *d_hs = beamed_tensor.shape
+            beamed_tensor_shape = paddle.shape(beamed_tensor)
             n_curr_active_inst = len(curr_active_inst_idx)
-            new_shape = (n_curr_active_inst * n_bm, *d_hs)
+            new_shape = (n_curr_active_inst * n_bm, beamed_tensor_shape[1],
+                         beamed_tensor_shape[2])

             beamed_tensor = beamed_tensor.reshape([n_prev_active_inst, -1])
-            beamed_tensor = beamed_tensor.index_select(
-                paddle.to_tensor(curr_active_inst_idx), axis=0)
-            beamed_tensor = beamed_tensor.reshape([*new_shape])
+            beamed_tensor = beamed_tensor.index_select(
+                curr_active_inst_idx, axis=0)
+            beamed_tensor = beamed_tensor.reshape(new_shape)

             return beamed_tensor

@@ -249,44 +246,26 @@ class Transformer(nn.Layer):
                 b.get_current_state() for b in inst_dec_beams if not b.done
             ]
             dec_partial_seq = paddle.stack(dec_partial_seq)
             dec_partial_seq = dec_partial_seq.reshape([-1, len_dec_seq])
             return dec_partial_seq

-        def prepare_beam_memory_key_padding_mask(
-                inst_dec_beams, memory_key_padding_mask, n_bm):
-            keep = []
-            for idx in (memory_key_padding_mask):
-                if not inst_dec_beams[idx].done:
-                    keep.append(idx)
-            memory_key_padding_mask = memory_key_padding_mask[
-                paddle.to_tensor(keep)]
-            len_s = memory_key_padding_mask.shape[-1]
-            n_inst = memory_key_padding_mask.shape[0]
-            memory_key_padding_mask = paddle.concat(
-                [memory_key_padding_mask for i in range(n_bm)], axis=1)
-            memory_key_padding_mask = memory_key_padding_mask.reshape(
-                [n_inst * n_bm, len_s])  #repeat(1, n_bm)
-            return memory_key_padding_mask

         def predict_word(dec_seq, enc_output, n_active_inst, n_bm,
                          memory_key_padding_mask):
-            tgt_key_padding_mask = self.generate_padding_mask(dec_seq)
-            dec_seq = self.embedding(dec_seq).transpose([1, 0, 2])
+            dec_seq = paddle.transpose(self.embedding(dec_seq), [1, 0, 2])
             dec_seq = self.positional_encoding(dec_seq)
-            tgt_mask = self.generate_square_subsequent_mask(dec_seq.shape[0])
+            tgt_mask = self.generate_square_subsequent_mask(
+                paddle.shape(dec_seq)[0])
             dec_output = self.decoder(
                 dec_seq,
                 enc_output,
                 tgt_mask=tgt_mask,
-                tgt_key_padding_mask=tgt_key_padding_mask,
-                memory_key_padding_mask=memory_key_padding_mask,
-            ).transpose([1, 0, 2])
+                tgt_key_padding_mask=None,
+                memory_key_padding_mask=memory_key_padding_mask, )
+            dec_output = paddle.transpose(dec_output, [1, 0, 2])
             dec_output = dec_output[:, -1, :]  # Pick the last step: (bh * bm) * d_h
-            word_prob = F.log_softmax(self.tgt_word_prj(dec_output), axis=1)
-            word_prob = word_prob.reshape([n_active_inst, n_bm, -1])
+            word_prob = F.softmax(self.tgt_word_prj(dec_output), axis=1)
+            word_prob = paddle.reshape(word_prob, [n_active_inst, n_bm, -1])
             return word_prob

@@ -302,9 +281,8 @@ class Transformer(nn.Layer):
             n_active_inst = len(inst_idx_to_position_map)
             dec_seq = prepare_beam_dec_seq(inst_dec_beams, len_dec_seq)
-            memory_key_padding_mask = None
             word_prob = predict_word(dec_seq, enc_output, n_active_inst, n_bm,
-                                     memory_key_padding_mask)
+                                     None)
             # Update the beam with predicted word prob information and collect incomplete instances
             active_inst_idx_list = collect_active_inst_idx_list(
                 inst_dec_beams, word_prob, inst_idx_to_position_map)

@@ -324,27 +302,21 @@ class Transformer(nn.Layer):
         with paddle.no_grad():
             #-- Encode
             if self.encoder is not None:
                 src = self.positional_encoding(images.transpose([1, 0, 2]))
-                src_enc = self.encoder(src).transpose([1, 0, 2])
+                src_enc = self.encoder(src)
             else:
                 src_enc = images.squeeze(2).transpose([0, 2, 1])

-            #-- Repeat data for beam search
             n_bm = self.beam_size
-            n_inst, len_s, d_h = src_enc.shape
-            src_enc = paddle.concat([src_enc for i in range(n_bm)], axis=1)
-            src_enc = src_enc.reshape([n_inst * n_bm, len_s, d_h]).transpose(
-                [1, 0, 2])
-            #-- Prepare beams
-            inst_dec_beams = [Beam(n_bm) for _ in range(n_inst)]
-            #-- Bookkeeping for active or not
-            active_inst_idx_list = list(range(n_inst))
+            src_shape = paddle.shape(src_enc)
+            inst_dec_beams = [Beam(n_bm) for _ in range(1)]
+            active_inst_idx_list = list(range(1))
+            # Repeat data for beam search
+            src_enc = paddle.tile(src_enc, [1, n_bm, 1])
             inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(
                 active_inst_idx_list)
-            #-- Decode
+            # Decode
             for len_dec_seq in range(1, 25):
                 src_enc_copy = src_enc.clone()
                 active_inst_idx_list = beam_decode_step(

@@ -358,10 +330,19 @@ class Transformer(nn.Layer):
         batch_hyp, batch_scores = collect_hypothesis_and_scores(inst_dec_beams,
                                                                 1)
         result_hyp = []
-        for bs_hyp in batch_hyp:
-            bs_hyp_pad = bs_hyp[0] + [3] * (25 - len(bs_hyp[0]))
-            result_hyp.append(bs_hyp_pad)
-        return paddle.to_tensor(np.array(result_hyp), dtype=paddle.int64)
+        hyp_scores = []
+        for bs_hyp, score in zip(batch_hyp, batch_scores):
+            l = len(bs_hyp[0])
+            bs_hyp_pad = bs_hyp[0] + [3] * (25 - l)
+            result_hyp.append(bs_hyp_pad)
+            score = float(score) / l
+            hyp_score = [score for _ in range(25)]
+            hyp_scores.append(hyp_score)
+        return [
+            paddle.to_tensor(
+                np.array(result_hyp), dtype=paddle.int64),
+            paddle.to_tensor(hyp_scores)
+        ]

     def generate_square_subsequent_mask(self, sz):
         """Generate a square mask for the sequence. The masked positions are filled with float('-inf').

@@ -376,7 +357,7 @@ class Transformer(nn.Layer):
         return mask

     def generate_padding_mask(self, x):
-        padding_mask = x.equal(paddle.to_tensor(0, dtype=x.dtype))
+        padding_mask = paddle.equal(x, paddle.to_tensor(0, dtype=x.dtype))
         return padding_mask

     def _reset_parameters(self):

@@ -514,17 +495,17 @@ class TransformerEncoderLayer(nn.Layer):
             src, src,
             attn_mask=src_mask,
-            key_padding_mask=src_key_padding_mask)[0]
+            key_padding_mask=src_key_padding_mask)
         src = src + self.dropout1(src2)
         src = self.norm1(src)
-        src = src.transpose([1, 2, 0])
+        src = paddle.transpose(src, [1, 2, 0])
         src = paddle.unsqueeze(src, 2)
         src2 = self.conv2(F.relu(self.conv1(src)))
         src2 = paddle.squeeze(src2, 2)
-        src2 = src2.transpose([2, 0, 1])
+        src2 = paddle.transpose(src2, [2, 0, 1])
         src = paddle.squeeze(src, 2)
-        src = src.transpose([2, 0, 1])
+        src = paddle.transpose(src, [2, 0, 1])
         src = src + self.dropout2(src2)
         src = self.norm2(src)

@@ -598,7 +579,7 @@ class TransformerDecoderLayer(nn.Layer):
             tgt, tgt,
             attn_mask=tgt_mask,
-            key_padding_mask=tgt_key_padding_mask)[0]
+            key_padding_mask=tgt_key_padding_mask)
         tgt = tgt + self.dropout1(tgt2)
         tgt = self.norm1(tgt)
         tgt2 = self.multihead_attn(

@@ -606,18 +587,18 @@ class TransformerDecoderLayer(nn.Layer):
             memory, memory,
             attn_mask=memory_mask,
-            key_padding_mask=memory_key_padding_mask)[0]
+            key_padding_mask=memory_key_padding_mask)
         tgt = tgt + self.dropout2(tgt2)
         tgt = self.norm2(tgt)

         # default
-        tgt = tgt.transpose([1, 2, 0])
+        tgt = paddle.transpose(tgt, [1, 2, 0])
         tgt = paddle.unsqueeze(tgt, 2)
         tgt2 = self.conv2(F.relu(self.conv1(tgt)))
         tgt2 = paddle.squeeze(tgt2, 2)
-        tgt2 = tgt2.transpose([2, 0, 1])
+        tgt2 = paddle.transpose(tgt2, [2, 0, 1])
         tgt = paddle.squeeze(tgt, 2)
-        tgt = tgt.transpose([2, 0, 1])
+        tgt = paddle.transpose(tgt, [2, 0, 1])
         tgt = tgt + self.dropout3(tgt2)
         tgt = self.norm3(tgt)

@@ -656,8 +637,8 @@ class PositionalEncoding(nn.Layer):
             (-math.log(10000.0) / dim))
         pe[:, 0::2] = paddle.sin(position * div_term)
         pe[:, 1::2] = paddle.cos(position * div_term)
-        pe = pe.unsqueeze(0)
-        pe = pe.transpose([1, 0, 2])
+        pe = paddle.unsqueeze(pe, 0)
+        pe = paddle.transpose(pe, [1, 0, 2])
         self.register_buffer('pe', pe)

     def forward(self, x):

@@ -670,7 +651,7 @@ class PositionalEncoding(nn.Layer):
         Examples:
             >>> output = pos_encoder(x)
         """
-        x = x + self.pe[:x.shape[0], :]
+        x = x + self.pe[:paddle.shape(x)[0], :]
         return self.dropout(x)

@@ -702,7 +683,7 @@ class PositionalEncoding_2d(nn.Layer):
             (-math.log(10000.0) / dim))
         pe[:, 0::2] = paddle.sin(position * div_term)
         pe[:, 1::2] = paddle.cos(position * div_term)
-        pe = pe.unsqueeze(0).transpose([1, 0, 2])
+        pe = paddle.transpose(paddle.unsqueeze(pe, 0), [1, 0, 2])
         self.register_buffer('pe', pe)

         self.avg_pool_1 = nn.AdaptiveAvgPool2D((1, 1))

@@ -722,21 +703,22 @@ class PositionalEncoding_2d(nn.Layer):
         Examples:
             >>> output = pos_encoder(x)
         """
-        w_pe = self.pe[:x.shape[-1], :]
+        w_pe = self.pe[:paddle.shape(x)[-1], :]
         w1 = self.linear1(self.avg_pool_1(x).squeeze()).unsqueeze(0)
         w_pe = w_pe * w1
-        w_pe = w_pe.transpose([1, 2, 0])
-        w_pe = w_pe.unsqueeze(2)
+        w_pe = paddle.transpose(w_pe, [1, 2, 0])
+        w_pe = paddle.unsqueeze(w_pe, 2)

-        h_pe = self.pe[:x.shape[-2], :]
+        h_pe = self.pe[:paddle.shape(x).shape[-2], :]
         w2 = self.linear2(self.avg_pool_2(x).squeeze()).unsqueeze(0)
         h_pe = h_pe * w2
-        h_pe = h_pe.transpose([1, 2, 0])
-        h_pe = h_pe.unsqueeze(3)
+        h_pe = paddle.transpose(h_pe, [1, 2, 0])
+        h_pe = paddle.unsqueeze(h_pe, 3)

         x = x + w_pe + h_pe
-        x = x.reshape([x.shape[0], x.shape[1], x.shape[2] * x.shape[3]
-                       ]).transpose([2, 0, 1])
+        x = paddle.transpose(
+            paddle.reshape(x,
+                           [x.shape[0], x.shape[1], x.shape[2] * x.shape[3]]),
+            [2, 0, 1])
         return self.dropout(x)

@@ -817,7 +799,7 @@ class Beam():
     def sort_scores(self):
         "Sort the scores."
-        return self.scores, paddle.to_tensor(
-            [i for i in range(self.scores.shape[0])], dtype='int32')
+        return self.scores, paddle.to_tensor(
+            [i for i in range(int(self.scores.shape[0]))], dtype='int32')

     def get_the_best_score_and_idx(self):
         "Get the score of the best in the beam."
ppocr/modeling/heads/rec_sar_head.py  (view file @ e40fd431)

@@ -235,6 +235,7 @@ class ParallelSARDecoder(BaseDecoder):
             # cal mask of attention weight
             for i, valid_ratio in enumerate(valid_ratios):
                 valid_width = min(w, math.ceil(w * valid_ratio))
-                attn_weight[i, :, :, valid_width:, :] = float('-inf')
+                if valid_width < w:
+                    attn_weight[i, :, :, valid_width:, :] = float('-inf')

         attn_weight = paddle.reshape(attn_weight, [bsz, T, -1])
ppocr/modeling/necks/__init__.py  (view file @ e40fd431)

@@ -22,7 +22,8 @@ def build_neck(config):
     from .rnn import SequenceEncoder
     from .pg_fpn import PGFPN
     from .table_fpn import TableFPN
-    support_dict = [
-        'DBFPN', 'EASTFPN', 'SASTFPN', 'SequenceEncoder', 'PGFPN', 'TableFPN'
-    ]
+    from .fpn import FPN
+    support_dict = [
+        'FPN', 'DBFPN', 'EASTFPN', 'SASTFPN', 'SequenceEncoder', 'PGFPN',
+        'TableFPN'
+    ]

     module_name = config.pop('name')
     assert module_name in support_dict, Exception('neck only support {}'.format(