Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
paddle_dbnet
Commits
006d84bf
Unverified
Commit
006d84bf
authored
Oct 21, 2021
by
崔浩
Committed by
GitHub
Oct 21, 2021
Browse files
Merge branch 'PaddlePaddle:dygraph' into dygraph
parents
302ca30c
8beeb84c
Changes
241
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2063 additions
and
41 deletions
+2063
-41
ppocr/losses/distillation_loss.py
ppocr/losses/distillation_loss.py
+9
-7
ppocr/losses/rec_aster_loss.py
ppocr/losses/rec_aster_loss.py
+99
-0
ppocr/losses/rec_ctc_loss.py
ppocr/losses/rec_ctc_loss.py
+10
-2
ppocr/losses/rec_enhanced_ctc_loss.py
ppocr/losses/rec_enhanced_ctc_loss.py
+70
-0
ppocr/losses/rec_nrtr_loss.py
ppocr/losses/rec_nrtr_loss.py
+30
-0
ppocr/losses/rec_sar_loss.py
ppocr/losses/rec_sar_loss.py
+28
-0
ppocr/metrics/eval_det_iou.py
ppocr/metrics/eval_det_iou.py
+0
-11
ppocr/metrics/rec_metric.py
ppocr/metrics/rec_metric.py
+11
-1
ppocr/modeling/architectures/base_model.py
ppocr/modeling/architectures/base_model.py
+0
-1
ppocr/modeling/backbones/__init__.py
ppocr/modeling/backbones/__init__.py
+5
-1
ppocr/modeling/backbones/rec_mv1_enhance.py
ppocr/modeling/backbones/rec_mv1_enhance.py
+3
-12
ppocr/modeling/backbones/rec_nrtr_mtb.py
ppocr/modeling/backbones/rec_nrtr_mtb.py
+48
-0
ppocr/modeling/backbones/rec_resnet_31.py
ppocr/modeling/backbones/rec_resnet_31.py
+176
-0
ppocr/modeling/backbones/rec_resnet_aster.py
ppocr/modeling/backbones/rec_resnet_aster.py
+140
-0
ppocr/modeling/heads/__init__.py
ppocr/modeling/heads/__init__.py
+8
-2
ppocr/modeling/heads/det_pse_head.py
ppocr/modeling/heads/det_pse_head.py
+35
-0
ppocr/modeling/heads/multiheadAttention.py
ppocr/modeling/heads/multiheadAttention.py
+163
-0
ppocr/modeling/heads/rec_aster_head.py
ppocr/modeling/heads/rec_aster_head.py
+389
-0
ppocr/modeling/heads/rec_ctc_head.py
ppocr/modeling/heads/rec_ctc_head.py
+13
-4
ppocr/modeling/heads/rec_nrtr_head.py
ppocr/modeling/heads/rec_nrtr_head.py
+826
-0
No files found.
ppocr/losses/distillation_loss.py
View file @
006d84bf
...
@@ -44,20 +44,22 @@ class DistillationDMLLoss(DMLLoss):
...
@@ -44,20 +44,22 @@ class DistillationDMLLoss(DMLLoss):
def
__init__
(
self
,
def
__init__
(
self
,
model_name_pairs
=
[],
model_name_pairs
=
[],
act
=
None
,
act
=
None
,
use_log
=
False
,
key
=
None
,
key
=
None
,
maps_name
=
None
,
maps_name
=
None
,
name
=
"dml"
):
name
=
"dml"
):
super
().
__init__
(
act
=
act
)
super
().
__init__
(
act
=
act
,
use_log
=
use_log
)
assert
isinstance
(
model_name_pairs
,
list
)
assert
isinstance
(
model_name_pairs
,
list
)
self
.
key
=
key
self
.
key
=
key
self
.
model_name_pairs
=
self
.
_check_model_name_pairs
(
model_name_pairs
)
self
.
model_name_pairs
=
self
.
_check_model_name_pairs
(
model_name_pairs
)
self
.
name
=
name
self
.
name
=
name
self
.
maps_name
=
self
.
_check_maps_name
(
maps_name
)
self
.
maps_name
=
self
.
_check_maps_name
(
maps_name
)
def
_check_model_name_pairs
(
self
,
model_name_pairs
):
def
_check_model_name_pairs
(
self
,
model_name_pairs
):
if
not
isinstance
(
model_name_pairs
,
list
):
if
not
isinstance
(
model_name_pairs
,
list
):
return
[]
return
[]
elif
isinstance
(
model_name_pairs
[
0
],
list
)
and
isinstance
(
model_name_pairs
[
0
][
0
],
str
):
elif
isinstance
(
model_name_pairs
[
0
],
list
)
and
isinstance
(
model_name_pairs
[
0
][
0
],
str
):
return
model_name_pairs
return
model_name_pairs
else
:
else
:
return
[
model_name_pairs
]
return
[
model_name_pairs
]
...
@@ -110,11 +112,11 @@ class DistillationDMLLoss(DMLLoss):
...
@@ -110,11 +112,11 @@ class DistillationDMLLoss(DMLLoss):
if
isinstance
(
loss
,
dict
):
if
isinstance
(
loss
,
dict
):
for
key
in
loss
:
for
key
in
loss
:
loss_dict
[
"{}_{}_{}_{}_{}"
.
format
(
key
,
pair
[
loss_dict
[
"{}_{}_{}_{}_{}"
.
format
(
key
,
pair
[
0
],
pair
[
1
],
map_name
,
idx
)]
=
loss
[
key
]
0
],
pair
[
1
],
self
.
map
s
_name
,
idx
)]
=
loss
[
key
]
else
:
else
:
loss_dict
[
"{}_{}_{}"
.
format
(
self
.
name
,
self
.
maps_name
[
_c
],
loss_dict
[
"{}_{}_{}"
.
format
(
self
.
name
,
self
.
maps_name
[
idx
)]
=
loss
_c
],
idx
)]
=
loss
loss_dict
=
_sum_loss
(
loss_dict
)
loss_dict
=
_sum_loss
(
loss_dict
)
return
loss_dict
return
loss_dict
...
...
ppocr/losses/rec_aster_loss.py
0 → 100644
View file @
006d84bf
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
paddle
from
paddle
import
nn
class
CosineEmbeddingLoss
(
nn
.
Layer
):
def
__init__
(
self
,
margin
=
0.
):
super
(
CosineEmbeddingLoss
,
self
).
__init__
()
self
.
margin
=
margin
self
.
epsilon
=
1e-12
def
forward
(
self
,
x1
,
x2
,
target
):
similarity
=
paddle
.
fluid
.
layers
.
reduce_sum
(
x1
*
x2
,
dim
=-
1
)
/
(
paddle
.
norm
(
x1
,
axis
=-
1
)
*
paddle
.
norm
(
x2
,
axis
=-
1
)
+
self
.
epsilon
)
one_list
=
paddle
.
full_like
(
target
,
fill_value
=
1
)
out
=
paddle
.
fluid
.
layers
.
reduce_mean
(
paddle
.
where
(
paddle
.
equal
(
target
,
one_list
),
1.
-
similarity
,
paddle
.
maximum
(
paddle
.
zeros_like
(
similarity
),
similarity
-
self
.
margin
)))
return
out
class
AsterLoss
(
nn
.
Layer
):
def
__init__
(
self
,
weight
=
None
,
size_average
=
True
,
ignore_index
=-
100
,
sequence_normalize
=
False
,
sample_normalize
=
True
,
**
kwargs
):
super
(
AsterLoss
,
self
).
__init__
()
self
.
weight
=
weight
self
.
size_average
=
size_average
self
.
ignore_index
=
ignore_index
self
.
sequence_normalize
=
sequence_normalize
self
.
sample_normalize
=
sample_normalize
self
.
loss_sem
=
CosineEmbeddingLoss
()
self
.
is_cosin_loss
=
True
self
.
loss_func_rec
=
nn
.
CrossEntropyLoss
(
weight
=
None
,
reduction
=
'none'
)
def
forward
(
self
,
predicts
,
batch
):
targets
=
batch
[
1
].
astype
(
"int64"
)
label_lengths
=
batch
[
2
].
astype
(
'int64'
)
sem_target
=
batch
[
3
].
astype
(
'float32'
)
embedding_vectors
=
predicts
[
'embedding_vectors'
]
rec_pred
=
predicts
[
'rec_pred'
]
if
not
self
.
is_cosin_loss
:
sem_loss
=
paddle
.
sum
(
self
.
loss_sem
(
embedding_vectors
,
sem_target
))
else
:
label_target
=
paddle
.
ones
([
embedding_vectors
.
shape
[
0
]])
sem_loss
=
paddle
.
sum
(
self
.
loss_sem
(
embedding_vectors
,
sem_target
,
label_target
))
# rec loss
batch_size
,
def_max_length
=
targets
.
shape
[
0
],
targets
.
shape
[
1
]
mask
=
paddle
.
zeros
([
batch_size
,
def_max_length
])
for
i
in
range
(
batch_size
):
mask
[
i
,
:
label_lengths
[
i
]]
=
1
mask
=
paddle
.
cast
(
mask
,
"float32"
)
max_length
=
max
(
label_lengths
)
assert
max_length
==
rec_pred
.
shape
[
1
]
targets
=
targets
[:,
:
max_length
]
mask
=
mask
[:,
:
max_length
]
rec_pred
=
paddle
.
reshape
(
rec_pred
,
[
-
1
,
rec_pred
.
shape
[
2
]])
input
=
nn
.
functional
.
log_softmax
(
rec_pred
,
axis
=
1
)
targets
=
paddle
.
reshape
(
targets
,
[
-
1
,
1
])
mask
=
paddle
.
reshape
(
mask
,
[
-
1
,
1
])
output
=
-
paddle
.
index_sample
(
input
,
index
=
targets
)
*
mask
output
=
paddle
.
sum
(
output
)
if
self
.
sequence_normalize
:
output
=
output
/
paddle
.
sum
(
mask
)
if
self
.
sample_normalize
:
output
=
output
/
batch_size
loss
=
output
+
sem_loss
*
0.1
return
{
'loss'
:
loss
}
ppocr/losses/rec_ctc_loss.py
View file @
006d84bf
...
@@ -21,16 +21,24 @@ from paddle import nn
...
@@ -21,16 +21,24 @@ from paddle import nn
class
CTCLoss
(
nn
.
Layer
):
class
CTCLoss
(
nn
.
Layer
):
def
__init__
(
self
,
**
kwargs
):
def
__init__
(
self
,
use_focal_loss
=
False
,
**
kwargs
):
super
(
CTCLoss
,
self
).
__init__
()
super
(
CTCLoss
,
self
).
__init__
()
self
.
loss_func
=
nn
.
CTCLoss
(
blank
=
0
,
reduction
=
'none'
)
self
.
loss_func
=
nn
.
CTCLoss
(
blank
=
0
,
reduction
=
'none'
)
self
.
use_focal_loss
=
use_focal_loss
def
forward
(
self
,
predicts
,
batch
):
def
forward
(
self
,
predicts
,
batch
):
if
isinstance
(
predicts
,
(
list
,
tuple
)):
predicts
=
predicts
[
-
1
]
predicts
=
predicts
.
transpose
((
1
,
0
,
2
))
predicts
=
predicts
.
transpose
((
1
,
0
,
2
))
N
,
B
,
_
=
predicts
.
shape
N
,
B
,
_
=
predicts
.
shape
preds_lengths
=
paddle
.
to_tensor
([
N
]
*
B
,
dtype
=
'int64'
)
preds_lengths
=
paddle
.
to_tensor
([
N
]
*
B
,
dtype
=
'int64'
)
labels
=
batch
[
1
].
astype
(
"int32"
)
labels
=
batch
[
1
].
astype
(
"int32"
)
label_lengths
=
batch
[
2
].
astype
(
'int64'
)
label_lengths
=
batch
[
2
].
astype
(
'int64'
)
loss
=
self
.
loss_func
(
predicts
,
labels
,
preds_lengths
,
label_lengths
)
loss
=
self
.
loss_func
(
predicts
,
labels
,
preds_lengths
,
label_lengths
)
loss
=
loss
.
mean
()
# sum
if
self
.
use_focal_loss
:
weight
=
paddle
.
exp
(
-
loss
)
weight
=
paddle
.
subtract
(
paddle
.
to_tensor
([
1.0
]),
weight
)
weight
=
paddle
.
square
(
weight
)
loss
=
paddle
.
multiply
(
loss
,
weight
)
loss
=
loss
.
mean
()
return
{
'loss'
:
loss
}
return
{
'loss'
:
loss
}
ppocr/losses/rec_enhanced_ctc_loss.py
0 → 100644
View file @
006d84bf
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
paddle
from
paddle
import
nn
from
.ace_loss
import
ACELoss
from
.center_loss
import
CenterLoss
from
.rec_ctc_loss
import
CTCLoss
class
EnhancedCTCLoss
(
nn
.
Layer
):
def
__init__
(
self
,
use_focal_loss
=
False
,
use_ace_loss
=
False
,
ace_loss_weight
=
0.1
,
use_center_loss
=
False
,
center_loss_weight
=
0.05
,
num_classes
=
6625
,
feat_dim
=
96
,
init_center
=
False
,
center_file_path
=
None
,
**
kwargs
):
super
(
EnhancedCTCLoss
,
self
).
__init__
()
self
.
ctc_loss_func
=
CTCLoss
(
use_focal_loss
=
use_focal_loss
)
self
.
use_ace_loss
=
False
if
use_ace_loss
:
self
.
use_ace_loss
=
use_ace_loss
self
.
ace_loss_func
=
ACELoss
()
self
.
ace_loss_weight
=
ace_loss_weight
self
.
use_center_loss
=
False
if
use_center_loss
:
self
.
use_center_loss
=
use_center_loss
self
.
center_loss_func
=
CenterLoss
(
num_classes
=
num_classes
,
feat_dim
=
feat_dim
,
init_center
=
init_center
,
center_file_path
=
center_file_path
)
self
.
center_loss_weight
=
center_loss_weight
def
__call__
(
self
,
predicts
,
batch
):
loss
=
self
.
ctc_loss_func
(
predicts
,
batch
)[
"loss"
]
if
self
.
use_center_loss
:
center_loss
=
self
.
center_loss_func
(
predicts
,
batch
)[
"loss_center"
]
*
self
.
center_loss_weight
loss
=
loss
+
center_loss
if
self
.
use_ace_loss
:
ace_loss
=
self
.
ace_loss_func
(
predicts
,
batch
)[
"loss_ace"
]
*
self
.
ace_loss_weight
loss
=
loss
+
ace_loss
return
{
'enhanced_ctc_loss'
:
loss
}
ppocr/losses/rec_nrtr_loss.py
0 → 100644
View file @
006d84bf
import
paddle
from
paddle
import
nn
import
paddle.nn.functional
as
F
class
NRTRLoss
(
nn
.
Layer
):
def
__init__
(
self
,
smoothing
=
True
,
**
kwargs
):
super
(
NRTRLoss
,
self
).
__init__
()
self
.
loss_func
=
nn
.
CrossEntropyLoss
(
reduction
=
'mean'
,
ignore_index
=
0
)
self
.
smoothing
=
smoothing
def
forward
(
self
,
pred
,
batch
):
pred
=
pred
.
reshape
([
-
1
,
pred
.
shape
[
2
]])
max_len
=
batch
[
2
].
max
()
tgt
=
batch
[
1
][:,
1
:
2
+
max_len
]
tgt
=
tgt
.
reshape
([
-
1
])
if
self
.
smoothing
:
eps
=
0.1
n_class
=
pred
.
shape
[
1
]
one_hot
=
F
.
one_hot
(
tgt
,
pred
.
shape
[
1
])
one_hot
=
one_hot
*
(
1
-
eps
)
+
(
1
-
one_hot
)
*
eps
/
(
n_class
-
1
)
log_prb
=
F
.
log_softmax
(
pred
,
axis
=
1
)
non_pad_mask
=
paddle
.
not_equal
(
tgt
,
paddle
.
zeros
(
tgt
.
shape
,
dtype
=
'int64'
))
loss
=
-
(
one_hot
*
log_prb
).
sum
(
axis
=
1
)
loss
=
loss
.
masked_select
(
non_pad_mask
).
mean
()
else
:
loss
=
self
.
loss_func
(
pred
,
tgt
)
return
{
'loss'
:
loss
}
ppocr/losses/rec_sar_loss.py
0 → 100644
View file @
006d84bf
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
paddle
from
paddle
import
nn
class
SARLoss
(
nn
.
Layer
):
def
__init__
(
self
,
**
kwargs
):
super
(
SARLoss
,
self
).
__init__
()
self
.
loss_func
=
paddle
.
nn
.
loss
.
CrossEntropyLoss
(
reduction
=
"mean"
,
ignore_index
=
92
)
def
forward
(
self
,
predicts
,
batch
):
predict
=
predicts
[:,
:
-
1
,
:]
# ignore last index of outputs to be in same seq_len with targets
label
=
batch
[
1
].
astype
(
"int64"
)[:,
1
:]
# ignore first index of target in loss calculation
batch_size
,
num_steps
,
num_classes
=
predict
.
shape
[
0
],
predict
.
shape
[
1
],
predict
.
shape
[
2
]
assert
len
(
label
.
shape
)
==
len
(
list
(
predict
.
shape
))
-
1
,
\
"The target's shape and inputs's shape is [N, d] and [N, num_steps]"
inputs
=
paddle
.
reshape
(
predict
,
[
-
1
,
num_classes
])
targets
=
paddle
.
reshape
(
label
,
[
-
1
])
loss
=
self
.
loss_func
(
inputs
,
targets
)
return
{
'loss'
:
loss
}
ppocr/metrics/eval_det_iou.py
View file @
006d84bf
...
@@ -169,21 +169,10 @@ class DetectionIoUEvaluator(object):
...
@@ -169,21 +169,10 @@ class DetectionIoUEvaluator(object):
numGlobalCareDet
+=
numDetCare
numGlobalCareDet
+=
numDetCare
perSampleMetrics
=
{
perSampleMetrics
=
{
'precision'
:
precision
,
'recall'
:
recall
,
'hmean'
:
hmean
,
'pairs'
:
pairs
,
'iouMat'
:
[]
if
len
(
detPols
)
>
100
else
iouMat
.
tolist
(),
'gtPolPoints'
:
gtPolPoints
,
'detPolPoints'
:
detPolPoints
,
'gtCare'
:
numGtCare
,
'gtCare'
:
numGtCare
,
'detCare'
:
numDetCare
,
'detCare'
:
numDetCare
,
'gtDontCare'
:
gtDontCarePolsNum
,
'detDontCare'
:
detDontCarePolsNum
,
'detMatched'
:
detMatched
,
'detMatched'
:
detMatched
,
'evaluationLog'
:
evaluationLog
}
}
return
perSampleMetrics
return
perSampleMetrics
def
combine_results
(
self
,
results
):
def
combine_results
(
self
,
results
):
...
...
ppocr/metrics/rec_metric.py
View file @
006d84bf
...
@@ -13,13 +13,20 @@
...
@@ -13,13 +13,20 @@
# limitations under the License.
# limitations under the License.
import
Levenshtein
import
Levenshtein
import
string
class
RecMetric
(
object
):
class
RecMetric
(
object
):
def
__init__
(
self
,
main_indicator
=
'acc'
,
**
kwargs
):
def
__init__
(
self
,
main_indicator
=
'acc'
,
is_filter
=
False
,
**
kwargs
):
self
.
main_indicator
=
main_indicator
self
.
main_indicator
=
main_indicator
self
.
is_filter
=
is_filter
self
.
reset
()
self
.
reset
()
def
_normalize_text
(
self
,
text
):
text
=
''
.
join
(
filter
(
lambda
x
:
x
in
(
string
.
digits
+
string
.
ascii_letters
),
text
))
return
text
.
lower
()
def
__call__
(
self
,
pred_label
,
*
args
,
**
kwargs
):
def
__call__
(
self
,
pred_label
,
*
args
,
**
kwargs
):
preds
,
labels
=
pred_label
preds
,
labels
=
pred_label
correct_num
=
0
correct_num
=
0
...
@@ -28,6 +35,9 @@ class RecMetric(object):
...
@@ -28,6 +35,9 @@ class RecMetric(object):
for
(
pred
,
pred_conf
),
(
target
,
_
)
in
zip
(
preds
,
labels
):
for
(
pred
,
pred_conf
),
(
target
,
_
)
in
zip
(
preds
,
labels
):
pred
=
pred
.
replace
(
" "
,
""
)
pred
=
pred
.
replace
(
" "
,
""
)
target
=
target
.
replace
(
" "
,
""
)
target
=
target
.
replace
(
" "
,
""
)
if
self
.
is_filter
:
pred
=
self
.
_normalize_text
(
pred
)
target
=
self
.
_normalize_text
(
target
)
norm_edit_dis
+=
Levenshtein
.
distance
(
pred
,
target
)
/
max
(
norm_edit_dis
+=
Levenshtein
.
distance
(
pred
,
target
)
/
max
(
len
(
pred
),
len
(
target
),
1
)
len
(
pred
),
len
(
target
),
1
)
if
pred
==
target
:
if
pred
==
target
:
...
...
ppocr/modeling/architectures/base_model.py
View file @
006d84bf
...
@@ -14,7 +14,6 @@
...
@@ -14,7 +14,6 @@
from
__future__
import
absolute_import
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
division
from
__future__
import
print_function
from
__future__
import
print_function
from
paddle
import
nn
from
paddle
import
nn
from
ppocr.modeling.transforms
import
build_transform
from
ppocr.modeling.transforms
import
build_transform
from
ppocr.modeling.backbones
import
build_backbone
from
ppocr.modeling.backbones
import
build_backbone
...
...
ppocr/modeling/backbones/__init__.py
View file @
006d84bf
...
@@ -26,8 +26,12 @@ def build_backbone(config, model_type):
...
@@ -26,8 +26,12 @@ def build_backbone(config, model_type):
from
.rec_resnet_vd
import
ResNet
from
.rec_resnet_vd
import
ResNet
from
.rec_resnet_fpn
import
ResNetFPN
from
.rec_resnet_fpn
import
ResNetFPN
from
.rec_mv1_enhance
import
MobileNetV1Enhance
from
.rec_mv1_enhance
import
MobileNetV1Enhance
from
.rec_nrtr_mtb
import
MTB
from
.rec_resnet_31
import
ResNet31
from
.rec_resnet_aster
import
ResNet_ASTER
support_dict
=
[
support_dict
=
[
"MobileNetV1Enhance"
,
"MobileNetV3"
,
"ResNet"
,
"ResNetFPN"
'MobileNetV1Enhance'
,
'MobileNetV3'
,
'ResNet'
,
'ResNetFPN'
,
'MTB'
,
"ResNet31"
,
"ResNet_ASTER"
]
]
elif
model_type
==
"e2e"
:
elif
model_type
==
"e2e"
:
from
.e2e_resnet_vd_pg
import
ResNet
from
.e2e_resnet_vd_pg
import
ResNet
...
...
ppocr/modeling/backbones/rec_mv1_enhance.py
View file @
006d84bf
# copyright (c) 202
0
PaddlePaddle Authors. All Rights Reserve.
# copyright (c) 202
1
PaddlePaddle Authors. All Rights Reserve.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# you may not use this file except in compliance with the License.
...
@@ -16,26 +16,17 @@ from __future__ import absolute_import
...
@@ -16,26 +16,17 @@ from __future__ import absolute_import
from
__future__
import
division
from
__future__
import
division
from
__future__
import
print_function
from
__future__
import
print_function
import
numpy
as
np
import
paddle
from
paddle
import
ParamAttr
import
paddle.nn
as
nn
import
paddle.nn.functional
as
F
from
paddle.nn
import
Conv2D
,
BatchNorm
,
Linear
,
Dropout
from
paddle.nn
import
AdaptiveAvgPool2D
,
MaxPool2D
,
AvgPool2D
from
paddle.nn.initializer
import
KaimingNormal
import
math
import
math
import
numpy
as
np
import
numpy
as
np
import
paddle
import
paddle
from
paddle
import
ParamAttr
,
reshape
,
transpose
,
concat
,
split
from
paddle
import
ParamAttr
,
reshape
,
transpose
import
paddle.nn
as
nn
import
paddle.nn
as
nn
import
paddle.nn.functional
as
F
import
paddle.nn.functional
as
F
from
paddle.nn
import
Conv2D
,
BatchNorm
,
Linear
,
Dropout
from
paddle.nn
import
Conv2D
,
BatchNorm
,
Linear
,
Dropout
from
paddle.nn
import
AdaptiveAvgPool2D
,
MaxPool2D
,
AvgPool2D
from
paddle.nn
import
AdaptiveAvgPool2D
,
MaxPool2D
,
AvgPool2D
from
paddle.nn.initializer
import
KaimingNormal
from
paddle.nn.initializer
import
KaimingNormal
import
math
from
paddle.nn.functional
import
hardswish
,
hardsigmoid
from
paddle.regularizer
import
L2Decay
from
paddle.regularizer
import
L2Decay
from
paddle.nn.functional
import
hardswish
,
hardsigmoid
class
ConvBNLayer
(
nn
.
Layer
):
class
ConvBNLayer
(
nn
.
Layer
):
...
...
ppocr/modeling/backbones/rec_nrtr_mtb.py
0 → 100644
View file @
006d84bf
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
paddle
import
nn
import
paddle
class
MTB
(
nn
.
Layer
):
def
__init__
(
self
,
cnn_num
,
in_channels
):
super
(
MTB
,
self
).
__init__
()
self
.
block
=
nn
.
Sequential
()
self
.
out_channels
=
in_channels
self
.
cnn_num
=
cnn_num
if
self
.
cnn_num
==
2
:
for
i
in
range
(
self
.
cnn_num
):
self
.
block
.
add_sublayer
(
'conv_{}'
.
format
(
i
),
nn
.
Conv2D
(
in_channels
=
in_channels
if
i
==
0
else
32
*
(
2
**
(
i
-
1
)),
out_channels
=
32
*
(
2
**
i
),
kernel_size
=
3
,
stride
=
2
,
padding
=
1
))
self
.
block
.
add_sublayer
(
'relu_{}'
.
format
(
i
),
nn
.
ReLU
())
self
.
block
.
add_sublayer
(
'bn_{}'
.
format
(
i
),
nn
.
BatchNorm2D
(
32
*
(
2
**
i
)))
def
forward
(
self
,
images
):
x
=
self
.
block
(
images
)
if
self
.
cnn_num
==
2
:
# (b, w, h, c)
x
=
paddle
.
transpose
(
x
,
[
0
,
3
,
2
,
1
])
x_shape
=
paddle
.
shape
(
x
)
x
=
paddle
.
reshape
(
x
,
[
x_shape
[
0
],
x_shape
[
1
],
x_shape
[
2
]
*
x_shape
[
3
]])
return
x
ppocr/modeling/backbones/rec_resnet_31.py
0 → 100644
View file @
006d84bf
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
paddle
from
paddle
import
ParamAttr
import
paddle.nn
as
nn
import
paddle.nn.functional
as
F
import
numpy
as
np
__all__
=
[
"ResNet31"
]
def
conv3x3
(
in_channel
,
out_channel
,
stride
=
1
):
return
nn
.
Conv2D
(
in_channel
,
out_channel
,
kernel_size
=
3
,
stride
=
stride
,
padding
=
1
,
bias_attr
=
False
)
class
BasicBlock
(
nn
.
Layer
):
expansion
=
1
def
__init__
(
self
,
in_channels
,
channels
,
stride
=
1
,
downsample
=
False
):
super
().
__init__
()
self
.
conv1
=
conv3x3
(
in_channels
,
channels
,
stride
)
self
.
bn1
=
nn
.
BatchNorm2D
(
channels
)
self
.
relu
=
nn
.
ReLU
()
self
.
conv2
=
conv3x3
(
channels
,
channels
)
self
.
bn2
=
nn
.
BatchNorm2D
(
channels
)
self
.
downsample
=
downsample
if
downsample
:
self
.
downsample
=
nn
.
Sequential
(
nn
.
Conv2D
(
in_channels
,
channels
*
self
.
expansion
,
1
,
stride
,
bias_attr
=
False
),
nn
.
BatchNorm2D
(
channels
*
self
.
expansion
),
)
else
:
self
.
downsample
=
nn
.
Sequential
()
self
.
stride
=
stride
def
forward
(
self
,
x
):
residual
=
x
out
=
self
.
conv1
(
x
)
out
=
self
.
bn1
(
out
)
out
=
self
.
relu
(
out
)
out
=
self
.
conv2
(
out
)
out
=
self
.
bn2
(
out
)
if
self
.
downsample
:
residual
=
self
.
downsample
(
x
)
out
+=
residual
out
=
self
.
relu
(
out
)
return
out
class
ResNet31
(
nn
.
Layer
):
'''
Args:
in_channels (int): Number of channels of input image tensor.
layers (list[int]): List of BasicBlock number for each stage.
channels (list[int]): List of out_channels of Conv2d layer.
out_indices (None | Sequence[int]): Indices of output stages.
last_stage_pool (bool): If True, add `MaxPool2d` layer to last stage.
'''
def
__init__
(
self
,
in_channels
=
3
,
layers
=
[
1
,
2
,
5
,
3
],
channels
=
[
64
,
128
,
256
,
256
,
512
,
512
,
512
],
out_indices
=
None
,
last_stage_pool
=
False
):
super
(
ResNet31
,
self
).
__init__
()
assert
isinstance
(
in_channels
,
int
)
assert
isinstance
(
last_stage_pool
,
bool
)
self
.
out_indices
=
out_indices
self
.
last_stage_pool
=
last_stage_pool
# conv 1 (Conv Conv)
self
.
conv1_1
=
nn
.
Conv2D
(
in_channels
,
channels
[
0
],
kernel_size
=
3
,
stride
=
1
,
padding
=
1
)
self
.
bn1_1
=
nn
.
BatchNorm2D
(
channels
[
0
])
self
.
relu1_1
=
nn
.
ReLU
()
self
.
conv1_2
=
nn
.
Conv2D
(
channels
[
0
],
channels
[
1
],
kernel_size
=
3
,
stride
=
1
,
padding
=
1
)
self
.
bn1_2
=
nn
.
BatchNorm2D
(
channels
[
1
])
self
.
relu1_2
=
nn
.
ReLU
()
# conv 2 (Max-pooling, Residual block, Conv)
self
.
pool2
=
nn
.
MaxPool2D
(
kernel_size
=
2
,
stride
=
2
,
padding
=
0
,
ceil_mode
=
True
)
self
.
block2
=
self
.
_make_layer
(
channels
[
1
],
channels
[
2
],
layers
[
0
])
self
.
conv2
=
nn
.
Conv2D
(
channels
[
2
],
channels
[
2
],
kernel_size
=
3
,
stride
=
1
,
padding
=
1
)
self
.
bn2
=
nn
.
BatchNorm2D
(
channels
[
2
])
self
.
relu2
=
nn
.
ReLU
()
# conv 3 (Max-pooling, Residual block, Conv)
self
.
pool3
=
nn
.
MaxPool2D
(
kernel_size
=
2
,
stride
=
2
,
padding
=
0
,
ceil_mode
=
True
)
self
.
block3
=
self
.
_make_layer
(
channels
[
2
],
channels
[
3
],
layers
[
1
])
self
.
conv3
=
nn
.
Conv2D
(
channels
[
3
],
channels
[
3
],
kernel_size
=
3
,
stride
=
1
,
padding
=
1
)
self
.
bn3
=
nn
.
BatchNorm2D
(
channels
[
3
])
self
.
relu3
=
nn
.
ReLU
()
# conv 4 (Max-pooling, Residual block, Conv)
self
.
pool4
=
nn
.
MaxPool2D
(
kernel_size
=
(
2
,
1
),
stride
=
(
2
,
1
),
padding
=
0
,
ceil_mode
=
True
)
self
.
block4
=
self
.
_make_layer
(
channels
[
3
],
channels
[
4
],
layers
[
2
])
self
.
conv4
=
nn
.
Conv2D
(
channels
[
4
],
channels
[
4
],
kernel_size
=
3
,
stride
=
1
,
padding
=
1
)
self
.
bn4
=
nn
.
BatchNorm2D
(
channels
[
4
])
self
.
relu4
=
nn
.
ReLU
()
# conv 5 ((Max-pooling), Residual block, Conv)
self
.
pool5
=
None
if
self
.
last_stage_pool
:
self
.
pool5
=
nn
.
MaxPool2D
(
kernel_size
=
2
,
stride
=
2
,
padding
=
0
,
ceil_mode
=
True
)
self
.
block5
=
self
.
_make_layer
(
channels
[
4
],
channels
[
5
],
layers
[
3
])
self
.
conv5
=
nn
.
Conv2D
(
channels
[
5
],
channels
[
5
],
kernel_size
=
3
,
stride
=
1
,
padding
=
1
)
self
.
bn5
=
nn
.
BatchNorm2D
(
channels
[
5
])
self
.
relu5
=
nn
.
ReLU
()
self
.
out_channels
=
channels
[
-
1
]
def
_make_layer
(
self
,
input_channels
,
output_channels
,
blocks
):
layers
=
[]
for
_
in
range
(
blocks
):
downsample
=
None
if
input_channels
!=
output_channels
:
downsample
=
nn
.
Sequential
(
nn
.
Conv2D
(
input_channels
,
output_channels
,
kernel_size
=
1
,
stride
=
1
,
bias_attr
=
False
),
nn
.
BatchNorm2D
(
output_channels
),
)
layers
.
append
(
BasicBlock
(
input_channels
,
output_channels
,
downsample
=
downsample
))
input_channels
=
output_channels
return
nn
.
Sequential
(
*
layers
)
def
forward
(
self
,
x
):
x
=
self
.
conv1_1
(
x
)
x
=
self
.
bn1_1
(
x
)
x
=
self
.
relu1_1
(
x
)
x
=
self
.
conv1_2
(
x
)
x
=
self
.
bn1_2
(
x
)
x
=
self
.
relu1_2
(
x
)
outs
=
[]
for
i
in
range
(
4
):
layer_index
=
i
+
2
pool_layer
=
getattr
(
self
,
f
'pool
{
layer_index
}
'
)
block_layer
=
getattr
(
self
,
f
'block
{
layer_index
}
'
)
conv_layer
=
getattr
(
self
,
f
'conv
{
layer_index
}
'
)
bn_layer
=
getattr
(
self
,
f
'bn
{
layer_index
}
'
)
relu_layer
=
getattr
(
self
,
f
'relu
{
layer_index
}
'
)
if
pool_layer
is
not
None
:
x
=
pool_layer
(
x
)
x
=
block_layer
(
x
)
x
=
conv_layer
(
x
)
x
=
bn_layer
(
x
)
x
=
relu_layer
(
x
)
outs
.
append
(
x
)
if
self
.
out_indices
is
not
None
:
return
tuple
([
outs
[
i
]
for
i
in
self
.
out_indices
])
return
x
ppocr/modeling/backbones/rec_resnet_aster.py
0 → 100644
View file @
006d84bf
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
paddle
import
paddle.nn
as
nn
import
sys
import
math
def
conv3x3
(
in_planes
,
out_planes
,
stride
=
1
):
"""3x3 convolution with padding"""
return
nn
.
Conv2D
(
in_planes
,
out_planes
,
kernel_size
=
3
,
stride
=
stride
,
padding
=
1
,
bias_attr
=
False
)
def
conv1x1
(
in_planes
,
out_planes
,
stride
=
1
):
"""1x1 convolution"""
return
nn
.
Conv2D
(
in_planes
,
out_planes
,
kernel_size
=
1
,
stride
=
stride
,
bias_attr
=
False
)
def
get_sinusoid_encoding
(
n_position
,
feat_dim
,
wave_length
=
10000
):
# [n_position]
positions
=
paddle
.
arange
(
0
,
n_position
)
# [feat_dim]
dim_range
=
paddle
.
arange
(
0
,
feat_dim
)
dim_range
=
paddle
.
pow
(
wave_length
,
2
*
(
dim_range
//
2
)
/
feat_dim
)
# [n_position, feat_dim]
angles
=
paddle
.
unsqueeze
(
positions
,
axis
=
1
)
/
paddle
.
unsqueeze
(
dim_range
,
axis
=
0
)
angles
=
paddle
.
cast
(
angles
,
"float32"
)
angles
[:,
0
::
2
]
=
paddle
.
sin
(
angles
[:,
0
::
2
])
angles
[:,
1
::
2
]
=
paddle
.
cos
(
angles
[:,
1
::
2
])
return
angles
class
AsterBlock
(
nn
.
Layer
):
def
__init__
(
self
,
inplanes
,
planes
,
stride
=
1
,
downsample
=
None
):
super
(
AsterBlock
,
self
).
__init__
()
self
.
conv1
=
conv1x1
(
inplanes
,
planes
,
stride
)
self
.
bn1
=
nn
.
BatchNorm2D
(
planes
)
self
.
relu
=
nn
.
ReLU
()
self
.
conv2
=
conv3x3
(
planes
,
planes
)
self
.
bn2
=
nn
.
BatchNorm2D
(
planes
)
self
.
downsample
=
downsample
self
.
stride
=
stride
def
forward
(
self
,
x
):
residual
=
x
out
=
self
.
conv1
(
x
)
out
=
self
.
bn1
(
out
)
out
=
self
.
relu
(
out
)
out
=
self
.
conv2
(
out
)
out
=
self
.
bn2
(
out
)
if
self
.
downsample
is
not
None
:
residual
=
self
.
downsample
(
x
)
out
+=
residual
out
=
self
.
relu
(
out
)
return
out
class
ResNet_ASTER
(
nn
.
Layer
):
"""For aster or crnn"""
def
__init__
(
self
,
with_lstm
=
True
,
n_group
=
1
,
in_channels
=
3
):
super
(
ResNet_ASTER
,
self
).
__init__
()
self
.
with_lstm
=
with_lstm
self
.
n_group
=
n_group
self
.
layer0
=
nn
.
Sequential
(
nn
.
Conv2D
(
in_channels
,
32
,
kernel_size
=
(
3
,
3
),
stride
=
1
,
padding
=
1
,
bias_attr
=
False
),
nn
.
BatchNorm2D
(
32
),
nn
.
ReLU
())
self
.
inplanes
=
32
self
.
layer1
=
self
.
_make_layer
(
32
,
3
,
[
2
,
2
])
# [16, 50]
self
.
layer2
=
self
.
_make_layer
(
64
,
4
,
[
2
,
2
])
# [8, 25]
self
.
layer3
=
self
.
_make_layer
(
128
,
6
,
[
2
,
1
])
# [4, 25]
self
.
layer4
=
self
.
_make_layer
(
256
,
6
,
[
2
,
1
])
# [2, 25]
self
.
layer5
=
self
.
_make_layer
(
512
,
3
,
[
2
,
1
])
# [1, 25]
if
with_lstm
:
self
.
rnn
=
nn
.
LSTM
(
512
,
256
,
direction
=
"bidirect"
,
num_layers
=
2
)
self
.
out_channels
=
2
*
256
else
:
self
.
out_channels
=
512
def
_make_layer
(
self
,
planes
,
blocks
,
stride
):
downsample
=
None
if
stride
!=
[
1
,
1
]
or
self
.
inplanes
!=
planes
:
downsample
=
nn
.
Sequential
(
conv1x1
(
self
.
inplanes
,
planes
,
stride
),
nn
.
BatchNorm2D
(
planes
))
layers
=
[]
layers
.
append
(
AsterBlock
(
self
.
inplanes
,
planes
,
stride
,
downsample
))
self
.
inplanes
=
planes
for
_
in
range
(
1
,
blocks
):
layers
.
append
(
AsterBlock
(
self
.
inplanes
,
planes
))
return
nn
.
Sequential
(
*
layers
)
def
forward
(
self
,
x
):
x0
=
self
.
layer0
(
x
)
x1
=
self
.
layer1
(
x0
)
x2
=
self
.
layer2
(
x1
)
x3
=
self
.
layer3
(
x2
)
x4
=
self
.
layer4
(
x3
)
x5
=
self
.
layer5
(
x4
)
cnn_feat
=
x5
.
squeeze
(
2
)
# [N, c, w]
cnn_feat
=
paddle
.
transpose
(
cnn_feat
,
perm
=
[
0
,
2
,
1
])
if
self
.
with_lstm
:
rnn_feat
,
_
=
self
.
rnn
(
cnn_feat
)
return
rnn_feat
else
:
return
cnn_feat
ppocr/modeling/heads/__init__.py
View file @
006d84bf
...
@@ -20,18 +20,24 @@ def build_head(config):
...
@@ -20,18 +20,24 @@ def build_head(config):
from
.det_db_head
import
DBHead
from
.det_db_head
import
DBHead
from
.det_east_head
import
EASTHead
from
.det_east_head
import
EASTHead
from
.det_sast_head
import
SASTHead
from
.det_sast_head
import
SASTHead
from
.det_pse_head
import
PSEHead
from
.e2e_pg_head
import
PGHead
from
.e2e_pg_head
import
PGHead
# rec head
# rec head
from
.rec_ctc_head
import
CTCHead
from
.rec_ctc_head
import
CTCHead
from
.rec_att_head
import
AttentionHead
from
.rec_att_head
import
AttentionHead
from
.rec_srn_head
import
SRNHead
from
.rec_srn_head
import
SRNHead
from
.rec_nrtr_head
import
Transformer
from
.rec_sar_head
import
SARHead
from
.rec_aster_head
import
AsterHead
# cls head
# cls head
from
.cls_head
import
ClsHead
from
.cls_head
import
ClsHead
support_dict
=
[
support_dict
=
[
'DBHead'
,
'EASTHead'
,
'SASTHead'
,
'CTCHead'
,
'ClsHead'
,
'AttentionHead'
,
'DBHead'
,
'PSEHead'
,
'EASTHead'
,
'SASTHead'
,
'CTCHead'
,
'ClsHead'
,
'SRNHead'
,
'PGHead'
,
'TableAttentionHead'
]
'AttentionHead'
,
'SRNHead'
,
'PGHead'
,
'Transformer'
,
'TableAttentionHead'
,
'SARHead'
,
'AsterHead'
]
#table head
#table head
from
.table_att_head
import
TableAttentionHead
from
.table_att_head
import
TableAttentionHead
...
...
ppocr/modeling/heads/det_pse_head.py
0 → 100644
View file @
006d84bf
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
paddle
import
nn
class PSEHead(nn.Layer):
    """PSENet detection head.

    Projects the fused backbone feature to ``out_channels`` prediction
    maps (returned under the key ``'maps'``).
    """

    def __init__(self, in_channels, hidden_dim=256, out_channels=7, **kwargs):
        super(PSEHead, self).__init__()
        # 3x3 conv mixes channels while preserving spatial size (padding=1).
        self.conv1 = nn.Conv2D(
            in_channels, hidden_dim, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2D(hidden_dim)
        self.relu1 = nn.ReLU()
        # 1x1 conv produces the final per-pixel prediction maps.
        self.conv2 = nn.Conv2D(
            hidden_dim, out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, x, **kwargs):
        hidden = self.bn1(self.conv1(x))
        hidden = self.relu1(hidden)
        maps = self.conv2(hidden)
        return {'maps': maps}
ppocr/modeling/heads/multiheadAttention.py
0 → 100755
View file @
006d84bf
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
paddle
from
paddle
import
nn
import
paddle.nn.functional
as
F
from
paddle.nn
import
Linear
from
paddle.nn.initializer
import
XavierUniform
as
xavier_uniform_
from
paddle.nn.initializer
import
Constant
as
constant_
from
paddle.nn.initializer
import
XavierNormal
as
xavier_normal_
zeros_
=
constant_
(
value
=
0.
)
ones_
=
constant_
(
value
=
1.
)
class MultiheadAttention(nn.Layer):
    """Allows the model to jointly attend to information
    from different representation subspaces.
    See reference: Attention Is All You Need

    .. math::
        \\text{MultiHead}(Q, K, V) = \\text{Concat}(head_1, \\dots, head_h)W^O
        \\text{where } head_i = \\text{Attention}(QW_i^Q, KW_i^K, VW_i^V)

    Args:
        embed_dim: total dimension of the model
        num_heads: parallel attention layers, or heads
    """

    def __init__(self,
                 embed_dim,
                 num_heads,
                 dropout=0.,
                 bias=True,
                 add_bias_kv=False,
                 add_zero_attn=False):
        # NOTE(review): add_bias_kv and add_zero_attn are accepted but never
        # used below -- presumably kept for API parity with torch; confirm.
        super(MultiheadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        # Per-head feature width; must divide evenly.
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
        # 1/sqrt(head_dim) scaling applied to queries before the dot product.
        self.scaling = self.head_dim**-0.5
        self.out_proj = Linear(embed_dim, embed_dim, bias_attr=bias)
        self._reset_parameters()
        # The Q/K/V input projections are implemented as 1x1 convolutions
        # (see _in_proj_q/_in_proj_k/_in_proj_v) instead of Linear layers.
        self.conv1 = paddle.nn.Conv2D(
            in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))
        self.conv2 = paddle.nn.Conv2D(
            in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))
        self.conv3 = paddle.nn.Conv2D(
            in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))

    def _reset_parameters(self):
        # Xavier-initialize only the output projection; the conv projections
        # keep their default initialization.
        xavier_uniform_(self.out_proj.weight)

    def forward(self,
                query,
                key,
                value,
                key_padding_mask=None,
                incremental_state=None,
                attn_mask=None):
        """
        Inputs of forward function
            query: [target length, batch size, embed dim]
            key: [sequence length, batch size, embed dim]
            value: [sequence length, batch size, embed dim]
            key_padding_mask: if True, mask padding based on batch size
            incremental_state: if provided, previous time steps are cashed
            need_weights: output attn_output_weights
            static_kv: key and value are static

        Outputs of forward function
            attn_output: [target length, batch size, embed dim]
            attn_output_weights: [batch size, target length, sequence length]
        """
        # NOTE(review): incremental_state is accepted but never used below.
        q_shape = paddle.shape(query)      # [tgt_len, batch, embed_dim]
        src_shape = paddle.shape(key)      # [src_len, batch, embed_dim]
        q = self._in_proj_q(query)
        k = self._in_proj_k(key)
        v = self._in_proj_v(value)
        # Scale queries before the dot product (standard 1/sqrt(d_k)).
        q *= self.scaling
        # Split heads: [len, batch, embed] -> [batch, heads, len, head_dim].
        q = paddle.transpose(
            paddle.reshape(
                q, [q_shape[0], q_shape[1], self.num_heads, self.head_dim]),
            [1, 2, 0, 3])
        k = paddle.transpose(
            paddle.reshape(
                k, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]),
            [1, 2, 0, 3])
        v = paddle.transpose(
            paddle.reshape(
                v, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]),
            [1, 2, 0, 3])
        if key_padding_mask is not None:
            # Mask is expected as [batch, src_len].
            assert key_padding_mask.shape[0] == q_shape[1]
            assert key_padding_mask.shape[1] == src_shape[0]
        # Raw attention scores: [batch, heads, tgt_len, src_len].
        attn_output_weights = paddle.matmul(q,
                                            paddle.transpose(k, [0, 1, 3, 2]))
        if attn_mask is not None:
            # Broadcast the [tgt_len, src_len] mask over batch and heads.
            attn_mask = paddle.unsqueeze(paddle.unsqueeze(attn_mask, 0), 0)
            attn_output_weights += attn_mask
        if key_padding_mask is not None:
            attn_output_weights = paddle.reshape(
                attn_output_weights,
                [q_shape[1], self.num_heads, q_shape[0], src_shape[0]])
            # Expand mask to [batch, 1, 1, src_len] for broadcasting.
            key = paddle.unsqueeze(paddle.unsqueeze(key_padding_mask, 1), 2)
            key = paddle.cast(key, 'float32')
            y = paddle.full(
                shape=paddle.shape(key), dtype='float32', fill_value='-inf')
            # Positions where the mask is zero contribute 0; nonzero (padded)
            # positions get -inf, zeroing them out after softmax.
            y = paddle.where(key == 0., key, y)
            attn_output_weights += y
        # Softmax in float32 when inputs are float16, for numerical stability.
        attn_output_weights = F.softmax(
            attn_output_weights.astype('float32'),
            axis=-1,
            dtype=paddle.float32
            if attn_output_weights.dtype == paddle.float16 else
            attn_output_weights.dtype)
        attn_output_weights = F.dropout(
            attn_output_weights, p=self.dropout, training=self.training)
        attn_output = paddle.matmul(attn_output_weights, v)
        # Merge heads back: [batch, heads, tgt, head_dim] ->
        # [tgt_len, batch, embed_dim].
        attn_output = paddle.reshape(
            paddle.transpose(attn_output, [2, 0, 1, 3]),
            [q_shape[0], q_shape[1], self.embed_dim])
        attn_output = self.out_proj(attn_output)

        return attn_output

    def _in_proj_q(self, query):
        # [len, batch, embed] -> [batch, embed, len] -> conv as 1x1 over
        # a [batch, embed, 1, len] "image", then back to [len, batch, embed].
        query = paddle.transpose(query, [1, 2, 0])
        query = paddle.unsqueeze(query, axis=2)
        res = self.conv1(query)
        res = paddle.squeeze(res, axis=2)
        res = paddle.transpose(res, [2, 0, 1])
        return res

    def _in_proj_k(self, key):
        # Same layout dance as _in_proj_q, using the key projection conv.
        key = paddle.transpose(key, [1, 2, 0])
        key = paddle.unsqueeze(key, axis=2)
        res = self.conv2(key)
        res = paddle.squeeze(res, axis=2)
        res = paddle.transpose(res, [2, 0, 1])
        return res

    def _in_proj_v(self, value):
        # Same layout dance as _in_proj_q, using the value projection conv.
        value = paddle.transpose(value, [1, 2, 0])  #(1, 2, 0)
        value = paddle.unsqueeze(value, axis=2)
        res = self.conv3(value)
        res = paddle.squeeze(res, axis=2)
        res = paddle.transpose(res, [2, 0, 1])
        return res
ppocr/modeling/heads/rec_aster_head.py
0 → 100644
View file @
006d84bf
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
sys
import
paddle
from
paddle
import
nn
from
paddle.nn
import
functional
as
F
class AsterHead(nn.Layer):
    """ASTER recognition head.

    Wraps an attention decoder plus a global sequence embedding. In
    training mode the decoder is run teacher-forced on the targets; in
    eval mode beam search is used.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 sDim,
                 attDim,
                 max_len_labels,
                 time_step=25,
                 beam_width=5,
                 **kwargs):
        super(AsterHead, self).__init__()
        self.num_classes = out_channels
        self.in_planes = in_channels
        self.sDim = sDim
        self.attDim = attDim
        self.max_len_labels = max_len_labels
        self.decoder = AttentionRecognitionHead(in_channels, out_channels,
                                                sDim, attDim, max_len_labels)
        self.time_step = time_step
        self.embeder = Embedding(self.time_step, in_channels)
        self.beam_width = beam_width
        # The last class index serves as the end-of-sequence symbol.
        self.eos = self.num_classes - 1

    def forward(self, x, targets=None, embed=None):
        # Global embedding of the whole feature sequence, returned in both
        # modes (used by the embedding loss during training).
        embedding_vectors = self.embeder(x)

        if self.training:
            rec_targets, rec_lengths, _ = targets
            rec_pred = self.decoder([x, rec_targets, rec_lengths],
                                    embedding_vectors)
            return {
                'rec_pred': rec_pred,
                'embedding_vectors': embedding_vectors,
            }

        # Inference: decode with beam search.
        rec_pred, rec_pred_scores = self.decoder.beam_search(
            x, self.beam_width, self.eos, embedding_vectors)
        return {
            'rec_pred': rec_pred,
            'rec_pred_scores': rec_pred_scores,
            'embedding_vectors': embedding_vectors,
        }
class Embedding(nn.Layer):
    """Flattens a feature sequence and projects it to one fixed-size vector."""

    def __init__(self, in_timestep, in_planes, mid_dim=4096, embed_dim=300):
        super(Embedding, self).__init__()
        self.in_timestep = in_timestep
        self.in_planes = in_planes
        self.embed_dim = embed_dim
        self.mid_dim = mid_dim
        # Embed the whole encoder output (time x channels) to a single
        # word-embedding-like vector.
        self.eEmbed = nn.Linear(in_timestep * in_planes, self.embed_dim)

    def forward(self, x):
        flat = paddle.reshape(x, [paddle.shape(x)[0], -1])
        return self.eEmbed(flat)
class AttentionRecognitionHead(nn.Layer):
    """
    input: [b x 16 x 64 x in_planes]
    output: probability sequence: [b x T x num_classes]
    """

    def __init__(self, in_channels, out_channels, sDim, attDim, max_len_labels):
        super(AttentionRecognitionHead, self).__init__()
        # Output class count; includes the <EOS> symbol.
        self.num_classes = out_channels
        self.in_planes = in_channels
        self.sDim = sDim
        self.attDim = attDim
        self.max_len_labels = max_len_labels

        self.decoder = DecoderUnit(
            sDim=sDim, xDim=in_channels, yDim=self.num_classes, attDim=attDim)

    def forward(self, x, embed):
        """Teacher-forced decoding over the target length.

        Args:
            x: tuple (features, targets, lengths).
            embed: sequence embedding used to initialize the decoder state.
        Returns:
            [b, max(lengths), num_classes] logits.
        """
        x, targets, lengths = x
        batch_size = paddle.shape(x)[0]
        # Decoder
        state = self.decoder.get_initial_state(embed)
        outputs = []
        for i in range(max(lengths)):
            if i == 0:
                # First step is fed the <BOS> symbol (index num_classes).
                y_prev = paddle.full(
                    shape=[batch_size], fill_value=self.num_classes)
            else:
                y_prev = targets[:, i - 1]
            output, state = self.decoder(x, state, y_prev)
            outputs.append(output)
        outputs = paddle.concat([_.unsqueeze(1) for _ in outputs], 1)
        return outputs

    # inference stage.
    def sample(self, x):
        """Greedy decoding for at most max_len_labels steps.

        Fixes over the original implementation:
        - `x.size(0)` (a torch API) replaced with `paddle.shape(x)[0]`;
        - `output.max(1)` returned only max values in paddle; split into
          `paddle.max` + `paddle.argmax`;
        - `paddle.concat([list, 1])` passed the axis inside the tensor list;
          now `paddle.concat(list, 1)`.
        """
        x, _, _ = x
        batch_size = paddle.shape(x)[0]
        # Decoder state starts at zeros (no embedding available here).
        state = paddle.zeros([1, batch_size, self.sDim])

        predicted_ids, predicted_scores = [], []
        for i in range(self.max_len_labels):
            if i == 0:
                y_prev = paddle.full(
                    shape=[batch_size], fill_value=self.num_classes)
            else:
                y_prev = predicted
            output, state = self.decoder(x, state, y_prev)
            output = F.softmax(output, axis=1)
            score = paddle.max(output, axis=1)
            predicted = paddle.argmax(output, axis=1)
            predicted_ids.append(predicted.unsqueeze(1))
            predicted_scores.append(score.unsqueeze(1))
        predicted_ids = paddle.concat(predicted_ids, 1)
        predicted_scores = paddle.concat(predicted_scores, 1)
        # return predicted_ids.squeeze(), predicted_scores.squeeze()
        return predicted_ids, predicted_scores

    def beam_search(self, x, beam_width, eos, embed):
        """Beam-search decoding; returns the best sequence per sample.

        Based on the IBM pytorch-seq2seq TopKDecoder:
        https://github.com/IBM/pytorch-seq2seq/blob/fede87655ddce6c94b38886089e05321dc9802af/seq2seq/models/TopKDecoder.py
        """

        def _inflate(tensor, times, dim):
            # Repeat `tensor` `times` times along `dim`.
            repeat_dims = [1] * tensor.dim()
            repeat_dims[dim] = times
            output = paddle.tile(tensor, repeat_dims)
            return output

        batch_size, l, d = x.shape
        # Inflate the encoder features so each beam has its own copy,
        # laid out beam-major then flattened to [b*k, l, d].
        x = paddle.tile(
            paddle.transpose(x.unsqueeze(1), perm=[1, 0, 2, 3]),
            [beam_width, 1, 1, 1])
        inflated_encoder_feats = paddle.reshape(
            paddle.transpose(x, perm=[1, 0, 2, 3]), [-1, l, d])

        # Initialize the decoder
        state = self.decoder.get_initial_state(embed, tile_times=beam_width)
        pos_index = paddle.reshape(
            paddle.arange(batch_size) * beam_width, shape=[-1, 1])

        # Initialize the scores; only the first beam of each sample starts
        # live (score 0), the rest at -inf.
        sequence_scores = paddle.full(
            shape=[batch_size * beam_width, 1], fill_value=-float('Inf'))
        index = [i * beam_width for i in range(0, batch_size)]
        sequence_scores[index] = 0.0

        # Initialize the input vector with the <BOS> symbol.
        y_prev = paddle.full(
            shape=[batch_size * beam_width], fill_value=self.num_classes)

        # Store decisions for backtracking
        stored_scores = list()
        stored_predecessors = list()
        stored_emitted_symbols = list()

        for i in range(self.max_len_labels):
            output, state = self.decoder(inflated_encoder_feats, state, y_prev)
            state = paddle.unsqueeze(state, axis=0)
            log_softmax_output = paddle.nn.functional.log_softmax(
                output, axis=1)

            # Expand each beam's running score over all candidate classes.
            sequence_scores = _inflate(sequence_scores, self.num_classes, 1)
            sequence_scores += log_softmax_output
            scores, candidates = paddle.topk(
                paddle.reshape(sequence_scores, [batch_size, -1]),
                beam_width,
                axis=1)

            # Reshape input = (bk, 1) and sequence_scores = (bk, 1)
            y_prev = paddle.reshape(
                candidates % self.num_classes, shape=[batch_size * beam_width])
            sequence_scores = paddle.reshape(
                scores, shape=[batch_size * beam_width, 1])

            # Update fields for next timestep; predecessor = which beam each
            # candidate came from, offset into the flattened b*k dimension.
            pos_index = paddle.expand_as(pos_index, candidates)
            predecessors = paddle.cast(
                candidates / self.num_classes + pos_index, dtype='int64')
            predecessors = paddle.reshape(
                predecessors, shape=[batch_size * beam_width, 1])
            state = paddle.index_select(
                state, index=predecessors.squeeze(), axis=1)

            # Update sequence scores and erase scores for the <eos> symbol
            # so that finished beams aren't expanded further.
            stored_scores.append(sequence_scores.clone())
            y_prev = paddle.reshape(y_prev, shape=[-1, 1])
            eos_prev = paddle.full_like(y_prev, fill_value=eos)
            mask = eos_prev == y_prev
            mask = paddle.nonzero(mask)
            if mask.dim() > 0:
                sequence_scores = sequence_scores.numpy()
                mask = mask.numpy()
                sequence_scores[mask] = -float('inf')
                sequence_scores = paddle.to_tensor(sequence_scores)

            # Cache results for backtracking
            stored_predecessors.append(predecessors)
            y_prev = paddle.squeeze(y_prev)
            stored_emitted_symbols.append(y_prev)

        # Do backtracking to return the optimal values
        #====== backtrack ======#
        # Initialize return variables given different types
        p = list()
        l = [[self.max_len_labels] * beam_width for _ in range(batch_size)
             ]  # Placeholder for lengths of top-k sequences

        # The last step output of the beams are not sorted; sort them here.
        sorted_score, sorted_idx = paddle.topk(
            paddle.reshape(
                stored_scores[-1], shape=[batch_size, beam_width]),
            beam_width)
        # initialize the sequence scores with the sorted last step beam scores
        s = sorted_score.clone()

        batch_eos_found = [0] * batch_size  # the number of EOS found
        # in the backward loop below for each batch
        t = self.max_len_labels - 1
        # initialize the back pointer with the sorted order of the last step beams.
        # add pos_index for indexing variable with b*k as the first dimension.
        t_predecessors = paddle.reshape(
            sorted_idx + pos_index.expand_as(sorted_idx),
            shape=[batch_size * beam_width])
        while t >= 0:
            # Re-order the variables with the back pointer
            current_symbol = paddle.index_select(
                stored_emitted_symbols[t], index=t_predecessors, axis=0)
            t_predecessors = paddle.index_select(
                stored_predecessors[t].squeeze(), index=t_predecessors, axis=0)
            eos_indices = stored_emitted_symbols[t] == eos
            eos_indices = paddle.nonzero(eos_indices)
            if eos_indices.dim() > 0:
                for i in range(eos_indices.shape[0] - 1, -1, -1):
                    # Indices of the EOS symbol for both variables
                    # with b*k as the first dimension, and b, k for
                    # the first two dimensions
                    idx = eos_indices[i]
                    b_idx = int(idx[0] / beam_width)
                    # The indices of the replacing position
                    # according to the replacement strategy noted above
                    res_k_idx = beam_width - (batch_eos_found[b_idx] %
                                              beam_width) - 1
                    batch_eos_found[b_idx] += 1
                    res_idx = b_idx * beam_width + res_k_idx

                    # Replace the old information in return variables
                    # with the new ended sequence information
                    t_predecessors[res_idx] = stored_predecessors[t][idx[0]]
                    current_symbol[res_idx] = stored_emitted_symbols[t][idx[0]]
                    s[b_idx, res_k_idx] = stored_scores[t][idx[0], 0]
                    l[b_idx][res_k_idx] = t + 1

            # record the back tracked results
            p.append(current_symbol)
            t -= 1

        # Sort and re-order again as the added ended sequences may change
        # the order (very unlikely)
        s, re_sorted_idx = s.topk(beam_width)
        for b_idx in range(batch_size):
            l[b_idx] = [
                l[b_idx][k_idx.item()] for k_idx in re_sorted_idx[b_idx, :]
            ]
        re_sorted_idx = paddle.reshape(
            re_sorted_idx + pos_index.expand_as(re_sorted_idx),
            [batch_size * beam_width])

        # Reverse the sequences and re-order at the same time
        # It is reversed because the backtracking happens in reverse time order
        p = [
            paddle.reshape(
                paddle.index_select(step, re_sorted_idx, 0),
                shape=[batch_size, beam_width, -1]) for step in reversed(p)
        ]
        p = paddle.concat(p, -1)[:, 0, :]
        return p, paddle.ones_like(p)
class AttentionUnit(nn.Layer):
    """Additive (Bahdanau-style) attention scoring over a feature sequence.

    Given encoder features ``x`` [b, T, xDim] and the previous decoder
    state ``sPrev`` [1, b, sDim], produces softmax-normalized attention
    weights of shape [b, T].
    """

    def __init__(self, sDim, xDim, attDim):
        super(AttentionUnit, self).__init__()
        self.sDim = sDim
        self.xDim = xDim
        self.attDim = attDim

        self.sEmbed = nn.Linear(sDim, attDim)  # projects decoder state
        self.xEmbed = nn.Linear(xDim, attDim)  # projects encoder features
        self.wEmbed = nn.Linear(attDim, 1)     # scores each timestep

    def forward(self, x, sPrev):
        batch_size, seq_len, _ = x.shape
        # Project every timestep: [b*T, xDim] -> [b, T, attDim].
        feat_proj = self.xEmbed(paddle.reshape(x, [-1, self.xDim]))
        feat_proj = paddle.reshape(feat_proj, [batch_size, seq_len, -1])

        # Project the state and broadcast it over time: [b, T, attDim].
        state_proj = self.sEmbed(sPrev.squeeze(0))
        state_proj = paddle.unsqueeze(state_proj, 1)
        state_proj = paddle.expand(state_proj,
                                   [batch_size, seq_len, self.attDim])

        # Additive combination, then a scalar score per timestep.
        combined = paddle.tanh(state_proj + feat_proj)
        scores = self.wEmbed(paddle.reshape(combined, [-1, self.attDim]))
        scores = paddle.reshape(scores, [batch_size, seq_len])

        # Attention weights for each sample in the minibatch.
        return F.softmax(scores, axis=1)
class DecoderUnit(nn.Layer):
    """One step of the attentional GRU decoder.

    Combines attention context over the encoder features with the
    embedding of the previous symbol, advances a GRU cell, and emits
    per-class logits.
    """

    def __init__(self, sDim, xDim, yDim, attDim):
        super(DecoderUnit, self).__init__()
        self.sDim = sDim      # decoder state size
        self.xDim = xDim      # encoder feature size
        self.yDim = yDim      # number of output classes
        self.attDim = attDim  # attention hidden size
        self.emdDim = attDim  # symbol-embedding size (same as attDim)

        self.attention_unit = AttentionUnit(sDim, xDim, attDim)
        # yDim + 1 entries: the extra last index is used for <BOS>.
        self.tgt_embedding = nn.Embedding(
            yDim + 1, self.emdDim, weight_attr=nn.initializer.Normal(
                std=0.01))  # the last is used for <BOS>
        self.gru = nn.GRUCell(input_size=xDim + self.emdDim, hidden_size=sDim)
        self.fc = nn.Linear(
            sDim,
            yDim,
            weight_attr=nn.initializer.Normal(std=0.01),
            bias_attr=nn.initializer.Constant(value=0))
        # Maps the 300-d sequence embedding to the initial decoder state.
        self.embed_fc = nn.Linear(300, self.sDim)

    def get_initial_state(self, embed, tile_times=1):
        """Build the initial state from the sequence embedding.

        With tile_times > 1 (beam search) the state is replicated per beam,
        producing a [1, N*tile_times, sDim] tensor; otherwise [1, N, sDim].
        """
        assert embed.shape[1] == 300
        state = self.embed_fc(embed)  # N * sDim
        if tile_times != 1:
            state = state.unsqueeze(1)
            # Tile along a leading axis, then flatten back so beams of the
            # same sample end up adjacent in the batch dimension.
            trans_state = paddle.transpose(state, perm=[1, 0, 2])
            state = paddle.tile(trans_state, repeat_times=[tile_times, 1, 1])
            trans_state = paddle.transpose(state, perm=[1, 0, 2])
            state = paddle.reshape(trans_state, shape=[-1, self.sDim])
        state = state.unsqueeze(0)  # 1 * N * sDim
        return state

    def forward(self, x, sPrev, yPrev):
        # x: feature sequence from the image decoder.
        batch_size, T, _ = x.shape
        # Attention weights over the T timesteps, then the weighted context.
        alpha = self.attention_unit(x, sPrev)
        context = paddle.squeeze(paddle.matmul(alpha.unsqueeze(1), x), axis=1)
        # Previous symbol indices must be integer for the embedding lookup.
        yPrev = paddle.cast(yPrev, dtype="int64")
        yProj = self.tgt_embedding(yPrev)

        # GRU input is [symbol embedding ; attention context].
        concat_context = paddle.concat([yProj, context], 1)
        concat_context = paddle.squeeze(concat_context, 1)
        sPrev = paddle.squeeze(sPrev, 0)
        output, state = self.gru(concat_context, sPrev)
        output = paddle.squeeze(output, axis=1)
        output = self.fc(output)
        return output, state
\ No newline at end of file
ppocr/modeling/heads/rec_ctc_head.py
View file @
006d84bf
...
@@ -38,6 +38,7 @@ class CTCHead(nn.Layer):
...
@@ -38,6 +38,7 @@ class CTCHead(nn.Layer):
out_channels
,
out_channels
,
fc_decay
=
0.0004
,
fc_decay
=
0.0004
,
mid_channels
=
None
,
mid_channels
=
None
,
return_feats
=
False
,
**
kwargs
):
**
kwargs
):
super
(
CTCHead
,
self
).
__init__
()
super
(
CTCHead
,
self
).
__init__
()
if
mid_channels
is
None
:
if
mid_channels
is
None
:
...
@@ -66,14 +67,22 @@ class CTCHead(nn.Layer):
...
@@ -66,14 +67,22 @@ class CTCHead(nn.Layer):
bias_attr
=
bias_attr2
)
bias_attr
=
bias_attr2
)
self
.
out_channels
=
out_channels
self
.
out_channels
=
out_channels
self
.
mid_channels
=
mid_channels
self
.
mid_channels
=
mid_channels
self
.
return_feats
=
return_feats
def
forward
(
self
,
x
,
targets
=
None
):
def
forward
(
self
,
x
,
targets
=
None
):
if
self
.
mid_channels
is
None
:
if
self
.
mid_channels
is
None
:
predicts
=
self
.
fc
(
x
)
predicts
=
self
.
fc
(
x
)
else
:
else
:
predicts
=
self
.
fc1
(
x
)
x
=
self
.
fc1
(
x
)
predicts
=
self
.
fc2
(
predicts
)
predicts
=
self
.
fc2
(
x
)
if
self
.
return_feats
:
result
=
(
x
,
predicts
)
else
:
result
=
predicts
if
not
self
.
training
:
if
not
self
.
training
:
predicts
=
F
.
softmax
(
predicts
,
axis
=
2
)
predicts
=
F
.
softmax
(
predicts
,
axis
=
2
)
return
predicts
result
=
predicts
return
result
ppocr/modeling/heads/rec_nrtr_head.py
0 → 100644
View file @
006d84bf
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
math
import
paddle
import
copy
from
paddle
import
nn
import
paddle.nn.functional
as
F
from
paddle.nn
import
LayerList
from
paddle.nn.initializer
import
XavierNormal
as
xavier_uniform_
from
paddle.nn
import
Dropout
,
Linear
,
LayerNorm
,
Conv2D
import
numpy
as
np
from
ppocr.modeling.heads.multiheadAttention
import
MultiheadAttention
from
paddle.nn.initializer
import
Constant
as
constant_
from
paddle.nn.initializer
import
XavierNormal
as
xavier_normal_
zeros_
=
constant_
(
value
=
0.
)
ones_
=
constant_
(
value
=
1.
)
class
Transformer
(
nn
.
Layer
):
"""A transformer model. User is able to modify the attributes as needed. The architechture
is based on the paper "Attention Is All You Need". Ashish Vaswani, Noam Shazeer,
Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and
Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information
Processing Systems, pages 6000-6010.
Args:
d_model: the number of expected features in the encoder/decoder inputs (default=512).
nhead: the number of heads in the multiheadattention models (default=8).
num_encoder_layers: the number of sub-encoder-layers in the encoder (default=6).
num_decoder_layers: the number of sub-decoder-layers in the decoder (default=6).
dim_feedforward: the dimension of the feedforward network model (default=2048).
dropout: the dropout value (default=0.1).
custom_encoder: custom encoder (default=None).
custom_decoder: custom decoder (default=None).
"""
def
__init__
(
self
,
d_model
=
512
,
nhead
=
8
,
num_encoder_layers
=
6
,
beam_size
=
0
,
num_decoder_layers
=
6
,
dim_feedforward
=
1024
,
attention_dropout_rate
=
0.0
,
residual_dropout_rate
=
0.1
,
custom_encoder
=
None
,
custom_decoder
=
None
,
in_channels
=
0
,
out_channels
=
0
,
scale_embedding
=
True
):
super
(
Transformer
,
self
).
__init__
()
self
.
out_channels
=
out_channels
+
1
self
.
embedding
=
Embeddings
(
d_model
=
d_model
,
vocab
=
self
.
out_channels
,
padding_idx
=
0
,
scale_embedding
=
scale_embedding
)
self
.
positional_encoding
=
PositionalEncoding
(
dropout
=
residual_dropout_rate
,
dim
=
d_model
,
)
if
custom_encoder
is
not
None
:
self
.
encoder
=
custom_encoder
else
:
if
num_encoder_layers
>
0
:
encoder_layer
=
TransformerEncoderLayer
(
d_model
,
nhead
,
dim_feedforward
,
attention_dropout_rate
,
residual_dropout_rate
)
self
.
encoder
=
TransformerEncoder
(
encoder_layer
,
num_encoder_layers
)
else
:
self
.
encoder
=
None
if
custom_decoder
is
not
None
:
self
.
decoder
=
custom_decoder
else
:
decoder_layer
=
TransformerDecoderLayer
(
d_model
,
nhead
,
dim_feedforward
,
attention_dropout_rate
,
residual_dropout_rate
)
self
.
decoder
=
TransformerDecoder
(
decoder_layer
,
num_decoder_layers
)
self
.
_reset_parameters
()
self
.
beam_size
=
beam_size
self
.
d_model
=
d_model
self
.
nhead
=
nhead
self
.
tgt_word_prj
=
nn
.
Linear
(
d_model
,
self
.
out_channels
,
bias_attr
=
False
)
w0
=
np
.
random
.
normal
(
0.0
,
d_model
**-
0.5
,
(
d_model
,
self
.
out_channels
)).
astype
(
np
.
float32
)
self
.
tgt_word_prj
.
weight
.
set_value
(
w0
)
self
.
apply
(
self
.
_init_weights
)
def
_init_weights
(
self
,
m
):
if
isinstance
(
m
,
nn
.
Conv2D
):
xavier_normal_
(
m
.
weight
)
if
m
.
bias
is
not
None
:
zeros_
(
m
.
bias
)
def
forward_train
(
self
,
src
,
tgt
):
tgt
=
tgt
[:,
:
-
1
]
tgt_key_padding_mask
=
self
.
generate_padding_mask
(
tgt
)
tgt
=
self
.
embedding
(
tgt
).
transpose
([
1
,
0
,
2
])
tgt
=
self
.
positional_encoding
(
tgt
)
tgt_mask
=
self
.
generate_square_subsequent_mask
(
tgt
.
shape
[
0
])
if
self
.
encoder
is
not
None
:
src
=
self
.
positional_encoding
(
src
.
transpose
([
1
,
0
,
2
]))
memory
=
self
.
encoder
(
src
)
else
:
memory
=
src
.
squeeze
(
2
).
transpose
([
2
,
0
,
1
])
output
=
self
.
decoder
(
tgt
,
memory
,
tgt_mask
=
tgt_mask
,
memory_mask
=
None
,
tgt_key_padding_mask
=
tgt_key_padding_mask
,
memory_key_padding_mask
=
None
)
output
=
output
.
transpose
([
1
,
0
,
2
])
logit
=
self
.
tgt_word_prj
(
output
)
return
logit
def
forward
(
self
,
src
,
targets
=
None
):
"""Take in and process masked source/target sequences.
Args:
src: the sequence to the encoder (required).
tgt: the sequence to the decoder (required).
Shape:
- src: :math:`(S, N, E)`.
- tgt: :math:`(T, N, E)`.
Examples:
>>> output = transformer_model(src, tgt)
"""
if
self
.
training
:
max_len
=
targets
[
1
].
max
()
tgt
=
targets
[
0
][:,
:
2
+
max_len
]
return
self
.
forward_train
(
src
,
tgt
)
else
:
if
self
.
beam_size
>
0
:
return
self
.
forward_beam
(
src
)
else
:
return
self
.
forward_test
(
src
)
def
forward_test
(
self
,
src
):
bs
=
paddle
.
shape
(
src
)[
0
]
if
self
.
encoder
is
not
None
:
src
=
self
.
positional_encoding
(
paddle
.
transpose
(
src
,
[
1
,
0
,
2
]))
memory
=
self
.
encoder
(
src
)
else
:
memory
=
paddle
.
transpose
(
paddle
.
squeeze
(
src
,
2
),
[
2
,
0
,
1
])
dec_seq
=
paddle
.
full
((
bs
,
1
),
2
,
dtype
=
paddle
.
int64
)
dec_prob
=
paddle
.
full
((
bs
,
1
),
1.
,
dtype
=
paddle
.
float32
)
for
len_dec_seq
in
range
(
1
,
25
):
dec_seq_embed
=
paddle
.
transpose
(
self
.
embedding
(
dec_seq
),
[
1
,
0
,
2
])
dec_seq_embed
=
self
.
positional_encoding
(
dec_seq_embed
)
tgt_mask
=
self
.
generate_square_subsequent_mask
(
paddle
.
shape
(
dec_seq_embed
)[
0
])
output
=
self
.
decoder
(
dec_seq_embed
,
memory
,
tgt_mask
=
tgt_mask
,
memory_mask
=
None
,
tgt_key_padding_mask
=
None
,
memory_key_padding_mask
=
None
)
dec_output
=
paddle
.
transpose
(
output
,
[
1
,
0
,
2
])
dec_output
=
dec_output
[:,
-
1
,
:]
word_prob
=
F
.
softmax
(
self
.
tgt_word_prj
(
dec_output
),
axis
=
1
)
preds_idx
=
paddle
.
argmax
(
word_prob
,
axis
=
1
)
if
paddle
.
equal_all
(
preds_idx
,
paddle
.
full
(
paddle
.
shape
(
preds_idx
),
3
,
dtype
=
'int64'
)):
break
preds_prob
=
paddle
.
max
(
word_prob
,
axis
=
1
)
dec_seq
=
paddle
.
concat
(
[
dec_seq
,
paddle
.
reshape
(
preds_idx
,
[
-
1
,
1
])],
axis
=
1
)
dec_prob
=
paddle
.
concat
(
[
dec_prob
,
paddle
.
reshape
(
preds_prob
,
[
-
1
,
1
])],
axis
=
1
)
return
[
dec_seq
,
dec_prob
]
def
forward_beam
(
self
,
images
):
''' Translation work in one batch '''
def
get_inst_idx_to_tensor_position_map
(
inst_idx_list
):
''' Indicate the position of an instance in a tensor. '''
return
{
inst_idx
:
tensor_position
for
tensor_position
,
inst_idx
in
enumerate
(
inst_idx_list
)
}
def
collect_active_part
(
beamed_tensor
,
curr_active_inst_idx
,
n_prev_active_inst
,
n_bm
):
''' Collect tensor parts associated to active instances. '''
beamed_tensor_shape
=
paddle
.
shape
(
beamed_tensor
)
n_curr_active_inst
=
len
(
curr_active_inst_idx
)
new_shape
=
(
n_curr_active_inst
*
n_bm
,
beamed_tensor_shape
[
1
],
beamed_tensor_shape
[
2
])
beamed_tensor
=
beamed_tensor
.
reshape
([
n_prev_active_inst
,
-
1
])
beamed_tensor
=
beamed_tensor
.
index_select
(
curr_active_inst_idx
,
axis
=
0
)
beamed_tensor
=
beamed_tensor
.
reshape
(
new_shape
)
return
beamed_tensor
def
collate_active_info
(
src_enc
,
inst_idx_to_position_map
,
active_inst_idx_list
):
# Sentences which are still active are collected,
# so the decoder will not run on completed sentences.
n_prev_active_inst
=
len
(
inst_idx_to_position_map
)
active_inst_idx
=
[
inst_idx_to_position_map
[
k
]
for
k
in
active_inst_idx_list
]
active_inst_idx
=
paddle
.
to_tensor
(
active_inst_idx
,
dtype
=
'int64'
)
active_src_enc
=
collect_active_part
(
src_enc
.
transpose
([
1
,
0
,
2
]),
active_inst_idx
,
n_prev_active_inst
,
n_bm
).
transpose
([
1
,
0
,
2
])
active_inst_idx_to_position_map
=
get_inst_idx_to_tensor_position_map
(
active_inst_idx_list
)
return
active_src_enc
,
active_inst_idx_to_position_map
def beam_decode_step(inst_dec_beams, len_dec_seq, enc_output,
                     inst_idx_to_position_map, n_bm,
                     memory_key_padding_mask):
    '''Decode one step for every unfinished beam and return the indices
    of instances that are still active (have not finished decoding).

    Args:
        inst_dec_beams: list of Beam objects, one per source instance.
        len_dec_seq: current length of the partial decoded sequences.
        enc_output: encoder memory (repeated n_bm times along the beam axis).
        inst_idx_to_position_map: dict mapping instance index -> position
            of that instance inside enc_output.
        n_bm: beam width.
        memory_key_padding_mask: padding mask for the encoder memory
            (currently unused: predict_word is invoked with None below).
    '''

    def prepare_beam_dec_seq(inst_dec_beams, len_dec_seq):
        # Stack the partial hypotheses of all unfinished beams into one
        # [n_active * n_bm, len_dec_seq] tensor.
        dec_partial_seq = [
            b.get_current_state() for b in inst_dec_beams if not b.done
        ]
        dec_partial_seq = paddle.stack(dec_partial_seq)
        dec_partial_seq = dec_partial_seq.reshape([-1, len_dec_seq])
        return dec_partial_seq

    def predict_word(dec_seq, enc_output, n_active_inst, n_bm,
                     memory_key_padding_mask):
        # Embed tokens and add positional encoding; the decoder expects a
        # sequence-first layout, hence the [1, 0, 2] transpose.
        dec_seq = paddle.transpose(self.embedding(dec_seq), [1, 0, 2])
        dec_seq = self.positional_encoding(dec_seq)
        # Causal mask: each position may only attend to earlier positions.
        tgt_mask = self.generate_square_subsequent_mask(
            paddle.shape(dec_seq)[0])
        dec_output = self.decoder(
            dec_seq,
            enc_output,
            tgt_mask=tgt_mask,
            tgt_key_padding_mask=None,
            memory_key_padding_mask=memory_key_padding_mask, )
        dec_output = paddle.transpose(dec_output, [1, 0, 2])
        dec_output = dec_output[:, -1, :]
        # Pick the last step: (bh * bm) * d_h
        # Project onto the vocabulary and normalise to probabilities.
        word_prob = F.softmax(self.tgt_word_prj(dec_output), axis=1)
        word_prob = paddle.reshape(word_prob, [n_active_inst, n_bm, -1])
        return word_prob

    def collect_active_inst_idx_list(inst_beams, word_prob,
                                     inst_idx_to_position_map):
        # Advance each beam with its slice of word probabilities and keep
        # the indices of the beams that have not completed yet.
        active_inst_idx_list = []
        for inst_idx, inst_position in inst_idx_to_position_map.items():
            is_inst_complete = inst_beams[inst_idx].advance(word_prob[
                inst_position])
            if not is_inst_complete:
                active_inst_idx_list += [inst_idx]
        return active_inst_idx_list

    n_active_inst = len(inst_idx_to_position_map)
    dec_seq = prepare_beam_dec_seq(inst_dec_beams, len_dec_seq)
    # NOTE(review): memory_key_padding_mask is deliberately passed as None
    # here rather than forwarding the function parameter - confirm whether
    # the parameter should be threaded through.
    word_prob = predict_word(dec_seq, enc_output, n_active_inst, n_bm, None)
    # Update the beam with predicted word prob information and collect incomplete instances
    active_inst_idx_list = collect_active_inst_idx_list(
        inst_dec_beams, word_prob, inst_idx_to_position_map)
    return active_inst_idx_list
def collect_hypothesis_and_scores(inst_dec_beams, n_best):
    """Gather the ``n_best`` hypotheses and their scores from every beam.

    Returns a pair (all_hyp, all_scores) where each entry corresponds to
    one beam: a list of up to ``n_best`` decoded sequences, and the
    matching slice of scores.
    """
    all_hyp = []
    all_scores = []
    for beam in inst_dec_beams:
        scores, tail_idxs = beam.sort_scores()
        all_scores.append(scores[:n_best])
        best_hyps = [beam.get_hypothesis(i) for i in tail_idxs[:n_best]]
        all_hyp.append(best_hyps)
    return all_hyp, all_scores
with
paddle
.
no_grad
():
#-- Encode
if
self
.
encoder
is
not
None
:
src
=
self
.
positional_encoding
(
images
.
transpose
([
1
,
0
,
2
]))
src_enc
=
self
.
encoder
(
src
)
else
:
src_enc
=
images
.
squeeze
(
2
).
transpose
([
0
,
2
,
1
])
n_bm
=
self
.
beam_size
src_shape
=
paddle
.
shape
(
src_enc
)
inst_dec_beams
=
[
Beam
(
n_bm
)
for
_
in
range
(
1
)]
active_inst_idx_list
=
list
(
range
(
1
))
# Repeat data for beam search
src_enc
=
paddle
.
tile
(
src_enc
,
[
1
,
n_bm
,
1
])
inst_idx_to_position_map
=
get_inst_idx_to_tensor_position_map
(
active_inst_idx_list
)
# Decode
for
len_dec_seq
in
range
(
1
,
25
):
src_enc_copy
=
src_enc
.
clone
()
active_inst_idx_list
=
beam_decode_step
(
inst_dec_beams
,
len_dec_seq
,
src_enc_copy
,
inst_idx_to_position_map
,
n_bm
,
None
)
if
not
active_inst_idx_list
:
break
# all instances have finished their path to <EOS>
src_enc
,
inst_idx_to_position_map
=
collate_active_info
(
src_enc_copy
,
inst_idx_to_position_map
,
active_inst_idx_list
)
batch_hyp
,
batch_scores
=
collect_hypothesis_and_scores
(
inst_dec_beams
,
1
)
result_hyp
=
[]
hyp_scores
=
[]
for
bs_hyp
,
score
in
zip
(
batch_hyp
,
batch_scores
):
l
=
len
(
bs_hyp
[
0
])
bs_hyp_pad
=
bs_hyp
[
0
]
+
[
3
]
*
(
25
-
l
)
result_hyp
.
append
(
bs_hyp_pad
)
score
=
float
(
score
)
/
l
hyp_score
=
[
score
for
_
in
range
(
25
)]
hyp_scores
.
append
(
hyp_score
)
return
[
paddle
.
to_tensor
(
np
.
array
(
result_hyp
),
dtype
=
paddle
.
int64
),
paddle
.
to_tensor
(
hyp_scores
)
]
def generate_square_subsequent_mask(self, sz):
    """Build a [sz, sz] causal attention mask.

    Entries strictly above the main diagonal (future positions) are
    float('-inf'); all remaining entries are 0.0.
    """
    neg_inf = paddle.full(
        shape=[sz, sz], dtype='float32', fill_value='-inf')
    upper_triangle = paddle.triu(neg_inf, diagonal=1)
    base = paddle.zeros([sz, sz], dtype='float32')
    return base + upper_triangle
def generate_padding_mask(self, x):
    """Return a boolean mask that is True wherever ``x`` equals 0.

    0 is treated as the padding token id here; the comparison constant is
    cast to ``x``'s own dtype before comparing.
    """
    pad_value = paddle.to_tensor(0, dtype=x.dtype)
    return paddle.equal(x, pad_value)
def _reset_parameters(self):
    """Re-initialise every multi-dimensional parameter with Xavier-uniform.

    One-dimensional parameters (e.g. biases) are left untouched.
    """
    weight_params = (p for p in self.parameters() if p.dim() > 1)
    for weight in weight_params:
        xavier_uniform_(weight)
class TransformerEncoder(nn.Layer):
    """A stack of N identical transformer encoder layers.

    Args:
        encoder_layer: template TransformerEncoderLayer instance; it is
            deep-copied ``num_layers`` times (see ``_get_clones``).
        num_layers: number of stacked encoder layers.
    """

    def __init__(self, encoder_layer, num_layers):
        super(TransformerEncoder, self).__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers

    def forward(self, src):
        """Run ``src`` through every encoder layer in order.

        Args:
            src: the sequence fed to the encoder (required).

        Returns:
            The output of the final encoder layer.
        """
        output = src
        # No masks are used at the encoder stage of this model.
        for layer in self.layers:
            output = layer(output, src_mask=None, src_key_padding_mask=None)
        return output
class TransformerDecoder(nn.Layer):
    """A stack of N identical transformer decoder layers.

    Args:
        decoder_layer: template TransformerDecoderLayer instance; it is
            deep-copied ``num_layers`` times (see ``_get_clones``).
        num_layers: number of stacked decoder layers.
    """

    def __init__(self, decoder_layer, num_layers):
        super(TransformerDecoder, self).__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers

    def forward(self,
                tgt,
                memory,
                tgt_mask=None,
                memory_mask=None,
                tgt_key_padding_mask=None,
                memory_key_padding_mask=None):
        """Pass ``tgt`` (and masks) through every decoder layer in order.

        Args:
            tgt: the sequence fed to the decoder (required).
            memory: the output of the last encoder layer (required).
            tgt_mask: attention mask for the tgt sequence (optional).
            memory_mask: attention mask for the memory sequence (optional).
            tgt_key_padding_mask: per-batch padding mask for tgt keys (optional).
            memory_key_padding_mask: per-batch padding mask for memory keys (optional).

        Returns:
            The output of the final decoder layer.
        """
        output = tgt
        for layer in self.layers:
            output = layer(
                output,
                memory,
                tgt_mask=tgt_mask,
                memory_mask=memory_mask,
                tgt_key_padding_mask=tgt_key_padding_mask,
                memory_key_padding_mask=memory_key_padding_mask)
        return output
class TransformerEncoderLayer(nn.Layer):
    """One transformer encoder layer: self-attention plus a position-wise
    feed-forward block implemented with 1x1 convolutions.

    Based on "Attention Is All You Need" (Vaswani et al., 2017), with the
    feed-forward network expressed as two Conv2D(1x1) layers.

    Args:
        d_model: number of expected features in the input (required).
        nhead: number of attention heads (required).
        dim_feedforward: hidden size of the feed-forward block (default=2048).
        attention_dropout_rate: dropout inside the attention (default=0.0).
        residual_dropout_rate: dropout on the residual branches (default=0.1).
    """

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 attention_dropout_rate=0.0,
                 residual_dropout_rate=0.1):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(
            d_model, nhead, dropout=attention_dropout_rate)

        # Position-wise FFN realised as two 1x1 convolutions.
        self.conv1 = Conv2D(
            in_channels=d_model,
            out_channels=dim_feedforward,
            kernel_size=(1, 1))
        self.conv2 = Conv2D(
            in_channels=dim_feedforward,
            out_channels=d_model,
            kernel_size=(1, 1))

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = Dropout(residual_dropout_rate)
        self.dropout2 = Dropout(residual_dropout_rate)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        """Apply self-attention and the conv feed-forward block to ``src``.

        Args:
            src: the sequence fed to the encoder layer (required).
            src_mask: attention mask for the src sequence (optional).
            src_key_padding_mask: per-batch padding mask for src keys (optional).
        """
        attn_out = self.self_attn(
            src,
            src,
            src,
            attn_mask=src_mask,
            key_padding_mask=src_key_padding_mask)
        src = self.norm1(src + self.dropout1(attn_out))

        # Reshape for the 1x1 convs; assumes a seq-first [T, N, C] layout
        # so that [1, 2, 0] + unsqueeze yields [N, C, 1, T].
        src = paddle.unsqueeze(paddle.transpose(src, [1, 2, 0]), 2)
        ffn_out = self.conv2(F.relu(self.conv1(src)))
        # Undo the conv layout for both branches before the residual add.
        ffn_out = paddle.transpose(paddle.squeeze(ffn_out, 2), [2, 0, 1])
        src = paddle.transpose(paddle.squeeze(src, 2), [2, 0, 1])

        src = self.norm2(src + self.dropout2(ffn_out))
        return src
class TransformerDecoderLayer(nn.Layer):
    """One transformer decoder layer: self-attention, encoder-decoder
    attention, and a position-wise feed-forward block built from 1x1
    convolutions.

    Based on "Attention Is All You Need" (Vaswani et al., 2017), with the
    feed-forward network expressed as two Conv2D(1x1) layers.

    Args:
        d_model: number of expected features in the input (required).
        nhead: number of attention heads (required).
        dim_feedforward: hidden size of the feed-forward block (default=2048).
        attention_dropout_rate: dropout inside the attention (default=0.0).
        residual_dropout_rate: dropout on the residual branches (default=0.1).
    """

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 attention_dropout_rate=0.0,
                 residual_dropout_rate=0.1):
        super(TransformerDecoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(
            d_model, nhead, dropout=attention_dropout_rate)
        self.multihead_attn = MultiheadAttention(
            d_model, nhead, dropout=attention_dropout_rate)

        # Position-wise FFN realised as two 1x1 convolutions.
        self.conv1 = Conv2D(
            in_channels=d_model,
            out_channels=dim_feedforward,
            kernel_size=(1, 1))
        self.conv2 = Conv2D(
            in_channels=dim_feedforward,
            out_channels=d_model,
            kernel_size=(1, 1))

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.norm3 = LayerNorm(d_model)
        self.dropout1 = Dropout(residual_dropout_rate)
        self.dropout2 = Dropout(residual_dropout_rate)
        self.dropout3 = Dropout(residual_dropout_rate)

    def forward(self,
                tgt,
                memory,
                tgt_mask=None,
                memory_mask=None,
                tgt_key_padding_mask=None,
                memory_key_padding_mask=None):
        """Apply self-attention, cross-attention and the conv feed-forward
        block to ``tgt``.

        Args:
            tgt: the sequence fed to the decoder layer (required).
            memory: the output of the last encoder layer (required).
            tgt_mask: attention mask for the tgt sequence (optional).
            memory_mask: attention mask for the memory sequence (optional).
            tgt_key_padding_mask: per-batch padding mask for tgt keys (optional).
            memory_key_padding_mask: per-batch padding mask for memory keys (optional).
        """
        self_attn_out = self.self_attn(
            tgt,
            tgt,
            tgt,
            attn_mask=tgt_mask,
            key_padding_mask=tgt_key_padding_mask)
        tgt = self.norm1(tgt + self.dropout1(self_attn_out))

        cross_attn_out = self.multihead_attn(
            tgt,
            memory,
            memory,
            attn_mask=memory_mask,
            key_padding_mask=memory_key_padding_mask)
        tgt = self.norm2(tgt + self.dropout2(cross_attn_out))

        # Reshape for the 1x1 convs; assumes a seq-first [T, N, C] layout
        # so that [1, 2, 0] + unsqueeze yields [N, C, 1, T].
        tgt = paddle.unsqueeze(paddle.transpose(tgt, [1, 2, 0]), 2)
        ffn_out = self.conv2(F.relu(self.conv1(tgt)))
        # Undo the conv layout for both branches before the residual add.
        ffn_out = paddle.transpose(paddle.squeeze(ffn_out, 2), [2, 0, 1])
        tgt = paddle.transpose(paddle.squeeze(tgt, 2), [2, 0, 1])

        tgt = self.norm3(tgt + self.dropout3(ffn_out))
        return tgt
def _get_clones(module, N):
    """Return a LayerList holding N independent deep copies of ``module``."""
    clones = [copy.deepcopy(module) for _ in range(N)]
    return LayerList(clones)
class PositionalEncoding(nn.Layer):
    """Sinusoidal positional encoding added to token embeddings.

    The encodings have the same dimension as the embeddings so the two can
    be summed:

        PE(pos, 2i)   = sin(pos / 10000^(2i / dim))
        PE(pos, 2i+1) = cos(pos / 10000^(2i / dim))

    where ``pos`` is the token position and ``i`` is the embedding index.

    Args:
        dropout: dropout probability applied after adding the encoding.
        dim: the embedding dimension (required).
        max_len: maximum supported sequence length (default=5000).

    Examples:
        >>> pos_encoder = PositionalEncoding(dropout, dim)
    """

    def __init__(self, dropout, dim, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Precompute the [max_len, dim] table of sin/cos encodings.
        pe = paddle.zeros([max_len, dim])
        pos = paddle.arange(0, max_len, dtype=paddle.float32).unsqueeze(1)
        inv_freq = paddle.exp(
            paddle.arange(0, dim, 2).astype('float32') *
            (-math.log(10000.0) / dim))
        pe[:, 0::2] = paddle.sin(pos * inv_freq)
        pe[:, 1::2] = paddle.cos(pos * inv_freq)
        # Stored as [max_len, 1, dim] so it broadcasts over the batch axis
        # of a seq-first input.
        pe = paddle.transpose(paddle.unsqueeze(pe, 0), [1, 0, 2])
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add positional encodings to ``x`` and apply dropout.

        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]
        """
        x = x + self.pe[:paddle.shape(x)[0], :]
        return self.dropout(x)
class PositionalEncoding_2d(nn.Layer):
    """2-D positional encoding for feature maps.

    Builds the standard 1-D sinusoidal table

        PE(pos, 2i)   = sin(pos / 10000^(2i / dim))
        PE(pos, 2i+1) = cos(pos / 10000^(2i / dim))

    and applies it along both the width and the height axes of a 4-D
    feature map, each scaled by a learned per-channel weight, before
    flattening the spatial dims into a seq-first sequence.

    Args:
        dropout: dropout probability applied to the output.
        dim: the embedding (channel) dimension (required).
        max_len: maximum supported spatial extent (default=5000).

    Examples:
        >>> pos_encoder = PositionalEncoding_2d(dropout, dim)
    """

    def __init__(self, dropout, dim, max_len=5000):
        super(PositionalEncoding_2d, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Precompute the [max_len, dim] sin/cos table, stored as
        # [max_len, 1, dim] for broadcasting.
        pe = paddle.zeros([max_len, dim])
        position = paddle.arange(0, max_len, dtype=paddle.float32).unsqueeze(1)
        div_term = paddle.exp(
            paddle.arange(0, dim, 2).astype('float32') *
            (-math.log(10000.0) / dim))
        pe[:, 0::2] = paddle.sin(position * div_term)
        pe[:, 1::2] = paddle.cos(position * div_term)
        pe = paddle.transpose(paddle.unsqueeze(pe, 0), [1, 0, 2])
        self.register_buffer('pe', pe)

        self.avg_pool_1 = nn.AdaptiveAvgPool2D((1, 1))
        self.linear1 = nn.Linear(dim, dim)
        # NOTE(review): torch-style in-place init; paddle usually uses
        # weight.set_value - confirm this runs under the paddle version in use.
        self.linear1.weight.data.fill_(1.)
        self.avg_pool_2 = nn.AdaptiveAvgPool2D((1, 1))
        self.linear2 = nn.Linear(dim, dim)
        self.linear2.weight.data.fill_(1.)

    def forward(self, x):
        """Add 2-D positional encodings to ``x`` and return a flattened,
        seq-first sequence.

        Shape:
            x: 4-D feature map; assumed [batch, dim, H, W] — TODO confirm.
            output: [H * W?, batch, dim] after flattening — seq-first.
        """
        # Width-axis encoding, scaled by a learned per-channel gate.
        w_pe = self.pe[:paddle.shape(x)[-1], :]
        w1 = self.linear1(self.avg_pool_1(x).squeeze()).unsqueeze(0)
        w_pe = w_pe * w1
        w_pe = paddle.transpose(w_pe, [1, 2, 0])
        w_pe = paddle.unsqueeze(w_pe, 2)

        # Height-axis encoding.
        # BUG FIX: was `self.pe[:paddle.shape(x).shape[-2], :]`, which
        # indexes the shape of the 1-D shape tensor itself instead of the
        # height dimension of x; `paddle.shape(x)[-2]` mirrors the width
        # path above.
        h_pe = self.pe[:paddle.shape(x)[-2], :]
        w2 = self.linear2(self.avg_pool_2(x).squeeze()).unsqueeze(0)
        h_pe = h_pe * w2
        h_pe = paddle.transpose(h_pe, [1, 2, 0])
        h_pe = paddle.unsqueeze(h_pe, 3)

        x = x + w_pe + h_pe
        # Flatten the two spatial dims and move to seq-first layout.
        x = paddle.transpose(
            paddle.reshape(x,
                           [x.shape[0], x.shape[1], x.shape[2] * x.shape[3]]),
            [2, 0, 1])
        return self.dropout(x)
class Embeddings(nn.Layer):
    """Token embedding table with optional sqrt(d_model) scaling.

    The weights are re-initialised from N(0, d_model^-0.5), the scheme
    used in "Attention Is All You Need".

    Args:
        d_model: embedding dimension.
        vocab: vocabulary size.
        padding_idx: index whose embedding is kept at zero.
        scale_embedding: if truthy, multiply embeddings by sqrt(d_model).
    """

    def __init__(self, d_model, vocab, padding_idx, scale_embedding):
        super(Embeddings, self).__init__()
        self.embedding = nn.Embedding(vocab, d_model, padding_idx=padding_idx)
        init_weights = np.random.normal(
            0.0, d_model**-0.5, (vocab, d_model)).astype(np.float32)
        self.embedding.weight.set_value(init_weights)
        self.d_model = d_model
        self.scale_embedding = scale_embedding

    def forward(self, x):
        """Look up embeddings for ``x``, scaled when configured."""
        embedded = self.embedding(x)
        if self.scale_embedding:
            return embedded * math.sqrt(self.d_model)
        return embedded
class Beam():
    '''Beam search bookkeeping for a single source instance.

    Token id conventions visible in this class: 0 = pad, 2 = start-of-
    sequence, 3 = end-of-sequence.
    '''

    def __init__(self, size, device=False):
        # Beam width.
        self.size = size
        self._done = False
        # The score for each translation on the beam.
        self.scores = paddle.zeros((size, ), dtype=paddle.float32)
        self.all_scores = []
        # The backpointers at each time-step.
        self.prev_ks = []
        # The outputs at each time-step; seeded with <SOS> (id 2) in slot 0.
        self.next_ys = [paddle.full((size, ), 0, dtype=paddle.int64)]
        self.next_ys[0][0] = 2

    def get_current_state(self):
        "Get the outputs for the current timestep."
        return self.get_tentative_hypothesis()

    def get_current_origin(self):
        "Get the backpointers for the current timestep."
        return self.prev_ks[-1]

    @property
    def done(self):
        # True once the top-of-beam hypothesis has emitted <EOS>.
        return self._done

    def advance(self, word_prob):
        "Update beam status with one step of word probabilities; return done flag."
        num_words = word_prob.shape[1]

        # Sum the previous scores (skipped on the very first step, where
        # only beam 0 carries a real hypothesis).
        if len(self.prev_ks) > 0:
            beam_lk = word_prob + self.scores.unsqueeze(1).expand_as(word_prob)
        else:
            beam_lk = word_prob[0]

        flat_beam_lk = beam_lk.reshape([-1])
        # 1st sort: top-k over the flattened (beam x word) scores.
        best_scores, best_scores_id = flat_beam_lk.topk(self.size, 0, True,
                                                        True)
        self.all_scores.append(self.scores)
        self.scores = best_scores

        # bestScoresId is flattened as a (beam x word) array, so recover
        # which beam and which word each winning score came from.
        prev_k = best_scores_id // num_words
        self.prev_ks.append(prev_k)
        self.next_ys.append(best_scores_id - prev_k * num_words)

        # End condition is when top-of-beam is <EOS> (id 3).
        if self.next_ys[-1][0] == 3:
            self._done = True
            self.all_scores.append(self.scores)

        return self._done

    def sort_scores(self):
        "Sort the scores."
        order = paddle.to_tensor(
            list(range(int(self.scores.shape[0]))), dtype='int32')
        return self.scores, order

    def get_the_best_score_and_idx(self):
        "Get the score of the best in the beam."
        scores, ids = self.sort_scores()
        return scores[1], ids[1]

    def get_tentative_hypothesis(self):
        "Get the decoded sequence for the current timestep."
        if len(self.next_ys) == 1:
            return self.next_ys[0].unsqueeze(1)
        _, keys = self.sort_scores()
        hyps = [self.get_hypothesis(k) for k in keys]
        # Prepend <SOS> (id 2) to every hypothesis.
        hyps = [[2] + h for h in hyps]
        return paddle.to_tensor(hyps, dtype='int64')

    def get_hypothesis(self, k):
        """Walk the backpointers to reconstruct the full hypothesis for beam k."""
        hyp = []
        for j in range(len(self.prev_ks) - 1, -1, -1):
            hyp.append(self.next_ys[j + 1][k])
            k = self.prev_ks[j][k]
        # Steps were collected newest-first; reverse and unbox to ints.
        return [step.item() for step in hyp[::-1]]
Prev
1
…
6
7
8
9
10
11
12
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment