ModelZoo / PaddleOCR_paddle_onnxruntime / Commits

Commit f1506916, authored May 18, 2023 by sugon_cxj

    first commit

Parent: 55c28ed5
Pipeline #266: canceled with stages
Changes: 432 | Pipelines: 1
Showing 20 changed files with 1571 additions and 0 deletions (+1571, -0)
ppocr/modeling/heads/__pycache__/rec_aster_head.cpython-37.pyc  +0 -0
ppocr/modeling/heads/__pycache__/rec_att_head.cpython-37.pyc  +0 -0
ppocr/modeling/heads/__pycache__/rec_ctc_head.cpython-37.pyc  +0 -0
ppocr/modeling/heads/__pycache__/rec_multi_head.cpython-37.pyc  +0 -0
ppocr/modeling/heads/__pycache__/rec_nrtr_head.cpython-37.pyc  +0 -0
ppocr/modeling/heads/__pycache__/rec_pren_head.cpython-37.pyc  +0 -0
ppocr/modeling/heads/__pycache__/rec_sar_head.cpython-37.pyc  +0 -0
ppocr/modeling/heads/__pycache__/rec_srn_head.cpython-37.pyc  +0 -0
ppocr/modeling/heads/__pycache__/self_attention.cpython-37.pyc  +0 -0
ppocr/modeling/heads/__pycache__/table_att_head.cpython-37.pyc  +0 -0
ppocr/modeling/heads/cls_head.py  +52 -0
ppocr/modeling/heads/det_db_head.py  +118 -0
ppocr/modeling/heads/det_east_head.py  +121 -0
ppocr/modeling/heads/det_fce_head.py  +99 -0
ppocr/modeling/heads/det_pse_head.py  +37 -0
ppocr/modeling/heads/det_sast_head.py  +128 -0
ppocr/modeling/heads/e2e_pg_head.py  +253 -0
ppocr/modeling/heads/kie_sdmgr_head.py  +207 -0
ppocr/modeling/heads/multiheadAttention.py  +163 -0
ppocr/modeling/heads/rec_aster_head.py  +393 -0
ppocr/modeling/heads/__pycache__/rec_aster_head.cpython-37.pyc (new file, mode 100644): File added
ppocr/modeling/heads/__pycache__/rec_att_head.cpython-37.pyc (new file, mode 100644): File added
ppocr/modeling/heads/__pycache__/rec_ctc_head.cpython-37.pyc (new file, mode 100644): File added
ppocr/modeling/heads/__pycache__/rec_multi_head.cpython-37.pyc (new file, mode 100644): File added
ppocr/modeling/heads/__pycache__/rec_nrtr_head.cpython-37.pyc (new file, mode 100644): File added
ppocr/modeling/heads/__pycache__/rec_pren_head.cpython-37.pyc (new file, mode 100644): File added
ppocr/modeling/heads/__pycache__/rec_sar_head.cpython-37.pyc (new file, mode 100644): File added
ppocr/modeling/heads/__pycache__/rec_srn_head.cpython-37.pyc (new file, mode 100644): File added
ppocr/modeling/heads/__pycache__/self_attention.cpython-37.pyc (new file, mode 100644): File added
ppocr/modeling/heads/__pycache__/table_att_head.cpython-37.pyc (new file, mode 100644): File added
ppocr/modeling/heads/cls_head.py (new file, mode 100755)
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math

import paddle
from paddle import nn, ParamAttr
import paddle.nn.functional as F


class ClsHead(nn.Layer):
    """
    Class orientation

    Args:
        params(dict): super parameters for build Class network
    """

    def __init__(self, in_channels, class_dim, **kwargs):
        super(ClsHead, self).__init__()
        self.pool = nn.AdaptiveAvgPool2D(1)
        stdv = 1.0 / math.sqrt(in_channels * 1.0)
        self.fc = nn.Linear(
            in_channels,
            class_dim,
            weight_attr=ParamAttr(
                name="fc_0.w_0",
                initializer=nn.initializer.Uniform(-stdv, stdv)),
            bias_attr=ParamAttr(name="fc_0.b_0"), )

    def forward(self, x, targets=None):
        x = self.pool(x)
        x = paddle.reshape(x, shape=[x.shape[0], x.shape[1]])
        x = self.fc(x)
        if not self.training:
            x = F.softmax(x, axis=1)
        return x
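A minimal sketch of how ClsHead could be exercised, assuming a working Paddle install; the channel and class counts below are illustrative:

import paddle

# Hypothetical smoke test: ClsHead pools any H x W feature map to 1 x 1,
# flattens to [N, C], and applies the linear classifier.
head = ClsHead(in_channels=64, class_dim=2)
head.eval()                          # softmax is applied only outside training
feats = paddle.rand([1, 64, 3, 32])
probs = head(feats)                  # shape [1, 2]; each row sums to 1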
ppocr/modeling/heads/det_db_head.py (new file, mode 100755)
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math

import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import ParamAttr


def get_bias_attr(k):
    stdv = 1.0 / math.sqrt(k * 1.0)
    initializer = paddle.nn.initializer.Uniform(-stdv, stdv)
    bias_attr = ParamAttr(initializer=initializer)
    return bias_attr


class Head(nn.Layer):
    def __init__(self, in_channels, name_list, kernel_list=[3, 2, 2],
                 **kwargs):
        super(Head, self).__init__()

        self.conv1 = nn.Conv2D(
            in_channels=in_channels,
            out_channels=in_channels // 4,
            kernel_size=kernel_list[0],
            padding=int(kernel_list[0] // 2),
            weight_attr=ParamAttr(),
            bias_attr=False)
        self.conv_bn1 = nn.BatchNorm(
            num_channels=in_channels // 4,
            param_attr=ParamAttr(
                initializer=paddle.nn.initializer.Constant(value=1.0)),
            bias_attr=ParamAttr(
                initializer=paddle.nn.initializer.Constant(value=1e-4)),
            act='relu')
        self.conv2 = nn.Conv2DTranspose(
            in_channels=in_channels // 4,
            out_channels=in_channels // 4,
            kernel_size=kernel_list[1],
            stride=2,
            weight_attr=ParamAttr(
                initializer=paddle.nn.initializer.KaimingUniform()),
            bias_attr=get_bias_attr(in_channels // 4))
        self.conv_bn2 = nn.BatchNorm(
            num_channels=in_channels // 4,
            param_attr=ParamAttr(
                initializer=paddle.nn.initializer.Constant(value=1.0)),
            bias_attr=ParamAttr(
                initializer=paddle.nn.initializer.Constant(value=1e-4)),
            act="relu")
        self.conv3 = nn.Conv2DTranspose(
            in_channels=in_channels // 4,
            out_channels=1,
            kernel_size=kernel_list[2],
            stride=2,
            weight_attr=ParamAttr(
                initializer=paddle.nn.initializer.KaimingUniform()),
            bias_attr=get_bias_attr(in_channels // 4), )

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv_bn1(x)
        x = self.conv2(x)
        x = self.conv_bn2(x)
        x = self.conv3(x)
        x = F.sigmoid(x)
        return x


class DBHead(nn.Layer):
    """
    Differentiable Binarization (DB) for text detection:
        see https://arxiv.org/abs/1911.08947
    args:
        params(dict): super parameters for build DB network
    """

    def __init__(self, in_channels, k=50, **kwargs):
        super(DBHead, self).__init__()
        self.k = k
        binarize_name_list = [
            'conv2d_56', 'batch_norm_47', 'conv2d_transpose_0',
            'batch_norm_48', 'conv2d_transpose_1', 'binarize'
        ]
        thresh_name_list = [
            'conv2d_57', 'batch_norm_49', 'conv2d_transpose_2',
            'batch_norm_50', 'conv2d_transpose_3', 'thresh'
        ]
        self.binarize = Head(in_channels, binarize_name_list, **kwargs)
        self.thresh = Head(in_channels, thresh_name_list, **kwargs)

    def step_function(self, x, y):
        return paddle.reciprocal(1 + paddle.exp(-self.k * (x - y)))

    def forward(self, x, targets=None):
        shrink_maps = self.binarize(x)
        if not self.training:
            return {'maps': shrink_maps}

        threshold_maps = self.thresh(x)
        binary_maps = self.step_function(shrink_maps, threshold_maps)
        y = paddle.concat([shrink_maps, threshold_maps, binary_maps], axis=1)
        return {'maps': y}
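Note that step_function implements the approximate binarization B = 1 / (1 + exp(-k * (P - T))) from the linked DB paper, with amplification factor k = 50 by default. A minimal inference sketch, with hypothetical shapes:

import paddle

# Hypothetical smoke test: at inference DBHead returns only the shrink map;
# the two stride-2 transposed convolutions in Head upsample the features 4x.
head = DBHead(in_channels=256, k=50)
head.eval()
x = paddle.rand([1, 256, 40, 40])
out = head(x)
print(out['maps'].shape)             # [1, 1, 160, 160]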
ppocr/modeling/heads/det_east_head.py (new file, mode 100755)
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math

import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import ParamAttr


class ConvBNLayer(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 groups=1,
                 if_act=True,
                 act=None,
                 name=None):
        super(ConvBNLayer, self).__init__()
        self.if_act = if_act
        self.act = act
        self.conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            weight_attr=ParamAttr(name=name + '_weights'),
            bias_attr=False)
        self.bn = nn.BatchNorm(
            num_channels=out_channels,
            act=act,
            param_attr=ParamAttr(name="bn_" + name + "_scale"),
            bias_attr=ParamAttr(name="bn_" + name + "_offset"),
            moving_mean_name="bn_" + name + "_mean",
            moving_variance_name="bn_" + name + "_variance")

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        return x


class EASTHead(nn.Layer):
    """
    """

    def __init__(self, in_channels, model_name, **kwargs):
        super(EASTHead, self).__init__()
        self.model_name = model_name
        if self.model_name == "large":
            num_outputs = [128, 64, 1, 8]
        else:
            num_outputs = [64, 32, 1, 8]

        self.det_conv1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=num_outputs[0],
            kernel_size=3,
            stride=1,
            padding=1,
            if_act=True,
            act='relu',
            name="det_head1")
        self.det_conv2 = ConvBNLayer(
            in_channels=num_outputs[0],
            out_channels=num_outputs[1],
            kernel_size=3,
            stride=1,
            padding=1,
            if_act=True,
            act='relu',
            name="det_head2")
        self.score_conv = ConvBNLayer(
            in_channels=num_outputs[1],
            out_channels=num_outputs[2],
            kernel_size=1,
            stride=1,
            padding=0,
            if_act=False,
            act=None,
            name="f_score")
        self.geo_conv = ConvBNLayer(
            in_channels=num_outputs[1],
            out_channels=num_outputs[3],
            kernel_size=1,
            stride=1,
            padding=0,
            if_act=False,
            act=None,
            name="f_geo")

    def forward(self, x, targets=None):
        f_det = self.det_conv1(x)
        f_det = self.det_conv2(f_det)
        f_score = self.score_conv(f_det)
        f_score = F.sigmoid(f_score)
        f_geo = self.geo_conv(f_det)
        f_geo = (F.sigmoid(f_geo) - 0.5) * 2 * 800

        pred = {'f_score': f_score, 'f_geo': f_geo}
        return pred
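A minimal sketch with hypothetical shapes; every convolution here is stride 1, so both output maps keep the input resolution, and f_geo is rescaled from the sigmoid range into (-800, 800) by the final scaling line above:

import paddle

# Hypothetical smoke test for the EAST detection head.
head = EASTHead(in_channels=128, model_name="large")
head.eval()
pred = head(paddle.rand([1, 128, 32, 32]))
print(pred['f_score'].shape)         # [1, 1, 32, 32]
print(pred['f_geo'].shape)           # [1, 8, 32, 32]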
ppocr/modeling/heads/det_fce_head.py (new file, mode 100755)
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is refer from:
https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/dense_heads/fce_head.py
"""
from
paddle
import
nn
from
paddle
import
ParamAttr
import
paddle.nn.functional
as
F
from
paddle.nn.initializer
import
Normal
import
paddle
from
functools
import
partial
def
multi_apply
(
func
,
*
args
,
**
kwargs
):
pfunc
=
partial
(
func
,
**
kwargs
)
if
kwargs
else
func
map_results
=
map
(
pfunc
,
*
args
)
return
tuple
(
map
(
list
,
zip
(
*
map_results
)))
class
FCEHead
(
nn
.
Layer
):
"""The class for implementing FCENet head.
FCENet(CVPR2021): Fourier Contour Embedding for Arbitrary-shaped Text
Detection.
[https://arxiv.org/abs/2104.10442]
Args:
in_channels (int): The number of input channels.
scales (list[int]) : The scale of each layer.
fourier_degree (int) : The maximum Fourier transform degree k.
"""
def
__init__
(
self
,
in_channels
,
fourier_degree
=
5
):
super
().
__init__
()
assert
isinstance
(
in_channels
,
int
)
self
.
downsample_ratio
=
1.0
self
.
in_channels
=
in_channels
self
.
fourier_degree
=
fourier_degree
self
.
out_channels_cls
=
4
self
.
out_channels_reg
=
(
2
*
self
.
fourier_degree
+
1
)
*
2
self
.
out_conv_cls
=
nn
.
Conv2D
(
in_channels
=
self
.
in_channels
,
out_channels
=
self
.
out_channels_cls
,
kernel_size
=
3
,
stride
=
1
,
padding
=
1
,
groups
=
1
,
weight_attr
=
ParamAttr
(
name
=
'cls_weights'
,
initializer
=
Normal
(
mean
=
0.
,
std
=
0.01
)),
bias_attr
=
True
)
self
.
out_conv_reg
=
nn
.
Conv2D
(
in_channels
=
self
.
in_channels
,
out_channels
=
self
.
out_channels_reg
,
kernel_size
=
3
,
stride
=
1
,
padding
=
1
,
groups
=
1
,
weight_attr
=
ParamAttr
(
name
=
'reg_weights'
,
initializer
=
Normal
(
mean
=
0.
,
std
=
0.01
)),
bias_attr
=
True
)
def
forward
(
self
,
feats
,
targets
=
None
):
cls_res
,
reg_res
=
multi_apply
(
self
.
forward_single
,
feats
)
level_num
=
len
(
cls_res
)
outs
=
{}
if
not
self
.
training
:
for
i
in
range
(
level_num
):
tr_pred
=
F
.
softmax
(
cls_res
[
i
][:,
0
:
2
,
:,
:],
axis
=
1
)
tcl_pred
=
F
.
softmax
(
cls_res
[
i
][:,
2
:,
:,
:],
axis
=
1
)
outs
[
'level_{}'
.
format
(
i
)]
=
paddle
.
concat
(
[
tr_pred
,
tcl_pred
,
reg_res
[
i
]],
axis
=
1
)
else
:
preds
=
[[
cls_res
[
i
],
reg_res
[
i
]]
for
i
in
range
(
level_num
)]
outs
[
'levels'
]
=
preds
return
outs
def
forward_single
(
self
,
x
):
cls_predict
=
self
.
out_conv_cls
(
x
)
reg_predict
=
self
.
out_conv_reg
(
x
)
return
cls_predict
,
reg_predict
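At inference the head post-processes every pyramid level into 2 text-region plus 2 text-center-line probability channels plus (2 * fourier_degree + 1) * 2 regression channels (22 for the default degree of 5). A minimal sketch, with hypothetical shapes:

import paddle

# Hypothetical smoke test: multi_apply runs the head on each FPN level.
head = FCEHead(in_channels=256, fourier_degree=5)
head.eval()
feats = [paddle.rand([1, 256, s, s]) for s in (32, 16, 8)]
outs = head(feats)
print(outs['level_0'].shape)         # [1, 26, 32, 32]  (2 + 2 + 22 channels)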
ppocr/modeling/heads/det_pse_head.py (new file, mode 100755)
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is refer from:
https://github.com/whai362/PSENet/blob/python3/models/head/psenet_head.py
"""
from
paddle
import
nn
class
PSEHead
(
nn
.
Layer
):
def
__init__
(
self
,
in_channels
,
hidden_dim
=
256
,
out_channels
=
7
,
**
kwargs
):
super
(
PSEHead
,
self
).
__init__
()
self
.
conv1
=
nn
.
Conv2D
(
in_channels
,
hidden_dim
,
kernel_size
=
3
,
stride
=
1
,
padding
=
1
)
self
.
bn1
=
nn
.
BatchNorm2D
(
hidden_dim
)
self
.
relu1
=
nn
.
ReLU
()
self
.
conv2
=
nn
.
Conv2D
(
hidden_dim
,
out_channels
,
kernel_size
=
1
,
stride
=
1
,
padding
=
0
)
def
forward
(
self
,
x
,
**
kwargs
):
out
=
self
.
conv1
(
x
)
out
=
self
.
relu1
(
self
.
bn1
(
out
))
out
=
self
.
conv2
(
out
)
return
{
'maps'
:
out
}
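A minimal sketch, with hypothetical shapes; the head is a plain conv-BN-ReLU-conv stack, so the out_channels=7 kernel maps keep the spatial size of the input features:

import paddle

# Hypothetical smoke test for the PSENet head.
head = PSEHead(in_channels=256, hidden_dim=256, out_channels=7)
head.eval()
out = head(paddle.rand([1, 256, 184, 184]))
print(out['maps'].shape)             # [1, 7, 184, 184]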
ppocr/modeling/heads/det_sast_head.py (new file, mode 100755)
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math

import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import ParamAttr


class ConvBNLayer(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 groups=1,
                 if_act=True,
                 act=None,
                 name=None):
        super(ConvBNLayer, self).__init__()
        self.if_act = if_act
        self.act = act
        self.conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=(kernel_size - 1) // 2,
            groups=groups,
            weight_attr=ParamAttr(name=name + '_weights'),
            bias_attr=False)
        self.bn = nn.BatchNorm(
            num_channels=out_channels,
            act=act,
            param_attr=ParamAttr(name="bn_" + name + "_scale"),
            bias_attr=ParamAttr(name="bn_" + name + "_offset"),
            moving_mean_name="bn_" + name + "_mean",
            moving_variance_name="bn_" + name + "_variance")

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        return x


class SAST_Header1(nn.Layer):
    def __init__(self, in_channels, **kwargs):
        super(SAST_Header1, self).__init__()
        out_channels = [64, 64, 128]
        self.score_conv = nn.Sequential(
            ConvBNLayer(
                in_channels, out_channels[0], 1, 1, act='relu',
                name='f_score1'),
            ConvBNLayer(
                out_channels[0], out_channels[1], 3, 1, act='relu',
                name='f_score2'),
            ConvBNLayer(
                out_channels[1], out_channels[2], 1, 1, act='relu',
                name='f_score3'),
            ConvBNLayer(
                out_channels[2], 1, 3, 1, act=None, name='f_score4'))
        self.border_conv = nn.Sequential(
            ConvBNLayer(
                in_channels, out_channels[0], 1, 1, act='relu',
                name='f_border1'),
            ConvBNLayer(
                out_channels[0], out_channels[1], 3, 1, act='relu',
                name='f_border2'),
            ConvBNLayer(
                out_channels[1], out_channels[2], 1, 1, act='relu',
                name='f_border3'),
            ConvBNLayer(
                out_channels[2], 4, 3, 1, act=None, name='f_border4'))

    def forward(self, x):
        f_score = self.score_conv(x)
        f_score = F.sigmoid(f_score)
        f_border = self.border_conv(x)
        return f_score, f_border


class SAST_Header2(nn.Layer):
    def __init__(self, in_channels, **kwargs):
        super(SAST_Header2, self).__init__()
        out_channels = [64, 64, 128]
        self.tvo_conv = nn.Sequential(
            ConvBNLayer(
                in_channels, out_channels[0], 1, 1, act='relu',
                name='f_tvo1'),
            ConvBNLayer(
                out_channels[0], out_channels[1], 3, 1, act='relu',
                name='f_tvo2'),
            ConvBNLayer(
                out_channels[1], out_channels[2], 1, 1, act='relu',
                name='f_tvo3'),
            ConvBNLayer(
                out_channels[2], 8, 3, 1, act=None, name='f_tvo4'))
        self.tco_conv = nn.Sequential(
            ConvBNLayer(
                in_channels, out_channels[0], 1, 1, act='relu',
                name='f_tco1'),
            ConvBNLayer(
                out_channels[0], out_channels[1], 3, 1, act='relu',
                name='f_tco2'),
            ConvBNLayer(
                out_channels[1], out_channels[2], 1, 1, act='relu',
                name='f_tco3'),
            ConvBNLayer(
                out_channels[2], 2, 3, 1, act=None, name='f_tco4'))

    def forward(self, x):
        f_tvo = self.tvo_conv(x)
        f_tco = self.tco_conv(x)
        return f_tvo, f_tco


class SASTHead(nn.Layer):
    """
    """

    def __init__(self, in_channels, **kwargs):
        super(SASTHead, self).__init__()
        self.head1 = SAST_Header1(in_channels)
        self.head2 = SAST_Header2(in_channels)

    def forward(self, x, targets=None):
        f_score, f_border = self.head1(x)
        f_tvo, f_tco = self.head2(x)

        predicts = {}
        predicts['f_score'] = f_score
        predicts['f_border'] = f_border
        predicts['f_tvo'] = f_tvo
        predicts['f_tco'] = f_tco
        return predicts
\ No newline at end of file
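SASTHead just gathers the two sub-headers' stride-1 branches into one dict. A minimal sketch, with hypothetical shapes:

import paddle

# Hypothetical smoke test: all four maps keep the input resolution.
head = SASTHead(in_channels=128)
head.eval()
predicts = head(paddle.rand([1, 128, 32, 32]))
for k, v in predicts.items():
    print(k, v.shape)                # f_score: 1, f_border: 4, f_tvo: 8, f_tco: 2 channels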
ppocr/modeling/heads/e2e_pg_head.py (new file, mode 100755)
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math

import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import ParamAttr


class ConvBNLayer(nn.Layer):
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 groups=1,
                 if_act=True,
                 act=None,
                 name=None):
        super(ConvBNLayer, self).__init__()
        self.if_act = if_act
        self.act = act
        self.conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            weight_attr=ParamAttr(name=name + '_weights'),
            bias_attr=False)
        self.bn = nn.BatchNorm(
            num_channels=out_channels,
            act=act,
            param_attr=ParamAttr(name="bn_" + name + "_scale"),
            bias_attr=ParamAttr(name="bn_" + name + "_offset"),
            moving_mean_name="bn_" + name + "_mean",
            moving_variance_name="bn_" + name + "_variance",
            use_global_stats=False)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        return x


class PGHead(nn.Layer):
    """
    """

    def __init__(self, in_channels, **kwargs):
        super(PGHead, self).__init__()
        self.conv_f_score1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=64,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_score{}".format(1))
        self.conv_f_score2 = ConvBNLayer(
            in_channels=64,
            out_channels=64,
            kernel_size=3,
            stride=1,
            padding=1,
            act='relu',
            name="conv_f_score{}".format(2))
        self.conv_f_score3 = ConvBNLayer(
            in_channels=64,
            out_channels=128,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_score{}".format(3))

        self.conv1 = nn.Conv2D(
            in_channels=128,
            out_channels=1,
            kernel_size=3,
            stride=1,
            padding=1,
            groups=1,
            weight_attr=ParamAttr(name="conv_f_score{}".format(4)),
            bias_attr=False)

        self.conv_f_boder1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=64,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_boder{}".format(1))
        self.conv_f_boder2 = ConvBNLayer(
            in_channels=64,
            out_channels=64,
            kernel_size=3,
            stride=1,
            padding=1,
            act='relu',
            name="conv_f_boder{}".format(2))
        self.conv_f_boder3 = ConvBNLayer(
            in_channels=64,
            out_channels=128,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_boder{}".format(3))
        self.conv2 = nn.Conv2D(
            in_channels=128,
            out_channels=4,
            kernel_size=3,
            stride=1,
            padding=1,
            groups=1,
            weight_attr=ParamAttr(name="conv_f_boder{}".format(4)),
            bias_attr=False)

        self.conv_f_char1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=128,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_char{}".format(1))
        self.conv_f_char2 = ConvBNLayer(
            in_channels=128,
            out_channels=128,
            kernel_size=3,
            stride=1,
            padding=1,
            act='relu',
            name="conv_f_char{}".format(2))
        self.conv_f_char3 = ConvBNLayer(
            in_channels=128,
            out_channels=256,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_char{}".format(3))
        self.conv_f_char4 = ConvBNLayer(
            in_channels=256,
            out_channels=256,
            kernel_size=3,
            stride=1,
            padding=1,
            act='relu',
            name="conv_f_char{}".format(4))
        self.conv_f_char5 = ConvBNLayer(
            in_channels=256,
            out_channels=256,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_char{}".format(5))
        self.conv3 = nn.Conv2D(
            in_channels=256,
            out_channels=37,
            kernel_size=3,
            stride=1,
            padding=1,
            groups=1,
            weight_attr=ParamAttr(name="conv_f_char{}".format(6)),
            bias_attr=False)

        self.conv_f_direc1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=64,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_direc{}".format(1))
        self.conv_f_direc2 = ConvBNLayer(
            in_channels=64,
            out_channels=64,
            kernel_size=3,
            stride=1,
            padding=1,
            act='relu',
            name="conv_f_direc{}".format(2))
        self.conv_f_direc3 = ConvBNLayer(
            in_channels=64,
            out_channels=128,
            kernel_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_f_direc{}".format(3))
        self.conv4 = nn.Conv2D(
            in_channels=128,
            out_channels=2,
            kernel_size=3,
            stride=1,
            padding=1,
            groups=1,
            weight_attr=ParamAttr(name="conv_f_direc{}".format(4)),
            bias_attr=False)

    def forward(self, x, targets=None):
        f_score = self.conv_f_score1(x)
        f_score = self.conv_f_score2(f_score)
        f_score = self.conv_f_score3(f_score)
        f_score = self.conv1(f_score)
        f_score = F.sigmoid(f_score)

        # f_border
        f_border = self.conv_f_boder1(x)
        f_border = self.conv_f_boder2(f_border)
        f_border = self.conv_f_boder3(f_border)
        f_border = self.conv2(f_border)

        f_char = self.conv_f_char1(x)
        f_char = self.conv_f_char2(f_char)
        f_char = self.conv_f_char3(f_char)
        f_char = self.conv_f_char4(f_char)
        f_char = self.conv_f_char5(f_char)
        f_char = self.conv3(f_char)

        f_direction = self.conv_f_direc1(x)
        f_direction = self.conv_f_direc2(f_direction)
        f_direction = self.conv_f_direc3(f_direction)
        f_direction = self.conv4(f_direction)

        predicts = {}
        predicts['f_score'] = f_score
        predicts['f_border'] = f_border
        predicts['f_char'] = f_char
        predicts['f_direction'] = f_direction
        return predicts
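PGHead runs four parallel stride-1 branches over the shared feature map. A minimal sketch, with hypothetical shapes, showing the per-branch channel counts (1 score, 4 border, 37 character logits, 2 direction):

import paddle

# Hypothetical smoke test for the PGNet end-to-end head.
head = PGHead(in_channels=128)
head.eval()
predicts = head(paddle.rand([1, 128, 32, 32]))
print(predicts['f_score'].shape)     # [1, 1, 32, 32]
print(predicts['f_char'].shape)      # [1, 37, 32, 32]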
ppocr/modeling/heads/kie_sdmgr_head.py (new file, mode 100755)
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# reference from : https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/kie/heads/sdmgr_head.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math

import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import ParamAttr


class SDMGRHead(nn.Layer):
    def __init__(self,
                 in_channels,
                 num_chars=92,
                 visual_dim=16,
                 fusion_dim=1024,
                 node_input=32,
                 node_embed=256,
                 edge_input=5,
                 edge_embed=256,
                 num_gnn=2,
                 num_classes=26,
                 bidirectional=False):
        super().__init__()

        self.fusion = Block([visual_dim, node_embed], node_embed, fusion_dim)
        self.node_embed = nn.Embedding(num_chars, node_input, 0)
        hidden = node_embed // 2 if bidirectional else node_embed
        self.rnn = nn.LSTM(
            input_size=node_input, hidden_size=hidden, num_layers=1)
        self.edge_embed = nn.Linear(edge_input, edge_embed)
        self.gnn_layers = nn.LayerList(
            [GNNLayer(node_embed, edge_embed) for _ in range(num_gnn)])
        self.node_cls = nn.Linear(node_embed, num_classes)
        self.edge_cls = nn.Linear(edge_embed, 2)

    def forward(self, input, targets):
        relations, texts, x = input
        node_nums, char_nums = [], []
        for text in texts:
            node_nums.append(text.shape[0])
            char_nums.append(paddle.sum((text > -1).astype(int), axis=-1))

        max_num = max([char_num.max() for char_num in char_nums])
        all_nodes = paddle.concat([
            paddle.concat(
                [text, paddle.zeros((text.shape[0],
                                     max_num - text.shape[1]))], -1)
            for text in texts
        ])
        temp = paddle.clip(all_nodes, min=0).astype(int)
        embed_nodes = self.node_embed(temp)
        rnn_nodes, _ = self.rnn(embed_nodes)

        b, h, w = rnn_nodes.shape
        nodes = paddle.zeros([b, w])
        all_nums = paddle.concat(char_nums)
        valid = paddle.nonzero((all_nums > 0).astype(int))
        temp_all_nums = (paddle.gather(all_nums, valid) -
                         1).unsqueeze(-1).unsqueeze(-1)
        temp_all_nums = paddle.expand(temp_all_nums, [
            temp_all_nums.shape[0], temp_all_nums.shape[1],
            rnn_nodes.shape[-1]
        ])
        temp_all_nodes = paddle.gather(rnn_nodes, valid)
        N, C, A = temp_all_nodes.shape
        one_hot = F.one_hot(
            temp_all_nums[:, 0, :], num_classes=C).transpose([0, 2, 1])
        one_hot = paddle.multiply(
            temp_all_nodes, one_hot.astype("float32")).sum(
                axis=1, keepdim=True)
        t = one_hot.expand([N, 1, A]).squeeze(1)
        nodes = paddle.scatter(nodes, valid.squeeze(1), t)

        if x is not None:
            nodes = self.fusion([x, nodes])

        all_edges = paddle.concat(
            [rel.reshape([-1, rel.shape[-1]]) for rel in relations])
        embed_edges = self.edge_embed(all_edges.astype('float32'))
        embed_edges = F.normalize(embed_edges)

        for gnn_layer in self.gnn_layers:
            nodes, cat_nodes = gnn_layer(nodes, embed_edges, node_nums)

        node_cls, edge_cls = self.node_cls(nodes), self.edge_cls(cat_nodes)
        return node_cls, edge_cls


class GNNLayer(nn.Layer):
    def __init__(self, node_dim=256, edge_dim=256):
        super().__init__()
        self.in_fc = nn.Linear(node_dim * 2 + edge_dim, node_dim)
        self.coef_fc = nn.Linear(node_dim, 1)
        self.out_fc = nn.Linear(node_dim, node_dim)
        self.relu = nn.ReLU()

    def forward(self, nodes, edges, nums):
        start, cat_nodes = 0, []
        for num in nums:
            sample_nodes = nodes[start:start + num]
            cat_nodes.append(
                paddle.concat([
                    paddle.expand(sample_nodes.unsqueeze(1), [-1, num, -1]),
                    paddle.expand(sample_nodes.unsqueeze(0), [num, -1, -1])
                ], -1).reshape([num**2, -1]))
            start += num
        cat_nodes = paddle.concat([paddle.concat(cat_nodes), edges], -1)
        cat_nodes = self.relu(self.in_fc(cat_nodes))
        coefs = self.coef_fc(cat_nodes)

        start, residuals = 0, []
        for num in nums:
            residual = F.softmax(-paddle.eye(num).unsqueeze(-1) * 1e9 +
                                 coefs[start:start + num**2].reshape(
                                     [num, num, -1]), 1)
            residuals.append((residual * cat_nodes[start:start + num**2]
                              .reshape([num, num, -1])).sum(1))
            start += num**2

        nodes += self.relu(self.out_fc(paddle.concat(residuals)))
        return [nodes, cat_nodes]


class Block(nn.Layer):
    def __init__(self,
                 input_dims,
                 output_dim,
                 mm_dim=1600,
                 chunks=20,
                 rank=15,
                 shared=False,
                 dropout_input=0.,
                 dropout_pre_lin=0.,
                 dropout_output=0.,
                 pos_norm='before_cat'):
        super().__init__()
        self.rank = rank
        self.dropout_input = dropout_input
        self.dropout_pre_lin = dropout_pre_lin
        self.dropout_output = dropout_output
        assert (pos_norm in ['before_cat', 'after_cat'])
        self.pos_norm = pos_norm
        # Modules
        self.linear0 = nn.Linear(input_dims[0], mm_dim)
        self.linear1 = (self.linear0
                        if shared else nn.Linear(input_dims[1], mm_dim))
        self.merge_linears0 = nn.LayerList()
        self.merge_linears1 = nn.LayerList()
        self.chunks = self.chunk_sizes(mm_dim, chunks)
        for size in self.chunks:
            ml0 = nn.Linear(size, size * rank)
            self.merge_linears0.append(ml0)
            ml1 = ml0 if shared else nn.Linear(size, size * rank)
            self.merge_linears1.append(ml1)
        self.linear_out = nn.Linear(mm_dim, output_dim)

    def forward(self, x):
        x0 = self.linear0(x[0])
        x1 = self.linear1(x[1])
        bs = x1.shape[0]
        if self.dropout_input > 0:
            x0 = F.dropout(x0, p=self.dropout_input, training=self.training)
            x1 = F.dropout(x1, p=self.dropout_input, training=self.training)
        x0_chunks = paddle.split(x0, self.chunks, -1)
        x1_chunks = paddle.split(x1, self.chunks, -1)
        zs = []
        for x0_c, x1_c, m0, m1 in zip(x0_chunks, x1_chunks,
                                      self.merge_linears0,
                                      self.merge_linears1):
            m = m0(x0_c) * m1(x1_c)  # bs x split_size*rank
            m = m.reshape([bs, self.rank, -1])
            z = paddle.sum(m, 1)
            if self.pos_norm == 'before_cat':
                z = paddle.sqrt(F.relu(z)) - paddle.sqrt(F.relu(-z))
                z = F.normalize(z)
            zs.append(z)
        z = paddle.concat(zs, 1)
        if self.pos_norm == 'after_cat':
            z = paddle.sqrt(F.relu(z)) - paddle.sqrt(F.relu(-z))
            z = F.normalize(z)

        if self.dropout_pre_lin > 0:
            z = F.dropout(z, p=self.dropout_pre_lin, training=self.training)
        z = self.linear_out(z)
        if self.dropout_output > 0:
            z = F.dropout(z, p=self.dropout_output, training=self.training)
        return z

    def chunk_sizes(self, dim, chunks):
        split_size = (dim + chunks - 1) // chunks
        sizes_list = [split_size] * chunks
        sizes_list[-1] = sizes_list[-1] - (sum(sizes_list) - dim)
        return sizes_list
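Block is a chunked low-rank bilinear fusion module: both inputs are projected to mm_dim, split into chunks, multiplied chunk-wise through rank-expanding linears, rank-summed, signed-sqrt normalized, and projected to output_dim. A minimal sketch, with hypothetical shapes, matching the dimensions SDMGRHead passes in (visual_dim=16 fused with node_embed=256):

import paddle

# Hypothetical smoke test of the fusion block in isolation.
block = Block(input_dims=[16, 256], output_dim=256, mm_dim=1600)
block.eval()
z = block([paddle.rand([8, 16]), paddle.rand([8, 256])])
print(z.shape)                       # [8, 256]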
ppocr/modeling/heads/multiheadAttention.py (new file, mode 100755)
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle.nn import Linear
from paddle.nn.initializer import XavierUniform as xavier_uniform_
from paddle.nn.initializer import Constant as constant_
from paddle.nn.initializer import XavierNormal as xavier_normal_

zeros_ = constant_(value=0.)
ones_ = constant_(value=1.)


class MultiheadAttention(nn.Layer):
    """Allows the model to jointly attend to information
    from different representation subspaces.
    See reference: Attention Is All You Need

    .. math::
        \\text{MultiHead}(Q, K, V) = \\text{Concat}(head_1, \\dots, head_h)W^O
        \\text{where } head_i = \\text{Attention}(QW_i^Q, KW_i^K, VW_i^V)

    Args:
        embed_dim: total dimension of the model
        num_heads: parallel attention layers, or heads
    """

    def __init__(self,
                 embed_dim,
                 num_heads,
                 dropout=0.,
                 bias=True,
                 add_bias_kv=False,
                 add_zero_attn=False):
        super(MultiheadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, \
            "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim**-0.5
        self.out_proj = Linear(embed_dim, embed_dim, bias_attr=bias)
        self._reset_parameters()
        self.conv1 = paddle.nn.Conv2D(
            in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))
        self.conv2 = paddle.nn.Conv2D(
            in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))
        self.conv3 = paddle.nn.Conv2D(
            in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))

    def _reset_parameters(self):
        xavier_uniform_(self.out_proj.weight)

    def forward(self,
                query,
                key,
                value,
                key_padding_mask=None,
                incremental_state=None,
                attn_mask=None):
        """
        Inputs of forward function
            query: [target length, batch size, embed dim]
            key: [sequence length, batch size, embed dim]
            value: [sequence length, batch size, embed dim]
            key_padding_mask: if True, mask padding based on batch size
            incremental_state: if provided, previous time steps are cached
            need_weights: output attn_output_weights
            static_kv: key and value are static

        Outputs of forward function
            attn_output: [target length, batch size, embed dim]
            attn_output_weights: [batch size, target length, sequence length]
        """
        q_shape = paddle.shape(query)
        src_shape = paddle.shape(key)
        q = self._in_proj_q(query)
        k = self._in_proj_k(key)
        v = self._in_proj_v(value)
        q *= self.scaling
        q = paddle.transpose(
            paddle.reshape(
                q, [q_shape[0], q_shape[1], self.num_heads, self.head_dim]),
            [1, 2, 0, 3])
        k = paddle.transpose(
            paddle.reshape(
                k, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]),
            [1, 2, 0, 3])
        v = paddle.transpose(
            paddle.reshape(
                v, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]),
            [1, 2, 0, 3])
        if key_padding_mask is not None:
            assert key_padding_mask.shape[0] == q_shape[1]
            assert key_padding_mask.shape[1] == src_shape[0]
        attn_output_weights = paddle.matmul(q,
                                            paddle.transpose(k, [0, 1, 3, 2]))
        if attn_mask is not None:
            attn_mask = paddle.unsqueeze(paddle.unsqueeze(attn_mask, 0), 0)
            attn_output_weights += attn_mask
        if key_padding_mask is not None:
            attn_output_weights = paddle.reshape(
                attn_output_weights,
                [q_shape[1], self.num_heads, q_shape[0], src_shape[0]])
            key = paddle.unsqueeze(paddle.unsqueeze(key_padding_mask, 1), 2)
            key = paddle.cast(key, 'float32')
            y = paddle.full(
                shape=paddle.shape(key), dtype='float32', fill_value='-inf')
            y = paddle.where(key == 0., key, y)
            attn_output_weights += y
        attn_output_weights = F.softmax(
            attn_output_weights.astype('float32'),
            axis=-1,
            dtype=paddle.float32
            if attn_output_weights.dtype == paddle.float16 else
            attn_output_weights.dtype)
        attn_output_weights = F.dropout(
            attn_output_weights, p=self.dropout, training=self.training)

        attn_output = paddle.matmul(attn_output_weights, v)
        attn_output = paddle.reshape(
            paddle.transpose(attn_output, [2, 0, 1, 3]),
            [q_shape[0], q_shape[1], self.embed_dim])
        attn_output = self.out_proj(attn_output)

        return attn_output

    def _in_proj_q(self, query):
        query = paddle.transpose(query, [1, 2, 0])
        query = paddle.unsqueeze(query, axis=2)
        res = self.conv1(query)
        res = paddle.squeeze(res, axis=2)
        res = paddle.transpose(res, [2, 0, 1])
        return res

    def _in_proj_k(self, key):
        key = paddle.transpose(key, [1, 2, 0])
        key = paddle.unsqueeze(key, axis=2)
        res = self.conv2(key)
        res = paddle.squeeze(res, axis=2)
        res = paddle.transpose(res, [2, 0, 1])
        return res

    def _in_proj_v(self, value):
        value = paddle.transpose(value, [1, 2, 0])  # (1, 2, 0)
        value = paddle.unsqueeze(value, axis=2)
        res = self.conv3(value)
        res = paddle.squeeze(res, axis=2)
        res = paddle.transpose(res, [2, 0, 1])
        return res
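A minimal sketch of driving this module, with hypothetical shapes; note the [sequence length, batch size, embed dim] layout documented above, and that the q/k/v projections are implemented here as 1x1 convolutions rather than the usual linear layers:

import paddle

# Hypothetical smoke test for the multi-head attention layer.
attn = MultiheadAttention(embed_dim=64, num_heads=8)
attn.eval()
q = paddle.rand([10, 2, 64])         # target length 10, batch 2
kv = paddle.rand([20, 2, 64])        # source length 20
out = attn(q, kv, kv)
print(out.shape)                     # [10, 2, 64]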
ppocr/modeling/heads/rec_aster_head.py (new file, mode 100755)
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is refer from:
https://github.com/ayumiymk/aster.pytorch/blob/master/lib/models/attention_recognition_head.py
"""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
sys
import
paddle
from
paddle
import
nn
from
paddle.nn
import
functional
as
F
class
AsterHead
(
nn
.
Layer
):
def
__init__
(
self
,
in_channels
,
out_channels
,
sDim
,
attDim
,
max_len_labels
,
time_step
=
25
,
beam_width
=
5
,
**
kwargs
):
super
(
AsterHead
,
self
).
__init__
()
self
.
num_classes
=
out_channels
self
.
in_planes
=
in_channels
self
.
sDim
=
sDim
self
.
attDim
=
attDim
self
.
max_len_labels
=
max_len_labels
self
.
decoder
=
AttentionRecognitionHead
(
in_channels
,
out_channels
,
sDim
,
attDim
,
max_len_labels
)
self
.
time_step
=
time_step
self
.
embeder
=
Embedding
(
self
.
time_step
,
in_channels
)
self
.
beam_width
=
beam_width
self
.
eos
=
self
.
num_classes
-
3
def
forward
(
self
,
x
,
targets
=
None
,
embed
=
None
):
return_dict
=
{}
embedding_vectors
=
self
.
embeder
(
x
)
if
self
.
training
:
rec_targets
,
rec_lengths
,
_
=
targets
rec_pred
=
self
.
decoder
([
x
,
rec_targets
,
rec_lengths
],
embedding_vectors
)
return_dict
[
'rec_pred'
]
=
rec_pred
return_dict
[
'embedding_vectors'
]
=
embedding_vectors
else
:
rec_pred
,
rec_pred_scores
=
self
.
decoder
.
beam_search
(
x
,
self
.
beam_width
,
self
.
eos
,
embedding_vectors
)
return_dict
[
'rec_pred'
]
=
rec_pred
return_dict
[
'rec_pred_scores'
]
=
rec_pred_scores
return_dict
[
'embedding_vectors'
]
=
embedding_vectors
return
return_dict
class
Embedding
(
nn
.
Layer
):
def
__init__
(
self
,
in_timestep
,
in_planes
,
mid_dim
=
4096
,
embed_dim
=
300
):
super
(
Embedding
,
self
).
__init__
()
self
.
in_timestep
=
in_timestep
self
.
in_planes
=
in_planes
self
.
embed_dim
=
embed_dim
self
.
mid_dim
=
mid_dim
self
.
eEmbed
=
nn
.
Linear
(
in_timestep
*
in_planes
,
self
.
embed_dim
)
# Embed encoder output to a word-embedding like
def
forward
(
self
,
x
):
x
=
paddle
.
reshape
(
x
,
[
paddle
.
shape
(
x
)[
0
],
-
1
])
x
=
self
.
eEmbed
(
x
)
return
x
class
AttentionRecognitionHead
(
nn
.
Layer
):
"""
input: [b x 16 x 64 x in_planes]
output: probability sequence: [b x T x num_classes]
"""
def
__init__
(
self
,
in_channels
,
out_channels
,
sDim
,
attDim
,
max_len_labels
):
super
(
AttentionRecognitionHead
,
self
).
__init__
()
self
.
num_classes
=
out_channels
# this is the output classes. So it includes the <EOS>.
self
.
in_planes
=
in_channels
self
.
sDim
=
sDim
self
.
attDim
=
attDim
self
.
max_len_labels
=
max_len_labels
self
.
decoder
=
DecoderUnit
(
sDim
=
sDim
,
xDim
=
in_channels
,
yDim
=
self
.
num_classes
,
attDim
=
attDim
)
def
forward
(
self
,
x
,
embed
):
x
,
targets
,
lengths
=
x
batch_size
=
paddle
.
shape
(
x
)[
0
]
# Decoder
state
=
self
.
decoder
.
get_initial_state
(
embed
)
outputs
=
[]
for
i
in
range
(
max
(
lengths
)):
if
i
==
0
:
y_prev
=
paddle
.
full
(
shape
=
[
batch_size
],
fill_value
=
self
.
num_classes
)
else
:
y_prev
=
targets
[:,
i
-
1
]
output
,
state
=
self
.
decoder
(
x
,
state
,
y_prev
)
outputs
.
append
(
output
)
outputs
=
paddle
.
concat
([
_
.
unsqueeze
(
1
)
for
_
in
outputs
],
1
)
return
outputs
# inference stage.
def
sample
(
self
,
x
):
x
,
_
,
_
=
x
batch_size
=
x
.
size
(
0
)
# Decoder
state
=
paddle
.
zeros
([
1
,
batch_size
,
self
.
sDim
])
predicted_ids
,
predicted_scores
=
[],
[]
for
i
in
range
(
self
.
max_len_labels
):
if
i
==
0
:
y_prev
=
paddle
.
full
(
shape
=
[
batch_size
],
fill_value
=
self
.
num_classes
)
else
:
y_prev
=
predicted
output
,
state
=
self
.
decoder
(
x
,
state
,
y_prev
)
output
=
F
.
softmax
(
output
,
axis
=
1
)
score
,
predicted
=
output
.
max
(
1
)
predicted_ids
.
append
(
predicted
.
unsqueeze
(
1
))
predicted_scores
.
append
(
score
.
unsqueeze
(
1
))
predicted_ids
=
paddle
.
concat
([
predicted_ids
,
1
])
predicted_scores
=
paddle
.
concat
([
predicted_scores
,
1
])
# return predicted_ids.squeeze(), predicted_scores.squeeze()
return
predicted_ids
,
predicted_scores
def
beam_search
(
self
,
x
,
beam_width
,
eos
,
embed
):
def
_inflate
(
tensor
,
times
,
dim
):
repeat_dims
=
[
1
]
*
tensor
.
dim
()
repeat_dims
[
dim
]
=
times
output
=
paddle
.
tile
(
tensor
,
repeat_dims
)
return
output
# https://github.com/IBM/pytorch-seq2seq/blob/fede87655ddce6c94b38886089e05321dc9802af/seq2seq/models/TopKDecoder.py
batch_size
,
l
,
d
=
x
.
shape
x
=
paddle
.
tile
(
paddle
.
transpose
(
x
.
unsqueeze
(
1
),
perm
=
[
1
,
0
,
2
,
3
]),
[
beam_width
,
1
,
1
,
1
])
inflated_encoder_feats
=
paddle
.
reshape
(
paddle
.
transpose
(
x
,
perm
=
[
1
,
0
,
2
,
3
]),
[
-
1
,
l
,
d
])
# Initialize the decoder
state
=
self
.
decoder
.
get_initial_state
(
embed
,
tile_times
=
beam_width
)
pos_index
=
paddle
.
reshape
(
paddle
.
arange
(
batch_size
)
*
beam_width
,
shape
=
[
-
1
,
1
])
# Initialize the scores
sequence_scores
=
paddle
.
full
(
shape
=
[
batch_size
*
beam_width
,
1
],
fill_value
=-
float
(
'Inf'
))
index
=
[
i
*
beam_width
for
i
in
range
(
0
,
batch_size
)]
sequence_scores
[
index
]
=
0.0
# Initialize the input vector
y_prev
=
paddle
.
full
(
shape
=
[
batch_size
*
beam_width
],
fill_value
=
self
.
num_classes
)
# Store decisions for backtracking
stored_scores
=
list
()
stored_predecessors
=
list
()
stored_emitted_symbols
=
list
()
for
i
in
range
(
self
.
max_len_labels
):
output
,
state
=
self
.
decoder
(
inflated_encoder_feats
,
state
,
y_prev
)
state
=
paddle
.
unsqueeze
(
state
,
axis
=
0
)
log_softmax_output
=
paddle
.
nn
.
functional
.
log_softmax
(
output
,
axis
=
1
)
sequence_scores
=
_inflate
(
sequence_scores
,
self
.
num_classes
,
1
)
sequence_scores
+=
log_softmax_output
scores
,
candidates
=
paddle
.
topk
(
paddle
.
reshape
(
sequence_scores
,
[
batch_size
,
-
1
]),
beam_width
,
axis
=
1
)
# Reshape input = (bk, 1) and sequence_scores = (bk, 1)
y_prev
=
paddle
.
reshape
(
candidates
%
self
.
num_classes
,
shape
=
[
batch_size
*
beam_width
])
sequence_scores
=
paddle
.
reshape
(
scores
,
shape
=
[
batch_size
*
beam_width
,
1
])
# Update fields for next timestep
pos_index
=
paddle
.
expand_as
(
pos_index
,
candidates
)
predecessors
=
paddle
.
cast
(
candidates
/
self
.
num_classes
+
pos_index
,
dtype
=
'int64'
)
predecessors
=
paddle
.
reshape
(
predecessors
,
shape
=
[
batch_size
*
beam_width
,
1
])
state
=
paddle
.
index_select
(
state
,
index
=
predecessors
.
squeeze
(),
axis
=
1
)
# Update sequence socres and erase scores for <eos> symbol so that they aren't expanded
stored_scores
.
append
(
sequence_scores
.
clone
())
y_prev
=
paddle
.
reshape
(
y_prev
,
shape
=
[
-
1
,
1
])
eos_prev
=
paddle
.
full_like
(
y_prev
,
fill_value
=
eos
)
mask
=
eos_prev
==
y_prev
mask
=
paddle
.
nonzero
(
mask
)
if
mask
.
dim
()
>
0
:
sequence_scores
=
sequence_scores
.
numpy
()
mask
=
mask
.
numpy
()
sequence_scores
[
mask
]
=
-
float
(
'inf'
)
sequence_scores
=
paddle
.
to_tensor
(
sequence_scores
)
# Cache results for backtracking
stored_predecessors
.
append
(
predecessors
)
y_prev
=
paddle
.
squeeze
(
y_prev
)
stored_emitted_symbols
.
append
(
y_prev
)
# Do backtracking to return the optimal values
#====== backtrak ======#
# Initialize return variables given different types
p
=
list
()
l
=
[[
self
.
max_len_labels
]
*
beam_width
for
_
in
range
(
batch_size
)
]
# Placeholder for lengths of top-k sequences
# the last step output of the beams are not sorted
# thus they are sorted here
sorted_score
,
sorted_idx
=
paddle
.
topk
(
paddle
.
reshape
(
stored_scores
[
-
1
],
shape
=
[
batch_size
,
beam_width
]),
beam_width
)
# initialize the sequence scores with the sorted last step beam scores
s
=
sorted_score
.
clone
()
batch_eos_found
=
[
0
]
*
batch_size
# the number of EOS found
# in the backward loop below for each batch
t
=
self
.
max_len_labels
-
1
# initialize the back pointer with the sorted order of the last step beams.
# add pos_index for indexing variable with b*k as the first dimension.
t_predecessors
=
paddle
.
reshape
(
sorted_idx
+
pos_index
.
expand_as
(
sorted_idx
),
shape
=
[
batch_size
*
beam_width
])
while
t
>=
0
:
# Re-order the variables with the back pointer
current_symbol
=
paddle
.
index_select
(
stored_emitted_symbols
[
t
],
index
=
t_predecessors
,
axis
=
0
)
t_predecessors
=
paddle
.
index_select
(
stored_predecessors
[
t
].
squeeze
(),
index
=
t_predecessors
,
axis
=
0
)
eos_indices
=
stored_emitted_symbols
[
t
]
==
eos
eos_indices
=
paddle
.
nonzero
(
eos_indices
)
if
eos_indices
.
dim
()
>
0
:
for
i
in
range
(
eos_indices
.
shape
[
0
]
-
1
,
-
1
,
-
1
):
# Indices of the EOS symbol for both variables
# with b*k as the first dimension, and b, k for
# the first two dimensions
idx
=
eos_indices
[
i
]
b_idx
=
int
(
idx
[
0
]
/
beam_width
)
# The indices of the replacing position
# according to the replacement strategy noted above
res_k_idx
=
beam_width
-
(
batch_eos_found
[
b_idx
]
%
beam_width
)
-
1
batch_eos_found
[
b_idx
]
+=
1
res_idx
=
b_idx
*
beam_width
+
res_k_idx
# Replace the old information in return variables
# with the new ended sequence information
t_predecessors
[
res_idx
]
=
stored_predecessors
[
t
][
idx
[
0
]]
current_symbol
[
res_idx
]
=
stored_emitted_symbols
[
t
][
idx
[
0
]]
s
[
b_idx
,
res_k_idx
]
=
stored_scores
[
t
][
idx
[
0
],
0
]
l
[
b_idx
][
res_k_idx
]
=
t
+
1
# record the back tracked results
p
.
append
(
current_symbol
)
t
-=
1
# Sort and re-order again as the added ended sequences may change
# the order (very unlikely)
s
,
re_sorted_idx
=
s
.
topk
(
beam_width
)
for
b_idx
in
range
(
batch_size
):
l
[
b_idx
]
=
[
l
[
b_idx
][
k_idx
.
item
()]
for
k_idx
in
re_sorted_idx
[
b_idx
,
:]
]
re_sorted_idx
=
paddle
.
reshape
(
re_sorted_idx
+
pos_index
.
expand_as
(
re_sorted_idx
),
[
batch_size
*
beam_width
])
# Reverse the sequences and re-order at the same time
# It is reversed because the backtracking happens in reverse time order
p
=
[
paddle
.
reshape
(
paddle
.
index_select
(
step
,
re_sorted_idx
,
0
),
shape
=
[
batch_size
,
beam_width
,
-
1
])
for
step
in
reversed
(
p
)
]
p
=
paddle
.
concat
(
p
,
-
1
)[:,
0
,
:]
return
p
,
paddle
.
ones_like
(
p
)
class
AttentionUnit
(
nn
.
Layer
):
def
__init__
(
self
,
sDim
,
xDim
,
attDim
):
super
(
AttentionUnit
,
self
).
__init__
()
self
.
sDim
=
sDim
self
.
xDim
=
xDim
self
.
attDim
=
attDim
self
.
sEmbed
=
nn
.
Linear
(
sDim
,
attDim
)
self
.
xEmbed
=
nn
.
Linear
(
xDim
,
attDim
)
self
.
wEmbed
=
nn
.
Linear
(
attDim
,
1
)
def
forward
(
self
,
x
,
sPrev
):
batch_size
,
T
,
_
=
x
.
shape
# [b x T x xDim]
x
=
paddle
.
reshape
(
x
,
[
-
1
,
self
.
xDim
])
# [(b x T) x xDim]
xProj
=
self
.
xEmbed
(
x
)
# [(b x T) x attDim]
xProj
=
paddle
.
reshape
(
xProj
,
[
batch_size
,
T
,
-
1
])
# [b x T x attDim]
sPrev
=
sPrev
.
squeeze
(
0
)
sProj
=
self
.
sEmbed
(
sPrev
)
# [b x attDim]
sProj
=
paddle
.
unsqueeze
(
sProj
,
1
)
# [b x 1 x attDim]
sProj
=
paddle
.
expand
(
sProj
,
[
batch_size
,
T
,
self
.
attDim
])
# [b x T x attDim]
sumTanh
=
paddle
.
tanh
(
sProj
+
xProj
)
sumTanh
=
paddle
.
reshape
(
sumTanh
,
[
-
1
,
self
.
attDim
])
vProj
=
self
.
wEmbed
(
sumTanh
)
# [(b x T) x 1]
vProj
=
paddle
.
reshape
(
vProj
,
[
batch_size
,
T
])
alpha
=
F
.
softmax
(
vProj
,
axis
=
1
)
# attention weights for each sample in the minibatch
return
alpha
class
DecoderUnit
(
nn
.
Layer
):
def
__init__
(
self
,
sDim
,
xDim
,
yDim
,
attDim
):
super
(
DecoderUnit
,
self
).
__init__
()
self
.
sDim
=
sDim
self
.
xDim
=
xDim
self
.
yDim
=
yDim
self
.
attDim
=
attDim
self
.
emdDim
=
attDim
self
.
attention_unit
=
AttentionUnit
(
sDim
,
xDim
,
attDim
)
self
.
tgt_embedding
=
nn
.
Embedding
(
yDim
+
1
,
self
.
emdDim
,
weight_attr
=
nn
.
initializer
.
Normal
(
std
=
0.01
))
# the last is used for <BOS>
self
.
gru
=
nn
.
GRUCell
(
input_size
=
xDim
+
self
.
emdDim
,
hidden_size
=
sDim
)
self
.
fc
=
nn
.
Linear
(
sDim
,
yDim
,
weight_attr
=
nn
.
initializer
.
Normal
(
std
=
0.01
),
bias_attr
=
nn
.
initializer
.
Constant
(
value
=
0
))
self
.
embed_fc
=
nn
.
Linear
(
300
,
self
.
sDim
)
def
get_initial_state
(
self
,
embed
,
tile_times
=
1
):
assert
embed
.
shape
[
1
]
==
300
state
=
self
.
embed_fc
(
embed
)
# N * sDim
if
tile_times
!=
1
:
state
=
state
.
unsqueeze
(
1
)
trans_state
=
paddle
.
transpose
(
state
,
perm
=
[
1
,
0
,
2
])
state
=
paddle
.
tile
(
trans_state
,
repeat_times
=
[
tile_times
,
1
,
1
])
trans_state
=
paddle
.
transpose
(
state
,
perm
=
[
1
,
0
,
2
])
state
=
paddle
.
reshape
(
trans_state
,
shape
=
[
-
1
,
self
.
sDim
])
state
=
state
.
unsqueeze
(
0
)
# 1 * N * sDim
return
state
def
forward
(
self
,
x
,
sPrev
,
yPrev
):
# x: feature sequence from the image decoder.
batch_size
,
T
,
_
=
x
.
shape
alpha
=
self
.
attention_unit
(
x
,
sPrev
)
context
=
paddle
.
squeeze
(
paddle
.
matmul
(
alpha
.
unsqueeze
(
1
),
x
),
axis
=
1
)
yPrev
=
paddle
.
cast
(
yPrev
,
dtype
=
"int64"
)
yProj
=
self
.
tgt_embedding
(
yPrev
)
concat_context
=
paddle
.
concat
([
yProj
,
context
],
1
)
concat_context
=
paddle
.
squeeze
(
concat_context
,
1
)
sPrev
=
paddle
.
squeeze
(
sPrev
,
0
)
output
,
state
=
self
.
gru
(
concat_context
,
sPrev
)
output
=
paddle
.
squeeze
(
output
,
axis
=
1
)
output
=
self
.
fc
(
output
)
return
output
,
state
\ No newline at end of file
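Within this file, AttentionUnit computes the additive attention weights that DecoderUnit consumes at each decoding step. A minimal sketch in isolation, with hypothetical dimensions:

import paddle

# Hypothetical smoke test: score T encoder positions against the
# previous decoder state; one softmax-normalized weight per position.
att = AttentionUnit(sDim=512, xDim=256, attDim=512)
att.eval()
x = paddle.rand([4, 25, 256])        # [b, T, xDim]
s_prev = paddle.rand([1, 4, 512])    # [1, b, sDim]
alpha = att(x, s_prev)
print(alpha.shape)                   # [4, 25]; each row sums to 1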