# rec_srn_all_head.py -- committed by tink2123
#Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math

import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
import numpy as np
from .self_attention.model import wrap_encoder
from .self_attention.model import wrap_encoder_forFeature
# Bound handed to fluid.clip.GradientClipByValue when SRNPredict.pvam
# registers gradient clipping.
gradient_clip = 10


class SRNPredict(object):
    """Recognition head that chains three stages: PVAM, GSRM and VSFD.

    * ``pvam`` -- Parallel Visual Attention Module: encodes the flattened
      feature map with a transformer encoder and attends over it once per
      output position.
    * ``gsrm`` -- Global Semantic Reasoning Module: turns the visual
      features into character ids and runs two transformer encoders over
      them (one per direction).
    * ``vsfd`` -- Visual-Semantic Fusion Decoder: gates the visual and
      semantic features together and classifies each position.

    The head is built from ``paddle.fluid`` static-graph layers. The
    layer-creating calls are kept in a fixed order on purpose.
    NOTE(review): auto-generated parameter names presumably depend on that
    order -- confirm before reordering any layer call.
    """

    def __init__(self, params):
        """Read the head configuration.

        Args:
            params: mapping providing ``char_num``, ``max_text_length``,
                ``num_heads``, ``num_encoder_TUs``, ``num_decoder_TUs`` and
                ``hidden_dims``. A missing key raises ``KeyError``.
        """
        super(SRNPredict, self).__init__()
        self.char_num = params['char_num']
        self.max_length = params['max_text_length']

        self.num_heads = params['num_heads']
        self.num_encoder_TUs = params['num_encoder_TUs']
        self.num_decoder_TUs = params['num_decoder_TUs']
        self.hidden_dims = params['hidden_dims']

    def _encoder_kwargs(self, n_layer, max_length, src_vocab_size):
        """Return the keyword arguments shared by every transformer call.

        Only ``n_layer``, ``max_length`` and ``src_vocab_size`` differ
        between the visual encoder and the two GSRM encoders; everything
        else is identical, so it is defined once here. ``enc_inputs`` is
        deliberately left out and supplied by the caller.
        """
        head_dim = int(self.hidden_dims / self.num_heads)
        return dict(
            src_vocab_size=src_vocab_size,
            max_length=max_length,
            n_layer=n_layer,
            n_head=self.num_heads,
            d_key=head_dim,
            d_value=head_dim,
            d_model=self.hidden_dims,
            d_inner_hid=self.hidden_dims,
            prepostprocess_dropout=0.1,
            attention_dropout=0.1,
            relu_dropout=0.1,
            preprocess_cmd="n",
            postprocess_cmd="da",
            weight_sharing=True)

    def pvam(self, inputs, others):
        """Build the Parallel Visual Attention Module.

        Args:
            inputs: 4-D feature map whose shape unpacks as ``(b, c, h, w)``.
            others: mapping providing ``encoder_word_pos`` and
                ``gsrm_word_pos``.

        Returns:
            The attended visual features, ``[b, max_length, c]``.
        """
        # Flatten the spatial grid into a sequence: [b, c, h, w] -> [b, h*w, c].
        _, c, h, w = inputs.shape
        conv_features = fluid.layers.reshape(x=inputs, shape=[-1, c, h * w])
        conv_features = fluid.layers.transpose(x=conv_features, perm=[0, 2, 1])

        #===== Transformer encoder =====
        _, t, c = conv_features.shape
        encoder_word_pos = others["encoder_word_pos"]
        gsrm_word_pos = others["gsrm_word_pos"]

        # The third slot (self-attention bias) is None for the visual encoder.
        enc_inputs = [conv_features, encoder_word_pos, None]
        word_features = wrap_encoder_forFeature(
            enc_inputs=enc_inputs,
            **self._encoder_kwargs(
                n_layer=self.num_encoder_TUs, max_length=t, src_vocab_size=-1))
        # Side effect: registers value clipping through fluid's global
        # gradient-clip setting.
        fluid.clip.set_gradient_clip(
            fluid.clip.GradientClipByValue(gradient_clip))

        #===== Parallel Visual Attention Module =====
        _, t, c = word_features.shape

        word_features = fluid.layers.fc(word_features, c, num_flatten_dims=2)
        # Tile the visual sequence across the max_length output positions.
        tiled_features = fluid.layers.reshape(word_features, [-1, 1, t, c])
        tiled_features = fluid.layers.expand(tiled_features,
                                             [1, self.max_length, 1, 1])
        # Tile the position embeddings across the t visual steps.
        word_pos_feature = fluid.layers.embedding(gsrm_word_pos,
                                                  [self.max_length, c])
        tiled_pos = fluid.layers.reshape(word_pos_feature,
                                         [-1, self.max_length, 1, c])
        tiled_pos = fluid.layers.expand(tiled_pos, [1, 1, t, 1])
        scores_in = fluid.layers.elementwise_add(
            tiled_features, tiled_pos, act='tanh')

        # One scalar score per (output position, visual step), normalised
        # over the visual steps.
        attention_weight = fluid.layers.fc(input=scores_in,
                                           size=1,
                                           num_flatten_dims=3,
                                           bias_attr=False)
        attention_weight = fluid.layers.reshape(
            x=attention_weight, shape=[-1, self.max_length, t])
        attention_weight = fluid.layers.softmax(input=attention_weight, axis=-1)

        pvam_features = fluid.layers.matmul(attention_weight,
                                            word_features)  #[b, max_length, c]

        return pvam_features

    def gsrm(self, pvam_features, others):
        """Build the Global Semantic Reasoning Module.

        Args:
            pvam_features: 3-D output of ``pvam``; its shape unpacks as
                ``(b, t, c)``.
            others: mapping providing ``gsrm_word_pos``,
                ``gsrm_slf_attn_bias1`` and ``gsrm_slf_attn_bias2``.

        Returns:
            A ``(gsrm_features, word_out, gsrm_out)`` tuple: the summed
            features of the two encoders, the softmax of the
            visual-to-semantic classifier (flattened to 2-D), and the
            softmax of ``gsrm_features`` projected onto the variable named
            ``src_word_emb_table`` (flattened to 2-D).
        """
        #===== GSRM Visual-to-semantic embedding block =====
        _, t, c = pvam_features.shape
        word_out = fluid.layers.fc(
            input=fluid.layers.reshape(pvam_features, [-1, c]),
            size=self.char_num,
            act="softmax")
        # The ids are taken by argmax and kept out of back-propagation.
        word_ids = fluid.layers.argmax(word_out, axis=1)
        word_ids.stop_gradient = True
        word_ids = fluid.layers.reshape(x=word_ids, shape=[-1, t, 1])

        #===== GSRM Semantic reasoning block =====
        # This module is implemented with two transformer encoders:
        # gsrm_feature1 is the forward one, gsrm_feature2 the backward one.
        pad_idx = self.char_num
        gsrm_word_pos = others["gsrm_word_pos"]
        gsrm_slf_attn_bias1 = others["gsrm_slf_attn_bias1"]
        gsrm_slf_attn_bias2 = others["gsrm_slf_attn_bias2"]

        # Forward input: prepend pad_idx along axis 1 and drop the last
        # step, i.e. shift the ids right by one. The ids are cast to
        # float32 for the pad op and back to int64 afterwards.
        forward_ids = fluid.layers.cast(word_ids, "float32")
        forward_ids = fluid.layers.pad(forward_ids, [0, 0, 1, 0, 0, 0],
                                       pad_value=1.0 * pad_idx)
        forward_ids = fluid.layers.cast(forward_ids, "int64")
        forward_ids = forward_ids[:, :-1, :]
        # Backward input: the ids as they are.
        backward_ids = word_ids

        forward_ids.stop_gradient = True
        backward_ids.stop_gradient = True
        enc_inputs_1 = [forward_ids, gsrm_word_pos, gsrm_slf_attn_bias1]
        enc_inputs_2 = [backward_ids, gsrm_word_pos, gsrm_slf_attn_bias2]

        # The vocabulary has one extra slot because pad_idx == char_num.
        gsrm_feature1 = wrap_encoder(
            enc_inputs=enc_inputs_1,
            **self._encoder_kwargs(
                n_layer=self.num_decoder_TUs,
                max_length=self.max_length,
                src_vocab_size=self.char_num + 1))
        gsrm_feature2 = wrap_encoder(
            enc_inputs=enc_inputs_2,
            **self._encoder_kwargs(
                n_layer=self.num_decoder_TUs,
                max_length=self.max_length,
                src_vocab_size=self.char_num + 1))
        # Append a zero step along axis 1 and drop the first one, i.e.
        # shift the backward features left by one.
        gsrm_feature2 = fluid.layers.pad(gsrm_feature2, [0, 0, 0, 1, 0, 0],
                                         pad_value=0.)
        gsrm_feature2 = gsrm_feature2[:, 1:, ]
        gsrm_features = gsrm_feature1 + gsrm_feature2

        # Project onto the transpose of the variable registered under the
        # name "src_word_emb_table" in the default main program.
        gsrm_out = fluid.layers.matmul(
            x=gsrm_features,
            y=fluid.default_main_program().global_block().var(
                "src_word_emb_table"),
            transpose_y=True)
        _, _, c = gsrm_out.shape
        gsrm_out = fluid.layers.softmax(input=fluid.layers.reshape(gsrm_out,
                                                                   [-1, c]))

        return gsrm_features, word_out, gsrm_out

    def vsfd(self, pvam_features, gsrm_features):
        """Build the Visual-Semantic Fusion Decoder.

        A sigmoid gate computed from the concatenated features blends the
        two inputs, and the blend is classified over ``char_num`` classes.

        Args:
            pvam_features: 3-D visual features; last dimension ``c1``.
            gsrm_features: 3-D semantic features; last dimension ``c2``.
                NOTE(review): the blend is reshaped with ``c1`` only, so
                ``c1 == c2`` appears to be assumed -- confirm.

        Returns:
            The softmax output of the final classifier (2-D).
        """
        #===== Visual-Semantic Fusion Decoder Module =====
        _, _, c1 = pvam_features.shape
        _, t, c2 = gsrm_features.shape
        fused = fluid.layers.concat([pvam_features, gsrm_features], axis=2)
        fused_flat = fluid.layers.reshape(x=fused, shape=[-1, c1 + c2])
        gate = fluid.layers.fc(input=fused_flat, size=c1, act="sigmoid")
        gate = fluid.layers.reshape(x=gate, shape=[-1, t, c1])
        # gate weights the visual branch, (1 - gate) the semantic branch.
        blended = gate * pvam_features + (1.0 - gate) * gsrm_features
        blended_flat = fluid.layers.reshape(x=blended, shape=[-1, c1])

        fc_out = fluid.layers.fc(input=blended_flat,
                                 size=self.char_num,
                                 act="softmax")
        return fc_out

    def __call__(self, inputs, others, mode=None):
        """Assemble the full head and collect its outputs.

        Args:
            inputs: feature map forwarded to ``pvam``.
            others: mapping of auxiliary inputs forwarded to ``pvam`` and
                ``gsrm``.
            mode: accepted for interface compatibility; not used here.

        Returns:
            dict with ``predict`` (fused softmax output), ``decoded_out``
            (top-1 indices of ``predict``), ``word_out`` and ``gsrm_out``.
        """
        pvam_features = self.pvam(inputs, others)
        gsrm_features, word_out, gsrm_out = self.gsrm(pvam_features, others)
        final_out = self.vsfd(pvam_features, gsrm_features)

        _, decoded_out = fluid.layers.topk(input=final_out, k=1)
        predicts = {
            'predict': final_out,
            'decoded_out': decoded_out,
            'word_out': word_out,
            'gsrm_out': gsrm_out
        }

        return predicts