# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

from fairseq import utils
from fairseq.data import LanguagePairDataset
from fairseq.modules import BeamableMM, GradMultiply, LearnedPositionalEmbedding, LinearizedConvolution

from . import FairseqEncoder, FairseqIncrementalDecoder, FairseqModel, register_model, register_model_architecture


@register_model('fconv')
class FConvModel(FairseqModel):
    def __init__(self, encoder, decoder):
        super().__init__(encoder, decoder)
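        # the encoder scales its gradients down by 1 / (2 * num_attention_layers)
        # (see the GradMultiply call in FConvEncoder.forward), so it needs to
        # know how many decoder layers actually use attention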
        self.encoder.num_attention_layers = sum(layer is not None for layer in decoder.attention)

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
                            help='dropout probability')
        parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                            help='encoder embedding dimension')
        parser.add_argument('--encoder-embed-path', default=None, type=str, metavar='STR',
                            help='path to pre-trained encoder embedding')
        parser.add_argument('--encoder-layers', type=str, metavar='EXPR',
                            help='encoder layers [(dim, kernel_size), ...]')
        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                            help='decoder embedding dimension')
        parser.add_argument('--decoder-embed-path', default=None, type=str, metavar='STR',
                            help='path to pre-trained decoder embedding')
        parser.add_argument('--decoder-layers', type=str, metavar='EXPR',
                            help='decoder layers [(dim, kernel_size), ...]')
        parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
                            help='decoder output embedding dimension')
        parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
                            help='decoder attention [True, ...]')
        parser.add_argument('--share-input-output-embed', action='store_true',
                            help='share input and output embeddings (requires'
                                 ' --decoder-out-embed-dim and --decoder-embed-dim'
                                 ' to be equal)')

    @classmethod
    def build_model(cls, args, src_dict, dst_dict):
        """Build a new model instance."""
        # make sure that all args are properly defaulted (in case there are any new ones)
        base_architecture(args)

        if not hasattr(args, 'max_source_positions'):
            args.max_source_positions = args.max_positions
            args.max_target_positions = args.max_positions
        if not hasattr(args, 'share_input_output_embed'):
            args.share_input_output_embed = False

        encoder_embed_dict = None
        if args.encoder_embed_path:
            encoder_embed_dict = utils.parse_embedding(args.encoder_embed_path)
            utils.print_embed_overlap(encoder_embed_dict, src_dict)

        decoder_embed_dict = None
        if args.decoder_embed_path:
            decoder_embed_dict = utils.parse_embedding(args.decoder_embed_path)
            utils.print_embed_overlap(decoder_embed_dict, dst_dict)

        encoder = FConvEncoder(
            src_dict,
            embed_dim=args.encoder_embed_dim,
            embed_dict=encoder_embed_dict,
            convolutions=eval(args.encoder_layers),
            dropout=args.dropout,
            max_positions=args.max_source_positions,
        )
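        # note: the layer specs are Python expression strings that eval() to a
        # list of (out_channels, kernel_size) tuples, e.g. '[(512, 3)] * 20'
        # (see base_architecture below)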
        decoder = FConvDecoder(
            dst_dict,
            embed_dim=args.decoder_embed_dim,
            embed_dict=decoder_embed_dict,
            convolutions=eval(args.decoder_layers),
            out_embed_dim=args.decoder_out_embed_dim,
            attention=eval(args.decoder_attention),
            dropout=args.dropout,
            max_positions=args.max_target_positions,
            share_embed=args.share_input_output_embed
        )
        return FConvModel(encoder, decoder)


class FConvEncoder(FairseqEncoder):
    """Convolutional encoder"""
    def __init__(self, dictionary, embed_dim=512, embed_dict=None,
                 max_positions=1024, convolutions=((512, 3),) * 20, dropout=0.1):
        super().__init__(dictionary)
        self.dropout = dropout
        self.num_attention_layers = None

        num_embeddings = len(dictionary)
        self.padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx)
        if embed_dict:
            self.embed_tokens = utils.load_embedding(embed_dict, self.dictionary, self.embed_tokens)

        self.embed_positions = PositionalEmbedding(
            max_positions,
            embed_dim,
            self.padding_idx,
            left_pad=LanguagePairDataset.LEFT_PAD_SOURCE,
        )

        in_channels = convolutions[0][0]
        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        for (out_channels, kernel_size) in convolutions:
            self.projections.append(Linear(in_channels, out_channels)
                                    if in_channels != out_channels else None)
            if kernel_size % 2 == 1:
                padding = kernel_size // 2
            else:
                padding = 0
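            # odd kernels are padded symmetrically inside ConvTBC; even kernels
            # get explicit asymmetric padding in forward() instead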
            self.convolutions.append(
                ConvTBC(in_channels, out_channels * 2, kernel_size,
                        dropout=dropout, padding=padding)
            )
            in_channels = out_channels
        self.fc2 = Linear(in_channels, embed_dim)

    def forward(self, src_tokens, src_lengths):
        # embed tokens and positions
        x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)
        x = F.dropout(x, p=self.dropout, training=self.training)
        input_embedding = x

        # project to size of convolution
        x = self.fc1(x)

        # used to mask padding in input
        encoder_padding_mask = src_tokens.eq(self.padding_idx).t()  # -> T x B
        if not encoder_padding_mask.any():
            encoder_padding_mask = None

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # temporal convolutions
        for proj, conv in zip(self.projections, self.convolutions):
            residual = x if proj is None else proj(x)

            if encoder_padding_mask is not None:
                x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

            x = F.dropout(x, p=self.dropout, training=self.training)
            if conv.kernel_size[0] % 2 == 1:
                # padding is implicit in the conv
                x = conv(x)
            else:
                padding_l = (conv.kernel_size[0] - 1) // 2
                padding_r = conv.kernel_size[0] // 2
                x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
                x = conv(x)
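            # each conv produced 2 * out_channels; GLU gates one half with the
            # other, halving the channels back to out_channels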
            x = F.glu(x, dim=2)
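            # scale the residual sum by sqrt(0.5) to keep its variance roughly
            # constant, as in the ConvS2S paper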
            x = (x + residual) * math.sqrt(0.5)

        # T x B x C -> B x T x C
        x = x.transpose(1, 0)

        # project back to size of embedding
        x = self.fc2(x)

        if encoder_padding_mask is not None:
            encoder_padding_mask = encoder_padding_mask.t()  # -> B x T
            x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

        # scale gradients (this only affects backward, not forward)
        x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

        # add output to input embedding for attention
        y = (x + input_embedding) * math.sqrt(0.5)

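        # downstream, encoder_out[0] (x) is used to compute attention scores
        # and encoder_out[1] (y) is what gets attended over, cf.
        # AttentionLayer.forward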
        return {
            'encoder_out': (x, y),
            'encoder_padding_mask': encoder_padding_mask,  # B x T
        }

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        return self.embed_positions.max_positions()


class AttentionLayer(nn.Module):
    def __init__(self, conv_channels, embed_dim, bmm=None):
        super().__init__()
        # projects from output of convolution to embedding dimension
        self.in_projection = Linear(conv_channels, embed_dim)
        # projects from embedding dimension to convolution size
        self.out_projection = Linear(embed_dim, conv_channels)

        self.bmm = bmm if bmm is not None else torch.bmm
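        # make_generation_fast_ below may replace torch.bmm with a BeamableMM,
        # which exploits the repeated layout of beam-search decoding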

    def forward(self, x, target_embedding, encoder_out, encoder_padding_mask):
        residual = x

        # attention
        x = (self.in_projection(x) + target_embedding) * math.sqrt(0.5)
        x = self.bmm(x, encoder_out[0])

        # don't attend over padding
        if encoder_padding_mask is not None:
            x = x.float().masked_fill(
                encoder_padding_mask.unsqueeze(1),
                float('-inf')
            ).type_as(x)  # FP16 support: cast to float and back

        # softmax over last dim
        sz = x.size()
        x = F.softmax(x.view(sz[0] * sz[1], sz[2]), dim=1)
        x = x.view(sz)
        attn_scores = x

        x = self.bmm(x, encoder_out[1])

        # scale attention output (respecting potentially different lengths)
        s = encoder_out[1].size(1)
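        # s * sqrt(1/s) == sqrt(s): attention averages over s source positions,
        # so scaling by sqrt(s) counteracts the variance reduction (padding
        # positions are excluded from s when a mask is present)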
        if encoder_padding_mask is None:
            x = x * (s * math.sqrt(1.0 / s))
        else:
            s = s - encoder_padding_mask.type_as(x).sum(dim=1, keepdim=True)  # exclude padding
            s = s.unsqueeze(-1)
            x = x * (s * s.rsqrt())

        # project back
        x = (self.out_projection(x) + residual) * math.sqrt(0.5)
        return x, attn_scores

    def make_generation_fast_(self, beamable_mm_beam_size=None, **kwargs):
        """Replace torch.bmm with BeamableMM."""
        if beamable_mm_beam_size is not None:
            del self.bmm
            self.add_module('bmm', BeamableMM(beamable_mm_beam_size))


class FConvDecoder(FairseqIncrementalDecoder):
    """Convolutional decoder"""
    def __init__(self, dictionary, embed_dim=512,
                 embed_dict=None, out_embed_dim=256,
                 max_positions=1024, convolutions=((512, 3),) * 20,
                 attention=True, dropout=0.1, share_embed=False):
        super().__init__(dictionary)
        self.register_buffer('version', torch.Tensor([2]))
        self.dropout = dropout

        in_channels = convolutions[0][0]
        if isinstance(attention, bool):
            # expand True into [True, True, ...] and do the same with False
            attention = [attention] * len(convolutions)
        if not isinstance(attention, list) or len(attention) != len(convolutions):
            raise ValueError('Attention is expected to be a list of booleans of '
                             'length equal to the number of layers.')

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if embed_dict:
            self.embed_tokens = utils.load_embedding(embed_dict, self.dictionary, self.embed_tokens)

        self.embed_positions = PositionalEmbedding(
            max_positions,
            embed_dim,
            padding_idx,
            left_pad=LanguagePairDataset.LEFT_PAD_TARGET,
        )

        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        self.attention = nn.ModuleList()
        for i, (out_channels, kernel_size) in enumerate(convolutions):
            self.projections.append(Linear(in_channels, out_channels)
                                    if in_channels != out_channels else None)
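            # padding=(kernel_size - 1), with the extra future timesteps
            # stripped inside LinearizedConvolution, makes the convolution
            # causal: the decoder never sees tokens right of the current position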
            self.convolutions.append(
                LinearizedConv1d(in_channels, out_channels * 2, kernel_size,
                                 padding=(kernel_size - 1), dropout=dropout)
            )
            self.attention.append(AttentionLayer(out_channels, embed_dim)
                                  if attention[i] else None)
            in_channels = out_channels
        self.fc2 = Linear(in_channels, out_embed_dim)
        if share_embed:
            assert out_embed_dim == embed_dim, \
                "Shared embed weights imply the same dimensions: " \
                "out_embed_dim={} vs embed_dim={}".format(out_embed_dim, embed_dim)
            self.fc3 = nn.Linear(out_embed_dim, num_embeddings)
            self.fc3.weight = self.embed_tokens.weight
        else:
            self.fc3 = Linear(out_embed_dim, num_embeddings, dropout=dropout)

    def forward(self, prev_output_tokens, encoder_out_dict, incremental_state=None):
        encoder_out = encoder_out_dict['encoder_out']
        encoder_padding_mask = encoder_out_dict['encoder_padding_mask']

        # split and transpose encoder outputs
        encoder_a, encoder_b = self._split_encoder_out(encoder_out, incremental_state)

        # embed tokens and combine with positional embeddings
        pos_embed = self.embed_positions(prev_output_tokens, incremental_state)
        if incremental_state is not None:
            prev_output_tokens = prev_output_tokens[:, -1:]
        x = self._embed_tokens(prev_output_tokens, incremental_state)
        x += pos_embed
        x = F.dropout(x, p=self.dropout, training=self.training)
        target_embedding = x

        # project to size of convolution
        x = self.fc1(x)

        # B x T x C -> T x B x C
        x = self._transpose_if_training(x, incremental_state)

        # temporal convolutions
        avg_attn_scores = None
        num_attn_layers = len(self.attention)
        for proj, conv, attention in zip(self.projections, self.convolutions, self.attention):
            residual = x if proj is None else proj(x)

            x = F.dropout(x, p=self.dropout, training=self.training)
            x = conv(x, incremental_state)
            x = F.glu(x, dim=2)

            # attention
            if attention is not None:
                x = self._transpose_if_training(x, incremental_state)

                x, attn_scores = attention(x, target_embedding, (encoder_a, encoder_b), encoder_padding_mask)
                attn_scores = attn_scores / num_attn_layers
                if avg_attn_scores is None:
                    avg_attn_scores = attn_scores
                else:
                    avg_attn_scores.add_(attn_scores)

                x = self._transpose_if_training(x, incremental_state)

            # residual
            x = (x + residual) * math.sqrt(0.5)

        # T x B x C -> B x T x C
        x = self._transpose_if_training(x, incremental_state)

        # project back to size of vocabulary
        x = self.fc2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.fc3(x)

        return x, avg_attn_scores

    def reorder_incremental_state(self, incremental_state, new_order):
        super().reorder_incremental_state(incremental_state, new_order)
        encoder_out = utils.get_incremental_state(self, incremental_state, 'encoder_out')
        if encoder_out is not None:
            encoder_out = tuple(eo.index_select(0, new_order) for eo in encoder_out)
            utils.set_incremental_state(self, incremental_state, 'encoder_out', encoder_out)

    def reorder_encoder_out(self, encoder_out_dict, new_order):
        if encoder_out_dict['encoder_padding_mask'] is not None:
            encoder_out_dict['encoder_padding_mask'] = \
                encoder_out_dict['encoder_padding_mask'].index_select(0, new_order)
        return encoder_out_dict

    def max_positions(self):
        """Maximum output length supported by the decoder."""
        return self.embed_positions.max_positions()

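    # Old checkpoints (version < 2) applied weight norm over dim=0; instead of
    # migrating the stored weights, the freshly built convolutions are
    # reconfigured to the old layout so the checkpoint can be loaded as-is.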
    def upgrade_state_dict(self, state_dict):
        if state_dict.get('decoder.version', torch.Tensor([1]))[0] < 2:
            # old models use incorrect weight norm dimension
            for i, conv in enumerate(self.convolutions):
                # reconfigure weight norm
                nn.utils.remove_weight_norm(conv)
                self.convolutions[i] = nn.utils.weight_norm(conv, dim=0)
            state_dict['decoder.version'] = torch.Tensor([1])
        return state_dict

    def _embed_tokens(self, tokens, incremental_state):
        if incremental_state is not None:
            # keep only the last token for incremental forward pass
            tokens = tokens[:, -1:]
        return self.embed_tokens(tokens)

    def _split_encoder_out(self, encoder_out, incremental_state):
        """Split and transpose encoder outputs.

        This is cached when doing incremental inference.
        """
        cached_result = utils.get_incremental_state(self, incremental_state, 'encoder_out')
        if cached_result is not None:
            return cached_result

        # transpose only once to speed up attention layers
        encoder_a, encoder_b = encoder_out
        encoder_a = encoder_a.transpose(1, 2).contiguous()
        result = (encoder_a, encoder_b)

        if incremental_state is not None:
            utils.set_incremental_state(self, incremental_state, 'encoder_out', result)
        return result

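    # during incremental decoding the input is a single timestep and stays
    # B x T x C; only full-sequence (training) forwards use the T x B x C
    # layout expected by the convolutions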
    def _transpose_if_training(self, x, incremental_state):
        if incremental_state is None:
            x = x.transpose(0, 1)
        return x


def Embedding(num_embeddings, embedding_dim, padding_idx):
    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    nn.init.normal_(m.weight, 0, 0.1)
    nn.init.constant_(m.weight[padding_idx], 0)
    return m


def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx, left_pad):
    m = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx, left_pad)
    nn.init.normal_(m.weight, 0, 0.1)
    nn.init.constant_(m.weight[padding_idx], 0)
    return m


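# Initialization for the weight-normalized helpers below follows the ConvS2S
# paper: std is chosen to preserve activation variance, scaled by (1 - dropout)
# to account for dropped inputs, with an extra factor of 4 in the convolutions
# to compensate for the variance-halving effect of GLU.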
def Linear(in_features, out_features, dropout=0):
    """Weight-normalized Linear layer (input: N x T x C)"""
    m = nn.Linear(in_features, out_features)
    m.weight.data.normal_(mean=0, std=math.sqrt((1 - dropout) / in_features))
    m.bias.data.zero_()
    return nn.utils.weight_norm(m)


def LinearizedConv1d(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
    """Weight-normalized Conv1d layer optimized for decoding"""
    m = LinearizedConvolution(in_channels, out_channels, kernel_size, **kwargs)
    std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
    m.weight.data.normal_(mean=0, std=std)
    m.bias.data.zero_()
    return nn.utils.weight_norm(m, dim=2)


def ConvTBC(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
    """Weight-normalized Conv1d layer"""
    from fairseq.modules import ConvTBC
    m = ConvTBC(in_channels, out_channels, kernel_size, **kwargs)
    std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
    m.weight.data.normal_(mean=0, std=std)
    m.bias.data.zero_()
    return nn.utils.weight_norm(m, dim=2)


@register_model_architecture('fconv', 'fconv')
def base_architecture(args):
    args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512)
    args.encoder_layers = getattr(args, 'encoder_layers', '[(512, 3)] * 20')
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
    args.decoder_layers = getattr(args, 'decoder_layers', '[(512, 3)] * 20')
    args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 256)
    args.decoder_attention = getattr(args, 'decoder_attention', 'True')
    args.share_input_output_embed = getattr(args, 'share_input_output_embed', False)


@register_model_architecture('fconv', 'fconv_iwslt_de_en')
def fconv_iwslt_de_en(args):
    args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 256)
    args.encoder_layers = getattr(args, 'encoder_layers', '[(256, 3)] * 4')
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 256)
    args.decoder_layers = getattr(args, 'decoder_layers', '[(256, 3)] * 3')
    args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 256)
    base_architecture(args)


@register_model_architecture('fconv', 'fconv_wmt_en_ro')
def fconv_wmt_en_ro(args):
    args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 512)
    base_architecture(args)


@register_model_architecture('fconv', 'fconv_wmt_en_de')
def fconv_wmt_en_de(args):
    convs = '[(512, 3)] * 9'       # first 9 layers have 512 units
    convs += ' + [(1024, 3)] * 4'  # next 4 layers have 1024 units
    convs += ' + [(2048, 1)] * 2'  # final 2 layers use 1x1 convolutions

    args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 768)
    args.encoder_layers = getattr(args, 'encoder_layers', convs)
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 768)
    args.decoder_layers = getattr(args, 'decoder_layers', convs)
    args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 512)
    base_architecture(args)


@register_model_architecture('fconv', 'fconv_wmt_en_fr')
def fconv_wmt_en_fr(args):
    convs = '[(512, 3)] * 6'       # first 6 layers have 512 units
    convs += ' + [(768, 3)] * 4'   # next 4 layers have 768 units
    convs += ' + [(1024, 3)] * 3'  # next 3 layers have 1024 units
    convs += ' + [(2048, 1)] * 1'  # next 1 layer uses 1x1 convolutions
    convs += ' + [(4096, 1)] * 1'  # final 1 layer uses 1x1 convolutions

    args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 768)
    args.encoder_layers = getattr(args, 'encoder_layers', convs)
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 768)
    args.decoder_layers = getattr(args, 'decoder_layers', convs)
    args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 512)
    base_architecture(args)
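

# Usage sketch (illustrative, assuming the standard fairseq CLI): the presets
# registered above are selected with the --arch flag when training, e.g.
#
#   python train.py data-bin/wmt14_en_de --arch fconv_wmt_en_de
#
# register_model('fconv') exposes FConvModel to the CLI, and each
# register_model_architecture(...) entry pins that preset's hyperparameter
# defaults via base_architecture.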