"examples/translation/t5/README.md" did not exist on "b4fb94fe6d831b17c0df364b2848c80ef3add154"
Commit 9e8a8c05 authored by jerrrrry's avatar jerrrrry
Browse files

Initial commit

parents
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
from . import FairseqEncoder
class CompositeEncoder(FairseqEncoder):
"""
Encoder class that forwards on multiple encoders, for example for a fusion model or question-answering
Accepts a dictionary of encoder, the first encoder's dictionary is used for initialization
"""
def __init__(self, encoders):
super().__init__(next(iter(encoders.values())).dictionary)
self.encoders = encoders
for key in self.encoders:
self.add_module(key, self.encoders[key])
def forward(self, src_tokens, src_lengths):
encoder_out = {}
for key in self.encoders:
encoder_out[key] = self.encoders[key](src_tokens, src_lengths)
return encoder_out
def reorder_encoder_out(self, encoder_out, new_order):
"""Reorder encoder output according to new_order."""
for key in self.encoders:
encoder_out[key] = self.encoders[key].reorder_encoder_out(encoder_out[key], new_order)
return encoder_out
def max_positions(self):
return min([self.encoders[key].max_positions() for key in self.encoders])
def upgrade_state_dict(self, state_dict):
for key in self.encoders:
self.encoders[key].upgrade_state_dict(state_dict)
return state_dict
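# Illustrative usage sketch (editor addition, not part of the original file):
# assembling a CompositeEncoder from two encoders that share a dictionary.
# `encoder_a`, `encoder_b`, `src_tokens` and `src_lengths` are assumed inputs.
def _composite_encoder_example(encoder_a, encoder_b, src_tokens, src_lengths):
    composite = CompositeEncoder({'a': encoder_a, 'b': encoder_b})
    out = composite(src_tokens, src_lengths)  # one output per key: {'a': ..., 'b': ...}
    # max_positions() is the most restrictive of the child encoders
    assert composite.max_positions() == min(encoder_a.max_positions(),
                                            encoder_b.max_positions())
    return out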
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import torch
import torch.nn as nn
import torch.nn.functional as F
class FairseqDecoder(nn.Module):
"""Base class for decoders."""
def __init__(self, dictionary):
super().__init__()
self.dictionary = dictionary
def forward(self, prev_output_tokens, encoder_out):
raise NotImplementedError
def get_normalized_probs(self, net_output, log_probs, sample):
"""Get normalized probabilities (or log probs) from a net's output."""
if hasattr(self, 'adaptive_softmax') and self.adaptive_softmax is not None:
assert sample is not None and 'target' in sample
out = self.adaptive_softmax.get_log_prob(net_output[0], sample['target'])
return out.exp_() if not log_probs else out
        logits = net_output[0]
if log_probs:
return F.log_softmax(logits, dim=-1, dtype=torch.float32)
else:
return F.softmax(logits, dim=-1, dtype=torch.float32)
def max_positions(self):
"""Maximum input length supported by the decoder."""
raise NotImplementedError
def upgrade_state_dict(self, state_dict):
return state_dict
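# Numeric sketch (editor addition) of the default path above: without an
# adaptive softmax, get_normalized_probs reduces to a (log-)softmax over the
# vocabulary dimension, computed in float32 for fp16 stability.
def _normalized_probs_example():
    logits = torch.randn(2, 5, 7)  # B x T x V dummy decoder output
    probs = F.softmax(logits, dim=-1, dtype=torch.float32)
    log_probs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
    # probabilities sum to one over the vocabulary dimension
    assert torch.allclose(probs.sum(dim=-1), torch.ones(2, 5), atol=1e-5)
    # log_probs matches probs.log() up to numerical error
    assert torch.allclose(log_probs, probs.log(), atol=1e-4)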
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import torch.nn as nn
class FairseqEncoder(nn.Module):
"""Base class for encoders."""
def __init__(self, dictionary):
super().__init__()
self.dictionary = dictionary
def forward(self, src_tokens, src_lengths):
raise NotImplementedError
def reorder_encoder_out(self, encoder_out, new_order):
"""Reorder encoder output according to new_order."""
raise NotImplementedError
def max_positions(self):
"""Maximum input length supported by the encoder."""
raise NotImplementedError
def upgrade_state_dict(self, state_dict):
return state_dict
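# Minimal subclass sketch (editor addition, hypothetical): the three methods a
# concrete encoder must provide. `dictionary` is assumed to be a fairseq
# Dictionary exposing __len__() and pad().
class _ToyEncoder(FairseqEncoder):
    def __init__(self, dictionary, embed_dim=8):
        super().__init__(dictionary)
        self.embed = nn.Embedding(len(dictionary), embed_dim, padding_idx=dictionary.pad())
    def forward(self, src_tokens, src_lengths):
        # B x T -> B x T x C; real encoders return richer structures
        return {'encoder_out': self.embed(src_tokens)}
    def reorder_encoder_out(self, encoder_out, new_order):
        encoder_out['encoder_out'] = encoder_out['encoder_out'].index_select(0, new_order)
        return encoder_out
    def max_positions(self):
        return 1024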
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
from . import FairseqDecoder
class FairseqIncrementalDecoder(FairseqDecoder):
"""Base class for incremental decoders."""
def __init__(self, dictionary):
super().__init__(dictionary)
def forward(self, prev_output_tokens, encoder_out, incremental_state=None):
raise NotImplementedError
def reorder_incremental_state(self, incremental_state, new_order):
"""Reorder incremental state.
This should be called when the order of the input has changed from the
previous time step. A typical use case is beam search, where the input
order changes between time steps based on the selection of beams.
"""
def apply_reorder_incremental_state(module):
if module != self and hasattr(module, 'reorder_incremental_state'):
module.reorder_incremental_state(
incremental_state,
new_order,
)
self.apply(apply_reorder_incremental_state)
def set_beam_size(self, beam_size):
"""Sets the beam size in the decoder and all children."""
if getattr(self, '_beam_size', -1) != beam_size:
def apply_set_beam_size(module):
if module != self and hasattr(module, 'set_beam_size'):
module.set_beam_size(beam_size)
self.apply(apply_set_beam_size)
self._beam_size = beam_size
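# Sketch (editor addition) of the beam-search contract behind
# reorder_incremental_state: when beam k at step t extends the hypothesis that
# lived at beam j at step t-1, new_order maps position k -> j so that cached
# states follow their hypotheses. `decoder` and `incremental_state` are
# assumed to come from an in-progress incremental generation.
def _reorder_example(decoder, incremental_state):
    import torch
    # one sentence, beam size 3: beams 0 and 2 both extend what was beam 1 at
    # the previous step, while beam 1 extends old beam 0
    new_order = torch.LongTensor([1, 0, 1])
    decoder.reorder_incremental_state(incremental_state, new_order)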
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import torch.nn as nn
from . import FairseqDecoder, FairseqEncoder
class BaseFairseqModel(nn.Module):
"""Base class for fairseq models."""
def __init__(self):
super().__init__()
self._is_generation_fast = False
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
pass
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
raise NotImplementedError
def get_targets(self, sample, net_output):
"""Get targets from either the sample or the net's output."""
return sample['target']
def get_normalized_probs(self, net_output, log_probs, sample=None):
"""Get normalized probabilities (or log probs) from a net's output."""
return self.decoder.get_normalized_probs(net_output, log_probs, sample)
def max_positions(self):
"""Maximum length supported by the model."""
raise NotImplementedError
def max_decoder_positions(self):
"""Maximum length supported by the decoder."""
return self.decoder.max_positions()
def load_state_dict(self, state_dict, strict=True):
"""Copies parameters and buffers from state_dict into this module and
its descendants.
Overrides the method in nn.Module; compared with that method this
additionally "upgrades" state_dicts from old checkpoints.
"""
self.upgrade_state_dict(state_dict)
super().load_state_dict(state_dict, strict)
def upgrade_state_dict(self, state_dict):
assert state_dict is not None
def do_upgrade(m):
if m != self and hasattr(m, 'upgrade_state_dict'):
m.upgrade_state_dict(state_dict)
self.apply(do_upgrade)
def make_generation_fast_(self, **kwargs):
"""Optimize model for faster generation."""
if self._is_generation_fast:
return # only apply once
self._is_generation_fast = True
# remove weight norm from all modules in the network
def apply_remove_weight_norm(module):
try:
nn.utils.remove_weight_norm(module)
except ValueError: # this module didn't have weight norm
return
self.apply(apply_remove_weight_norm)
def apply_make_generation_fast_(module):
if module != self and hasattr(module, 'make_generation_fast_'):
module.make_generation_fast_(**kwargs)
self.apply(apply_make_generation_fast_)
        def train(mode=True):
if mode:
raise RuntimeError('cannot train after make_generation_fast')
# this model should no longer be used for training
self.eval()
self.train = train
class FairseqModel(BaseFairseqModel):
"""Base class for encoder-decoder models."""
def __init__(self, encoder, decoder):
super().__init__()
self.encoder = encoder
self.decoder = decoder
assert isinstance(self.encoder, FairseqEncoder)
assert isinstance(self.decoder, FairseqDecoder)
def forward(self, src_tokens, src_lengths, prev_output_tokens):
encoder_out = self.encoder(src_tokens, src_lengths)
decoder_out = self.decoder(prev_output_tokens, encoder_out)
return decoder_out
def max_positions(self):
"""Maximum length supported by the model."""
return (self.encoder.max_positions(), self.decoder.max_positions())
class FairseqLanguageModel(BaseFairseqModel):
"""Base class for decoder-only models."""
def __init__(self, decoder):
super().__init__()
self.decoder = decoder
assert isinstance(self.decoder, FairseqDecoder)
def forward(self, src_tokens):
return self.decoder(src_tokens)
def max_positions(self):
"""Maximum length supported by the model."""
return self.decoder.max_positions()
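# End-to-end wiring sketch (editor addition): how FairseqModel composes the
# pieces above in a training step. `encoder`/`decoder` are assumed concrete
# FairseqEncoder/FairseqDecoder instances and `sample` a fairseq batch dict.
def _seq2seq_step_example(encoder, decoder, sample):
    model = FairseqModel(encoder, decoder)
    net_output = model(
        sample['net_input']['src_tokens'],
        sample['net_input']['src_lengths'],
        sample['net_input']['prev_output_tokens'],
    )
    # criterion-side usage: normalized log-probs plus the gold targets
    lprobs = model.get_normalized_probs(net_output, log_probs=True)
    targets = model.get_targets(sample, net_output)
    return lprobs, targets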
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from fairseq import options, utils
from fairseq.modules import (
AdaptiveSoftmax, BeamableMM, GradMultiply, LearnedPositionalEmbedding,
LinearizedConvolution,
)
from . import (
FairseqEncoder, FairseqIncrementalDecoder, FairseqModel,
FairseqLanguageModel, register_model, register_model_architecture,
)
@register_model('fconv')
class FConvModel(FairseqModel):
def __init__(self, encoder, decoder):
super().__init__(encoder, decoder)
self.encoder.num_attention_layers = sum(layer is not None for layer in decoder.attention)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
parser.add_argument('--dropout', type=float, metavar='D',
help='dropout probability')
parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
help='encoder embedding dimension')
parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
help='path to pre-trained encoder embedding')
parser.add_argument('--encoder-layers', type=str, metavar='EXPR',
help='encoder layers [(dim, kernel_size), ...]')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
help='path to pre-trained decoder embedding')
parser.add_argument('--decoder-layers', type=str, metavar='EXPR',
help='decoder layers [(dim, kernel_size), ...]')
parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
help='decoder output embedding dimension')
parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
help='decoder attention [True, ...]')
parser.add_argument('--normalization-constant', type=float, metavar='D',
help='multiplies the result of the residual block by sqrt(value)')
parser.add_argument('--share-input-output-embed', action='store_true',
help='share input and output embeddings (requires'
' --decoder-out-embed-dim and --decoder-embed-dim'
' to be equal)')
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure that all args are properly defaulted (in case there are any new ones)
base_architecture(args)
encoder_embed_dict = None
if args.encoder_embed_path:
encoder_embed_dict = utils.parse_embedding(args.encoder_embed_path)
utils.print_embed_overlap(encoder_embed_dict, task.source_dictionary)
decoder_embed_dict = None
if args.decoder_embed_path:
decoder_embed_dict = utils.parse_embedding(args.decoder_embed_path)
utils.print_embed_overlap(decoder_embed_dict, task.target_dictionary)
encoder = FConvEncoder(
dictionary=task.source_dictionary,
embed_dim=args.encoder_embed_dim,
embed_dict=encoder_embed_dict,
convolutions=eval(args.encoder_layers),
dropout=args.dropout,
max_positions=args.max_source_positions,
normalization_constant=args.normalization_constant,
)
decoder = FConvDecoder(
dictionary=task.target_dictionary,
embed_dim=args.decoder_embed_dim,
embed_dict=decoder_embed_dict,
convolutions=eval(args.decoder_layers),
out_embed_dim=args.decoder_out_embed_dim,
attention=eval(args.decoder_attention),
dropout=args.dropout,
max_positions=args.max_target_positions,
share_embed=args.share_input_output_embed,
normalization_constant=args.normalization_constant,
)
return FConvModel(encoder, decoder)
@register_model('fconv_lm')
class FConvLanguageModel(FairseqLanguageModel):
def __init__(self, decoder):
super().__init__(decoder)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
parser.add_argument('--dropout', type=float, metavar='D',
help='dropout probability')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-layers', type=str, metavar='EXPR',
help='decoder layers [(dim, kernel_size), ...]')
parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
help='decoder output embedding dimension')
parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
help='comma separated list of adaptive softmax cutoff points. '
'Must be used with adaptive_loss criterion')
parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
help='decoder attention [True, ...]')
parser.add_argument('--normalization-constant', type=float, metavar='D',
help='multiplies the result of the residual block by sqrt(value)')
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure all arguments are present in older models
base_lm_architecture(args)
if hasattr(args, 'max_target_positions'):
args.tokens_per_sample = args.max_target_positions
decoder = FConvDecoder(
dictionary=task.target_dictionary,
embed_dim=args.decoder_embed_dim,
convolutions=eval(args.decoder_layers),
out_embed_dim=args.decoder_embed_dim,
attention=eval(args.decoder_attention),
dropout=args.dropout,
max_positions=args.tokens_per_sample,
share_embed=False,
positional_embeddings=False,
adaptive_softmax_cutoff=(
options.eval_str_list(args.adaptive_softmax_cutoff, type=int)
if args.criterion == 'adaptive_loss' else None
),
normalization_constant=args.normalization_constant,
)
return FConvLanguageModel(decoder)
class FConvEncoder(FairseqEncoder):
"""Convolutional encoder"""
def __init__(
self, dictionary, embed_dim=512, embed_dict=None, max_positions=1024,
convolutions=((512, 3),) * 20, dropout=0.1, normalization_constant=0.5,
left_pad=True,
):
super().__init__(dictionary)
self.dropout = dropout
self.normalization_constant = normalization_constant
self.left_pad = left_pad
self.num_attention_layers = None
num_embeddings = len(dictionary)
self.padding_idx = dictionary.pad()
self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx)
if embed_dict:
self.embed_tokens = utils.load_embedding(embed_dict, self.dictionary, self.embed_tokens)
self.embed_positions = PositionalEmbedding(
max_positions,
embed_dim,
self.padding_idx,
left_pad=self.left_pad,
)
convolutions = extend_conv_spec(convolutions)
in_channels = convolutions[0][0]
self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
self.projections = nn.ModuleList()
self.convolutions = nn.ModuleList()
self.residuals = []
layer_in_channels = [in_channels]
for i, (out_channels, kernel_size, residual) in enumerate(convolutions):
if residual == 0:
residual_dim = out_channels
else:
residual_dim = layer_in_channels[-residual]
self.projections.append(Linear(residual_dim, out_channels)
if residual_dim != out_channels else None)
if kernel_size % 2 == 1:
padding = kernel_size // 2
else:
padding = 0
self.convolutions.append(
ConvTBC(in_channels, out_channels * 2, kernel_size,
dropout=dropout, padding=padding)
)
self.residuals.append(residual)
in_channels = out_channels
layer_in_channels.append(out_channels)
self.fc2 = Linear(in_channels, embed_dim)
def forward(self, src_tokens, src_lengths):
# embed tokens and positions
x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)
x = F.dropout(x, p=self.dropout, training=self.training)
input_embedding = x
# project to size of convolution
x = self.fc1(x)
# used to mask padding in input
encoder_padding_mask = src_tokens.eq(self.padding_idx).t() # -> T x B
if not encoder_padding_mask.any():
encoder_padding_mask = None
# B x T x C -> T x B x C
x = x.transpose(0, 1)
residuals = [x]
# temporal convolutions
for proj, conv, res_layer in zip(self.projections, self.convolutions, self.residuals):
if res_layer > 0:
residual = residuals[-res_layer]
residual = residual if proj is None else proj(residual)
else:
residual = None
if encoder_padding_mask is not None:
x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)
x = F.dropout(x, p=self.dropout, training=self.training)
if conv.kernel_size[0] % 2 == 1:
# padding is implicit in the conv
x = conv(x)
else:
padding_l = (conv.kernel_size[0] - 1) // 2
padding_r = conv.kernel_size[0] // 2
x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
x = conv(x)
x = F.glu(x, dim=2)
if residual is not None:
x = (x + residual) * math.sqrt(self.normalization_constant)
residuals.append(x)
# T x B x C -> B x T x C
x = x.transpose(1, 0)
# project back to size of embedding
x = self.fc2(x)
if encoder_padding_mask is not None:
encoder_padding_mask = encoder_padding_mask.t() # -> B x T
x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)
# scale gradients (this only affects backward, not forward)
x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))
# add output to input embedding for attention
y = (x + input_embedding) * math.sqrt(self.normalization_constant)
return {
'encoder_out': (x, y),
'encoder_padding_mask': encoder_padding_mask, # B x T
}
def reorder_encoder_out(self, encoder_out, new_order):
if encoder_out['encoder_out'] is not None:
encoder_out['encoder_out'] = (
encoder_out['encoder_out'][0].index_select(0, new_order),
encoder_out['encoder_out'][1].index_select(0, new_order),
)
if encoder_out['encoder_padding_mask'] is not None:
encoder_out['encoder_padding_mask'] = \
encoder_out['encoder_padding_mask'].index_select(0, new_order)
return encoder_out
def max_positions(self):
"""Maximum input length supported by the encoder."""
return self.embed_positions.max_positions()
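# Shape note (editor addition): FConvEncoder.forward returns
#   x: B x T x C, the top-of-stack encoder states (used to score attention), and
#   y: B x T x C, x combined with the input embeddings (used as attention values),
# packaged as encoder_out=(x, y) alongside an optional B x T padding mask.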
class AttentionLayer(nn.Module):
def __init__(self, conv_channels, embed_dim, normalization_constant=0.5, bmm=None):
super().__init__()
self.normalization_constant = normalization_constant
# projects from output of convolution to embedding dimension
self.in_projection = Linear(conv_channels, embed_dim)
# projects from embedding dimension to convolution size
self.out_projection = Linear(embed_dim, conv_channels)
self.bmm = bmm if bmm is not None else torch.bmm
def forward(self, x, target_embedding, encoder_out, encoder_padding_mask):
residual = x
# attention
x = (self.in_projection(x) + target_embedding) * math.sqrt(self.normalization_constant)
x = self.bmm(x, encoder_out[0])
# don't attend over padding
if encoder_padding_mask is not None:
x = x.float().masked_fill(
encoder_padding_mask.unsqueeze(1),
float('-inf')
).type_as(x) # FP16 support: cast to float and back
# softmax over last dim
sz = x.size()
x = F.softmax(x.view(sz[0] * sz[1], sz[2]), dim=1)
x = x.view(sz)
attn_scores = x
x = self.bmm(x, encoder_out[1])
# scale attention output (respecting potentially different lengths)
s = encoder_out[1].size(1)
if encoder_padding_mask is None:
x = x * (s * math.sqrt(1.0 / s))
else:
s = s - encoder_padding_mask.type_as(x).sum(dim=1, keepdim=True) # exclude padding
s = s.unsqueeze(-1)
x = x * (s * s.rsqrt())
# project back
x = (self.out_projection(x) + residual) * math.sqrt(self.normalization_constant)
return x, attn_scores
def make_generation_fast_(self, beamable_mm_beam_size=None, **kwargs):
"""Replace torch.bmm with BeamableMM."""
if beamable_mm_beam_size is not None:
del self.bmm
self.add_module('bmm', BeamableMM(beamable_mm_beam_size))
class FConvDecoder(FairseqIncrementalDecoder):
"""Convolutional decoder"""
def __init__(
self, dictionary, embed_dim=512, embed_dict=None, out_embed_dim=256,
max_positions=1024, convolutions=((512, 3),) * 20, attention=True,
dropout=0.1, share_embed=False, positional_embeddings=True,
adaptive_softmax_cutoff=None, normalization_constant=0.5,
left_pad=False,
):
super().__init__(dictionary)
self.register_buffer('version', torch.Tensor([2]))
self.dropout = dropout
self.normalization_constant = normalization_constant
self.left_pad = left_pad
self.need_attn = True
convolutions = extend_conv_spec(convolutions)
in_channels = convolutions[0][0]
if isinstance(attention, bool):
# expand True into [True, True, ...] and do the same with False
attention = [attention] * len(convolutions)
if not isinstance(attention, list) or len(attention) != len(convolutions):
raise ValueError('Attention is expected to be a list of booleans of '
'length equal to the number of layers.')
num_embeddings = len(dictionary)
padding_idx = dictionary.pad()
self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
if embed_dict:
self.embed_tokens = utils.load_embedding(embed_dict, self.dictionary, self.embed_tokens)
self.embed_positions = PositionalEmbedding(
max_positions,
embed_dim,
padding_idx,
left_pad=self.left_pad,
) if positional_embeddings else None
self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
self.projections = nn.ModuleList()
self.convolutions = nn.ModuleList()
self.attention = nn.ModuleList()
self.residuals = []
layer_in_channels = [in_channels]
for i, (out_channels, kernel_size, residual) in enumerate(convolutions):
if residual == 0:
residual_dim = out_channels
else:
residual_dim = layer_in_channels[-residual]
self.projections.append(Linear(residual_dim, out_channels)
if residual_dim != out_channels else None)
self.convolutions.append(
LinearizedConv1d(in_channels, out_channels * 2, kernel_size,
padding=(kernel_size - 1), dropout=dropout)
)
self.attention.append(AttentionLayer(out_channels, embed_dim, self.normalization_constant)
if attention[i] else None)
self.residuals.append(residual)
in_channels = out_channels
layer_in_channels.append(out_channels)
self.adaptive_softmax = None
self.fc2 = self.fc3 = None
if adaptive_softmax_cutoff is not None:
assert not share_embed
self.adaptive_softmax = AdaptiveSoftmax(num_embeddings, in_channels, adaptive_softmax_cutoff,
dropout=dropout)
else:
self.fc2 = Linear(in_channels, out_embed_dim)
if share_embed:
assert out_embed_dim == embed_dim, \
"Shared embed weights implies same dimensions " \
" out_embed_dim={} vs embed_dim={}".format(out_embed_dim, embed_dim)
self.fc3 = nn.Linear(out_embed_dim, num_embeddings)
self.fc3.weight = self.embed_tokens.weight
else:
self.fc3 = Linear(out_embed_dim, num_embeddings, dropout=dropout)
def forward(self, prev_output_tokens, encoder_out_dict=None, incremental_state=None):
if encoder_out_dict is not None:
encoder_out = encoder_out_dict['encoder_out']
encoder_padding_mask = encoder_out_dict['encoder_padding_mask']
# split and transpose encoder outputs
encoder_a, encoder_b = self._split_encoder_out(encoder_out, incremental_state)
if self.embed_positions is not None:
pos_embed = self.embed_positions(prev_output_tokens, incremental_state)
else:
pos_embed = 0
if incremental_state is not None:
prev_output_tokens = prev_output_tokens[:, -1:]
x = self._embed_tokens(prev_output_tokens, incremental_state)
# embed tokens and combine with positional embeddings
x += pos_embed
x = F.dropout(x, p=self.dropout, training=self.training)
target_embedding = x
# project to size of convolution
x = self.fc1(x)
# B x T x C -> T x B x C
x = self._transpose_if_training(x, incremental_state)
# temporal convolutions
avg_attn_scores = None
num_attn_layers = len(self.attention)
residuals = [x]
for proj, conv, attention, res_layer in zip(self.projections, self.convolutions, self.attention,
self.residuals):
if res_layer > 0:
residual = residuals[-res_layer]
residual = residual if proj is None else proj(residual)
else:
residual = None
x = F.dropout(x, p=self.dropout, training=self.training)
x = conv(x, incremental_state)
x = F.glu(x, dim=2)
# attention
if attention is not None:
x = self._transpose_if_training(x, incremental_state)
x, attn_scores = attention(x, target_embedding, (encoder_a, encoder_b), encoder_padding_mask)
if not self.training and self.need_attn:
attn_scores = attn_scores / num_attn_layers
if avg_attn_scores is None:
avg_attn_scores = attn_scores
else:
avg_attn_scores.add_(attn_scores)
x = self._transpose_if_training(x, incremental_state)
# residual
if residual is not None:
x = (x + residual) * math.sqrt(self.normalization_constant)
residuals.append(x)
# T x B x C -> B x T x C
x = self._transpose_if_training(x, incremental_state)
# project back to size of vocabulary if not using adaptive softmax
if self.fc2 is not None and self.fc3 is not None:
x = self.fc2(x)
x = F.dropout(x, p=self.dropout, training=self.training)
x = self.fc3(x)
return x, avg_attn_scores
def reorder_incremental_state(self, incremental_state, new_order):
super().reorder_incremental_state(incremental_state, new_order)
encoder_out = utils.get_incremental_state(self, incremental_state, 'encoder_out')
if encoder_out is not None:
encoder_out = tuple(eo.index_select(0, new_order) for eo in encoder_out)
utils.set_incremental_state(self, incremental_state, 'encoder_out', encoder_out)
def max_positions(self):
"""Maximum output length supported by the decoder."""
return self.embed_positions.max_positions() if self.embed_positions is not None else float('inf')
def upgrade_state_dict(self, state_dict):
if state_dict.get('decoder.version', torch.Tensor([1]))[0] < 2:
# old models use incorrect weight norm dimension
for i, conv in enumerate(self.convolutions):
# reconfigure weight norm
nn.utils.remove_weight_norm(conv)
self.convolutions[i] = nn.utils.weight_norm(conv, dim=0)
state_dict['decoder.version'] = torch.Tensor([1])
return state_dict
def make_generation_fast_(self, need_attn=False, **kwargs):
self.need_attn = need_attn
def _embed_tokens(self, tokens, incremental_state):
if incremental_state is not None:
# keep only the last token for incremental forward pass
tokens = tokens[:, -1:]
return self.embed_tokens(tokens)
def _split_encoder_out(self, encoder_out, incremental_state):
"""Split and transpose encoder outputs.
This is cached when doing incremental inference.
"""
cached_result = utils.get_incremental_state(self, incremental_state, 'encoder_out')
if cached_result is not None:
return cached_result
# transpose only once to speed up attention layers
encoder_a, encoder_b = encoder_out
encoder_a = encoder_a.transpose(1, 2).contiguous()
result = (encoder_a, encoder_b)
if incremental_state is not None:
utils.set_incremental_state(self, incremental_state, 'encoder_out', result)
return result
def _transpose_if_training(self, x, incremental_state):
if incremental_state is None:
x = x.transpose(0, 1)
return x
def extend_conv_spec(convolutions):
"""
Extends convolutional spec that is a list of tuples of 2 or 3 parameters
(kernel size, dim size and optionally how many layers behind to look for residual)
to default the residual propagation param if it is not specified
"""
extended = []
for spec in convolutions:
if len(spec) == 3:
extended.append(spec)
elif len(spec) == 2:
extended.append(spec + (1,))
else:
            raise ValueError('invalid number of parameters in convolution spec ' + str(spec) + '. expected 2 or 3')
return tuple(extended)
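# Worked example (editor addition): 2-tuples receive the default residual
# distance of 1, while 3-tuples pass through unchanged.
def _extend_conv_spec_example():
    assert extend_conv_spec([(512, 3), (1024, 3, 0)]) == ((512, 3, 1), (1024, 3, 0))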
def Embedding(num_embeddings, embedding_dim, padding_idx):
m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
nn.init.normal_(m.weight, 0, 0.1)
nn.init.constant_(m.weight[padding_idx], 0)
return m
def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx, left_pad):
m = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx, left_pad)
nn.init.normal_(m.weight, 0, 0.1)
nn.init.constant_(m.weight[padding_idx], 0)
return m
def Linear(in_features, out_features, dropout=0):
"""Weight-normalized Linear layer (input: N x T x C)"""
m = nn.Linear(in_features, out_features)
nn.init.normal_(m.weight, mean=0, std=math.sqrt((1 - dropout) / in_features))
nn.init.constant_(m.bias, 0)
return nn.utils.weight_norm(m)
def LinearizedConv1d(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
"""Weight-normalized Conv1d layer optimized for decoding"""
m = LinearizedConvolution(in_channels, out_channels, kernel_size, **kwargs)
std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
nn.init.normal_(m.weight, mean=0, std=std)
nn.init.constant_(m.bias, 0)
return nn.utils.weight_norm(m, dim=2)
def ConvTBC(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
"""Weight-normalized Conv1d layer"""
from fairseq.modules import ConvTBC
m = ConvTBC(in_channels, out_channels, kernel_size, **kwargs)
std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
nn.init.normal_(m.weight, mean=0, std=std)
nn.init.constant_(m.bias, 0)
return nn.utils.weight_norm(m, dim=2)
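# Editor note: the std used in the two conv initializers above follows the
# variance-preserving scheme from the convolutional seq2seq paper; for a
# layer followed by GLU, std = sqrt(4 * (1 - dropout) / (kernel_size * in_channels)).
def _init_std_example():
    # e.g. a kernel-3 convolution with 512 input channels and dropout 0.1
    std = math.sqrt((4 * (1.0 - 0.1)) / (3 * 512))
    assert abs(std - 0.0484) < 1e-3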
@register_model_architecture('fconv_lm', 'fconv_lm')
def base_lm_architecture(args):
args.dropout = getattr(args, 'dropout', 0.1)
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
args.decoder_layers = getattr(args, 'decoder_layers', '[(1268, 4)] * 13')
args.decoder_attention = getattr(args, 'decoder_attention', 'False')
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
args.normalization_constant = getattr(args, 'normalization_constant', 0.5)
@register_model_architecture('fconv_lm', 'fconv_lm_dauphin_wikitext103')
def fconv_lm_dauphin_wikitext103(args):
layers = '[(850, 6)] * 3'
layers += ' + [(850, 1)] * 1'
layers += ' + [(850, 5)] * 4'
layers += ' + [(850, 1)] * 1'
layers += ' + [(850, 4)] * 3'
layers += ' + [(1024, 4)] * 1'
layers += ' + [(2048, 4)] * 1'
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 280)
args.decoder_layers = getattr(args, 'decoder_layers', layers)
args.decoder_attention = getattr(args, 'decoder_attention', 'False')
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '10000,20000,200000')
base_lm_architecture(args)
@register_model_architecture('fconv_lm', 'fconv_lm_dauphin_gbw')
def fconv_lm_dauphin_gbw(args):
layers = '[(512, 5)]'
layers += ' + [(128, 1, 0), (128, 5, 0), (512, 1, 3)] * 3'
layers += ' + [(512, 1, 0), (512, 5, 0), (1024, 1, 3)] * 3'
layers += ' + [(1024, 1, 0), (1024, 5, 0), (2048, 1, 3)] * 6'
layers += ' + [(1024, 1, 0), (1024, 5, 0), (4096, 1, 3)]'
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
args.decoder_layers = getattr(args, 'decoder_layers', layers)
args.decoder_attention = getattr(args, 'decoder_attention', 'False')
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '10000,50000,200000')
base_lm_architecture(args)
@register_model_architecture('fconv', 'fconv')
def base_architecture(args):
args.dropout = getattr(args, 'dropout', 0.1)
args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512)
args.encoder_embed_path = getattr(args, 'encoder_embed_path', None)
args.encoder_layers = getattr(args, 'encoder_layers', '[(512, 3)] * 20')
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.decoder_embed_path = getattr(args, 'decoder_embed_path', None)
args.decoder_layers = getattr(args, 'decoder_layers', '[(512, 3)] * 20')
args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 256)
args.decoder_attention = getattr(args, 'decoder_attention', 'True')
args.share_input_output_embed = getattr(args, 'share_input_output_embed', False)
args.normalization_constant = getattr(args, 'normalization_constant', 0.5)
@register_model_architecture('fconv', 'fconv_iwslt_de_en')
def fconv_iwslt_de_en(args):
args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 256)
args.encoder_layers = getattr(args, 'encoder_layers', '[(256, 3)] * 4')
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 256)
args.decoder_layers = getattr(args, 'decoder_layers', '[(256, 3)] * 3')
args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 256)
base_architecture(args)
@register_model_architecture('fconv', 'fconv_wmt_en_ro')
def fconv_wmt_en_ro(args):
args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 512)
base_architecture(args)
@register_model_architecture('fconv', 'fconv_wmt_en_de')
def fconv_wmt_en_de(args):
convs = '[(512, 3)] * 9' # first 9 layers have 512 units
convs += ' + [(1024, 3)] * 4' # next 4 layers have 1024 units
convs += ' + [(2048, 1)] * 2' # final 2 layers use 1x1 convolutions
args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 768)
args.encoder_layers = getattr(args, 'encoder_layers', convs)
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 768)
args.decoder_layers = getattr(args, 'decoder_layers', convs)
args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 512)
base_architecture(args)
@register_model_architecture('fconv', 'fconv_wmt_en_fr')
def fconv_wmt_en_fr(args):
convs = '[(512, 3)] * 6' # first 6 layers have 512 units
convs += ' + [(768, 3)] * 4' # next 4 layers have 768 units
convs += ' + [(1024, 3)] * 3' # next 3 layers have 1024 units
convs += ' + [(2048, 1)] * 1' # next 1 layer uses 1x1 convolutions
convs += ' + [(4096, 1)] * 1' # final 1 layer uses 1x1 convolutions
args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 768)
args.encoder_layers = getattr(args, 'encoder_layers', convs)
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 768)
args.decoder_layers = getattr(args, 'decoder_layers', convs)
args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 512)
base_architecture(args)
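# Sketch (editor addition) of how the architecture functions compose: the
# named variants pin a few fields and then call base_architecture(), which
# fills whatever is still missing via getattr(args, name, default), so user
# overrides always win.
def _architecture_defaults_example():
    import argparse
    args = argparse.Namespace(dropout=0.2)
    base_architecture(args)
    assert args.dropout == 0.2              # user override survives
    assert args.encoder_embed_dim == 512    # unset field gets the default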
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from fairseq.modules import (
DownsampledMultiHeadAttention, GradMultiply, LearnedPositionalEmbedding,
LinearizedConvolution,
)
from fairseq import utils
from . import (
FairseqEncoder, CompositeEncoder, FairseqDecoder, FairseqModel,
register_model, register_model_architecture,
)
@register_model('fconv_self_att')
class FConvModelSelfAtt(FairseqModel):
def __init__(self, encoder, decoder, pretrained_encoder=None):
super().__init__(encoder, decoder)
self.encoder.num_attention_layers = sum(layer is not None for layer in decoder.attention)
self.pretrained_encoder = pretrained_encoder
if self.pretrained_encoder is None:
encoders = {'encoder': encoder}
else:
encoders = {'encoder': encoder, 'pretrained': self.pretrained_encoder}
# for fusion model, CompositeEncoder contains both pretrained and training encoders
# these are forwarded and then combined in the decoder
self.encoder = CompositeEncoder(encoders)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
parser.add_argument('--dropout', type=float, metavar='D',
help='dropout probability')
parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
help='encoder embedding dimension')
parser.add_argument('--encoder-layers', type=str, metavar='EXPR',
help='encoder layers [(dim, kernel_size), ...]')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-layers', type=str, metavar='EXPR',
help='decoder layers [(dim, kernel_size), ...]')
parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
help='decoder output embedding dimension')
parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
help='decoder attention [True, ...]')
parser.add_argument('--self-attention', type=str, metavar='EXPR',
help='decoder self-attention layers, ex: [True] + [False]*5')
parser.add_argument('--multihead-attention-nheads', type=int,
help='Number of heads to use in attention')
parser.add_argument('--multihead-self-attention-nheads', type=int,
help='Number of heads to use in self-attention')
parser.add_argument('--encoder-attention', type=str, metavar='EXPR',
help='encoder attention [True, ...]')
parser.add_argument('--encoder-attention-nheads', type=int,
help='Number of heads to use in encoder attention')
parser.add_argument('--project-input', type=str, metavar='EXPR',
help='Use projections in self-attention [True, ...]')
parser.add_argument('--gated-attention', type=str, metavar='EXPR',
help='Use GLU layers in self-attention projections [True, ...]')
parser.add_argument('--downsample', type=str, metavar='EXPR',
help='Use downsampling in self-attention [True, ...]')
parser.add_argument('--pretrained-checkpoint', metavar='DIR',
help='path to load checkpoint from pretrained model')
parser.add_argument('--pretrained', type=str, metavar='EXPR',
help='use pretrained model when training [True, ...]')
@classmethod
def build_model(cls, args, task):
        """Build a new model instance."""
        trained_encoder, trained_decoder = None, None
pretrained = eval(args.pretrained)
if pretrained:
print("| loading pretrained model")
trained_model = utils.load_ensemble_for_inference(
# not actually for inference, but loads pretrained model parameters
filenames=[args.pretrained_checkpoint],
task=task,
)[0][0]
trained_decoder = list(trained_model.children())[1]
trained_encoder = list(trained_model.children())[0]
# freeze pretrained model
for param in trained_decoder.parameters():
param.requires_grad = False
for param in trained_encoder.parameters():
param.requires_grad = False
"""Build a new model instance."""
encoder = FConvEncoder(
task.source_dictionary,
embed_dim=args.encoder_embed_dim,
convolutions=eval(args.encoder_layers),
dropout=args.dropout,
max_positions=args.max_source_positions,
attention=eval(args.encoder_attention),
attention_nheads=args.encoder_attention_nheads
)
decoder = FConvDecoder(
task.target_dictionary,
embed_dim=args.decoder_embed_dim,
convolutions=eval(args.decoder_layers),
out_embed_dim=args.decoder_out_embed_dim,
attention=eval(args.decoder_attention),
dropout=args.dropout,
max_positions=args.max_target_positions,
selfattention=eval(args.self_attention),
attention_nheads=args.multihead_attention_nheads,
selfattention_nheads=args.multihead_self_attention_nheads,
project_input=eval(args.project_input),
gated_attention=eval(args.gated_attention),
downsample=eval(args.downsample),
pretrained=pretrained,
trained_decoder=trained_decoder
)
model = FConvModelSelfAtt(encoder, decoder, trained_encoder)
return model
@property
def pretrained(self):
return self.pretrained_encoder is not None
class FConvEncoder(FairseqEncoder):
"""Convolutional encoder"""
def __init__(
self, dictionary, embed_dim=512, max_positions=1024,
convolutions=((512, 3),) * 20, dropout=0.1, attention=False,
attention_nheads=1, left_pad=True,
):
super().__init__(dictionary)
self.dropout = dropout
self.num_attention_layers = None
self.left_pad = left_pad
num_embeddings = len(dictionary)
self.padding_idx = dictionary.pad()
self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx)
self.embed_positions = PositionalEmbedding(
max_positions,
embed_dim,
self.padding_idx,
left_pad=self.left_pad,
)
def expand_bool_array(val):
if isinstance(val, bool):
# expand True into [True, True, ...] and do the same with False
return [val] * len(convolutions)
return val
attention = expand_bool_array(attention)
in_channels = convolutions[0][0]
self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
self.projections = nn.ModuleList()
self.convolutions = nn.ModuleList()
self.attention = nn.ModuleList()
self.attproj = nn.ModuleList()
for i, (out_channels, kernel_size) in enumerate(convolutions):
self.projections.append(
Linear(in_channels, out_channels) if in_channels != out_channels else None
)
self.convolutions.append(
ConvTBC(in_channels, out_channels * 2, kernel_size, dropout=dropout)
)
self.attention.append(
SelfAttention(out_channels, embed_dim, attention_nheads) if attention[i] else None
)
in_channels = out_channels
self.fc2 = Linear(in_channels, embed_dim)
def forward(self, src_tokens, src_lengths):
# embed tokens and positions
x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)
x = F.dropout(x, p=self.dropout, training=self.training)
input_embedding = x.transpose(0, 1)
# project to size of convolution
x = self.fc1(x)
# B x T x C -> T x B x C
x = x.transpose(0, 1)
# temporal convolutions
for proj, conv, attention in zip(self.projections, self.convolutions, self.attention):
residual = x if proj is None else proj(x)
x = F.dropout(x, p=self.dropout, training=self.training)
padding_l = (conv.kernel_size[0] - 1) // 2
padding_r = conv.kernel_size[0] // 2
x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
x = conv(x)
x = F.glu(x, dim=2)
if attention is not None:
x = attention(x)
x = (x + residual) * math.sqrt(0.5)
# T x B x C -> B x T x C
x = x.transpose(1, 0)
# project back to size of embedding
x = self.fc2(x)
# scale gradients (this only affects backward, not forward)
x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))
# add output to input embedding for attention
y = (x + input_embedding.transpose(0, 1)) * math.sqrt(0.5)
return {
'encoder_out': (x, y),
}
def reorder_encoder_out(self, encoder_out, new_order):
encoder_out['encoder_out'] = tuple(
eo.index_select(0, new_order) for eo in encoder_out['encoder_out']
)
if 'pretrained' in encoder_out:
encoder_out['pretrained']['encoder_out'] = tuple(
eo.index_select(0, new_order)
for eo in encoder_out['pretrained']['encoder_out']
)
return encoder_out
def max_positions(self):
"""Maximum input length supported by the encoder."""
return self.embed_positions.max_positions()
class FConvDecoder(FairseqDecoder):
"""Convolutional decoder"""
def __init__(
self, dictionary, embed_dim=512, out_embed_dim=256, max_positions=1024,
convolutions=((512, 3),) * 8, attention=True, dropout=0.1,
selfattention=False, attention_nheads=1, selfattention_nheads=1,
project_input=False, gated_attention=False, downsample=False,
pretrained=False, trained_decoder=None, left_pad=False,
):
super().__init__(dictionary)
self.register_buffer('version', torch.Tensor([2]))
self.pretrained = pretrained
self.pretrained_decoder = trained_decoder
self.dropout = dropout
self.left_pad = left_pad
self.need_attn = True
in_channels = convolutions[0][0]
def expand_bool_array(val):
if isinstance(val, bool):
# expand True into [True, True, ...] and do the same with False
return [val] * len(convolutions)
return val
attention = expand_bool_array(attention)
selfattention = expand_bool_array(selfattention)
if not isinstance(attention, list) or len(attention) != len(convolutions):
raise ValueError('Attention is expected to be a list of booleans of '
'length equal to the number of layers.')
num_embeddings = len(dictionary)
padding_idx = dictionary.pad()
self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
self.embed_positions = PositionalEmbedding(
max_positions,
embed_dim,
padding_idx,
left_pad=self.left_pad,
)
self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
self.projections = nn.ModuleList()
self.convolutions = nn.ModuleList()
self.attention = nn.ModuleList()
self.selfattention = nn.ModuleList()
self.attproj = nn.ModuleList()
for i, (out_channels, kernel_size) in enumerate(convolutions):
self.projections.append(
Linear(in_channels, out_channels) if in_channels != out_channels else None
)
self.convolutions.append(
LinearizedConv1d(
in_channels, out_channels * 2, kernel_size,
padding=(kernel_size - 1), dropout=dropout,
)
)
self.attention.append(
DownsampledMultiHeadAttention(
out_channels, embed_dim, attention_nheads,
project_input=project_input, gated=False, downsample=False,
) if attention[i] else None
)
self.attproj.append(
Linear(out_channels, embed_dim, dropout=dropout) if attention[i] else None
)
self.selfattention.append(
SelfAttention(
out_channels, embed_dim, selfattention_nheads,
project_input=project_input, gated=gated_attention,
downsample=downsample,
) if selfattention[i] else None
)
in_channels = out_channels
self.fc2 = Linear(in_channels, out_embed_dim)
self.fc3 = Linear(out_embed_dim, num_embeddings, dropout=dropout)
# model fusion
if self.pretrained:
# independent gates are learned from the concatenated input
self.gate1 = nn.Sequential(Linear(out_embed_dim*2, out_embed_dim), nn.Sigmoid())
self.gate2 = nn.Sequential(Linear(out_embed_dim*2, out_embed_dim), nn.Sigmoid())
# pretrained and trained models are joined
self.joining = nn.Sequential(
Linear(out_embed_dim*2, out_embed_dim*2),
nn.LayerNorm(out_embed_dim*2),
nn.GLU(),
Linear(out_embed_dim, out_embed_dim*2),
nn.LayerNorm(out_embed_dim*2),
nn.GLU(),
Linear(out_embed_dim, out_embed_dim),
nn.LayerNorm(out_embed_dim)
)
            # the pretrained model's output layer maps nhid -> vocab size,
            # but the two models are fused in their hidden states, so a
            # forward hook stores the pretrained decoder's fc2 output
self.pretrained_outputs = {}
def save_output():
def hook(a, b, output):
self.pretrained_outputs["out"] = output
return hook
self.pretrained_decoder.fc2.register_forward_hook(save_output())
def forward(self, prev_output_tokens, encoder_out_dict):
encoder_out = encoder_out_dict['encoder']['encoder_out']
trained_encoder_out = encoder_out_dict['pretrained'] if self.pretrained else None
encoder_a, encoder_b = self._split_encoder_out(encoder_out)
# embed positions
positions = self.embed_positions(prev_output_tokens)
# embed tokens and positions
x = self.embed_tokens(prev_output_tokens) + positions
x = F.dropout(x, p=self.dropout, training=self.training)
target_embedding = x.transpose(0, 1)
# project to size of convolution
x = self.fc1(x)
# B x T x C -> T x B x C
x = x.transpose(0, 1)
# temporal convolutions
avg_attn_scores = None
for proj, conv, attention, selfattention, attproj in zip(
self.projections, self.convolutions, self.attention, self.selfattention, self.attproj
):
residual = x if proj is None else proj(x)
x = F.dropout(x, p=self.dropout, training=self.training)
x = conv(x)
x = F.glu(x, dim=2)
# attention
if attention is not None:
r = x
x, attn_scores = attention(attproj(x) + target_embedding, encoder_a, encoder_b)
x = x + r
if not self.training and self.need_attn:
if avg_attn_scores is None:
avg_attn_scores = attn_scores
else:
avg_attn_scores.add_(attn_scores)
if selfattention is not None:
x = selfattention(x)
x = (x + residual) * math.sqrt(0.5)
# T x B x C -> B x T x C
x = x.transpose(0, 1)
# project back to size of vocabulary
x = self.fc2(x)
x = F.dropout(x, p=self.dropout, training=self.training)
if not self.pretrained:
x = self.fc3(x)
# fusion gating
if self.pretrained:
trained_x, _ = self.pretrained_decoder.forward(prev_output_tokens, trained_encoder_out)
y = torch.cat([x, self.pretrained_outputs["out"]], dim=-1)
gate1 = self.gate1(y)
gate2 = self.gate2(y)
gated_x1 = gate1 * x
gated_x2 = gate2 * self.pretrained_outputs["out"]
fusion = torch.cat([gated_x1, gated_x2], dim=-1)
fusion = self.joining(fusion)
fusion_output = self.fc3(fusion)
return fusion_output, avg_attn_scores
else:
return x, avg_attn_scores
def max_positions(self):
"""Maximum output length supported by the decoder."""
return self.embed_positions.max_positions()
def make_generation_fast_(self, need_attn=False, **kwargs):
self.need_attn = need_attn
def _split_encoder_out(self, encoder_out):
"""Split and transpose encoder outputs."""
# transpose only once to speed up attention layers
encoder_a, encoder_b = encoder_out
encoder_a = encoder_a.transpose(0, 1).contiguous()
encoder_b = encoder_b.transpose(0, 1).contiguous()
result = (encoder_a, encoder_b)
return result
class SelfAttention(nn.Module):
def __init__(self, out_channels, embed_dim, num_heads, project_input=False, gated=False, downsample=False):
super().__init__()
self.attention = DownsampledMultiHeadAttention(
out_channels, embed_dim, num_heads, dropout=0, bias=True,
project_input=project_input, gated=gated, downsample=downsample,
)
self.in_proj_q = Linear(out_channels, embed_dim)
self.in_proj_k = Linear(out_channels, embed_dim)
self.in_proj_v = Linear(out_channels, embed_dim)
self.ln = nn.LayerNorm(out_channels)
def forward(self, x):
residual = x
query = self.in_proj_q(x)
key = self.in_proj_k(x)
value = self.in_proj_v(x)
x, _ = self.attention(query, key, value, mask_future_timesteps=True, use_scalar_bias=True)
return self.ln(x + residual)
def Embedding(num_embeddings, embedding_dim, padding_idx):
m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
m.weight.data.normal_(0, 0.1)
return m
def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx, left_pad):
m = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx, left_pad)
m.weight.data.normal_(0, 0.1)
return m
def Linear(in_features, out_features, dropout=0.):
"""Weight-normalized Linear layer (input: N x T x C)"""
m = nn.Linear(in_features, out_features)
m.weight.data.normal_(mean=0, std=math.sqrt((1 - dropout) / in_features))
m.bias.data.zero_()
return m
def LinearizedConv1d(in_channels, out_channels, kernel_size, dropout=0., **kwargs):
"""Weight-normalized Conv1d layer optimized for decoding"""
m = LinearizedConvolution(in_channels, out_channels, kernel_size, **kwargs)
std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
m.weight.data.normal_(mean=0, std=std)
m.bias.data.zero_()
return m
def ConvTBC(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
"""Weight-normalized Conv1d layer"""
from fairseq.modules import ConvTBC
m = ConvTBC(in_channels, out_channels, kernel_size, **kwargs)
std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
m.weight.data.normal_(mean=0, std=std)
m.bias.data.zero_()
return m
@register_model_architecture('fconv_self_att', 'fconv_self_att')
def base_architecture(args):
args.dropout = getattr(args, 'dropout', 0.1)
args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512)
args.encoder_layers = getattr(args, 'encoder_layers', '[(512, 3)] * 3')
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.decoder_layers = getattr(args, 'decoder_layers', '[(512, 3)] * 8')
args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 256)
args.decoder_attention = getattr(args, 'decoder_attention', 'True')
args.self_attention = getattr(args, 'self_attention', 'False')
args.encoder_attention = getattr(args, 'encoder_attention', 'False')
args.multihead_attention_nheads = getattr(args, 'multihead_attention_nheads', 1)
args.multihead_self_attention_nheads = getattr(args, 'multihead_self_attention_nheads', 1)
args.encoder_attention_nheads = getattr(args, 'encoder_attention_nheads', 1)
args.project_input = getattr(args, 'project_input', 'False')
args.gated_attention = getattr(args, 'gated_attention', 'False')
args.downsample = getattr(args, 'downsample', 'False')
args.pretrained_checkpoint = getattr(args, 'pretrained_checkpoint', '')
args.pretrained = getattr(args, 'pretrained', 'False')
@register_model_architecture('fconv_self_att', 'fconv_self_att_wp')
def fconv_self_att_wp(args):
args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 256)
args.encoder_layers = getattr(args, 'encoder_layers', '[(128, 3)] * 2 + [(512,3)] * 1')
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 256)
args.decoder_layers = getattr(args, 'decoder_layers', '[(512, 4)] * 4 + [(768, 4)] * 2 + [(1024, 4)] * 1')
args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 256)
args.self_attention = getattr(args, 'self_attention', 'True')
args.multihead_self_attention_nheads = getattr(args, 'multihead_self_attention_nheads', 4)
args.project_input = getattr(args, 'project_input', 'True')
args.gated_attention = getattr(args, 'gated_attention', 'True')
args.downsample = getattr(args, 'downsample', 'True')
base_architecture(args)
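# Worked sketch (editor addition) of the fusion gating in FConvDecoder above:
# two sigmoid gates computed from the concatenated hidden states scale the
# trained and pretrained streams before the joining network combines them.
def _fusion_gating_example():
    B, T, C = 2, 4, 8
    x = torch.randn(B, T, C)                 # trained decoder state
    pre = torch.randn(B, T, C)               # hooked pretrained fc2 output
    gate1 = nn.Sequential(nn.Linear(2 * C, C), nn.Sigmoid())
    gate2 = nn.Sequential(nn.Linear(2 * C, C), nn.Sigmoid())
    y = torch.cat([x, pre], dim=-1)          # B x T x 2C
    fused = torch.cat([gate1(y) * x, gate2(y) * pre], dim=-1)
    return fused                             # B x T x 2C, input to `joining`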
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import torch
import torch.nn as nn
import torch.nn.functional as F
from fairseq import options, utils
from . import (
FairseqEncoder, FairseqIncrementalDecoder, FairseqModel, register_model,
register_model_architecture,
)
@register_model('lstm')
class LSTMModel(FairseqModel):
def __init__(self, encoder, decoder):
super().__init__(encoder, decoder)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
parser.add_argument('--dropout', type=float, metavar='D',
help='dropout probability')
parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
help='encoder embedding dimension')
parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
help='path to pre-trained encoder embedding')
parser.add_argument('--encoder-hidden-size', type=int, metavar='N',
help='encoder hidden size')
parser.add_argument('--encoder-layers', type=int, metavar='N',
help='number of encoder layers')
parser.add_argument('--encoder-bidirectional', action='store_true',
help='make all layers of encoder bidirectional')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
help='path to pre-trained decoder embedding')
parser.add_argument('--decoder-hidden-size', type=int, metavar='N',
help='decoder hidden size')
parser.add_argument('--decoder-layers', type=int, metavar='N',
help='number of decoder layers')
parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
help='decoder output embedding dimension')
parser.add_argument('--decoder-attention', type=str, metavar='BOOL',
help='decoder attention')
# Granular dropout settings (if not specified these default to --dropout)
parser.add_argument('--encoder-dropout-in', type=float, metavar='D',
help='dropout probability for encoder input embedding')
parser.add_argument('--encoder-dropout-out', type=float, metavar='D',
help='dropout probability for encoder output')
parser.add_argument('--decoder-dropout-in', type=float, metavar='D',
help='dropout probability for decoder input embedding')
parser.add_argument('--decoder-dropout-out', type=float, metavar='D',
help='dropout probability for decoder output')
parser.add_argument('--share-decoder-input-output-embed', default=False,
action='store_true',
help='share decoder input and output embeddings')
parser.add_argument('--share-all-embeddings', default=False, action='store_true',
help='share encoder, decoder and output embeddings'
' (requires shared dictionary and embed dim)')
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure that all args are properly defaulted (in case there are any new ones)
base_architecture(args)
def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
num_embeddings = len(dictionary)
padding_idx = dictionary.pad()
embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
embed_dict = utils.parse_embedding(embed_path)
utils.print_embed_overlap(embed_dict, dictionary)
return utils.load_embedding(embed_dict, dictionary, embed_tokens)
if args.encoder_embed_path:
pretrained_encoder_embed = load_pretrained_embedding_from_file(
args.encoder_embed_path, task.source_dictionary, args.encoder_embed_dim)
else:
num_embeddings = len(task.source_dictionary)
pretrained_encoder_embed = Embedding(
num_embeddings, args.encoder_embed_dim, task.source_dictionary.pad()
)
if args.share_all_embeddings:
# double check all parameters combinations are valid
if task.source_dictionary != task.target_dictionary:
raise RuntimeError('--share-all-embeddings requires a joint dictionary')
if args.decoder_embed_path and (
args.decoder_embed_path != args.encoder_embed_path):
raise RuntimeError(
                    '--share-all-embeddings not compatible with --decoder-embed-path'
)
if args.encoder_embed_dim != args.decoder_embed_dim:
raise RuntimeError(
'--share-all-embeddings requires --encoder-embed-dim to '
'match --decoder-embed-dim'
)
pretrained_decoder_embed = pretrained_encoder_embed
args.share_decoder_input_output_embed = True
else:
# separate decoder input embeddings
pretrained_decoder_embed = None
if args.decoder_embed_path:
pretrained_decoder_embed = load_pretrained_embedding_from_file(
args.decoder_embed_path,
task.target_dictionary,
args.decoder_embed_dim
)
# one last double check of parameter combinations
if args.share_decoder_input_output_embed and (
args.decoder_embed_dim != args.decoder_out_embed_dim):
raise RuntimeError(
                '--share-decoder-input-output-embed requires '
'--decoder-embed-dim to match --decoder-out-embed-dim'
)
encoder = LSTMEncoder(
dictionary=task.source_dictionary,
embed_dim=args.encoder_embed_dim,
hidden_size=args.encoder_hidden_size,
num_layers=args.encoder_layers,
dropout_in=args.encoder_dropout_in,
dropout_out=args.encoder_dropout_out,
bidirectional=args.encoder_bidirectional,
pretrained_embed=pretrained_encoder_embed,
)
decoder = LSTMDecoder(
dictionary=task.target_dictionary,
embed_dim=args.decoder_embed_dim,
hidden_size=args.decoder_hidden_size,
out_embed_dim=args.decoder_out_embed_dim,
num_layers=args.decoder_layers,
dropout_in=args.decoder_dropout_in,
dropout_out=args.decoder_dropout_out,
attention=options.eval_bool(args.decoder_attention),
encoder_embed_dim=args.encoder_embed_dim,
encoder_output_units=encoder.output_units,
pretrained_embed=pretrained_decoder_embed,
share_input_output_embed=args.share_decoder_input_output_embed,
)
return cls(encoder, decoder)
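# A minimal sketch (not part of the model code, never called) of the sharing
# enforced above, with toy sizes: when --share-all-embeddings is set, the
# encoder input, decoder input and, via --share-decoder-input-output-embed,
# the decoder output projection all reuse one weight matrix.
def _shared_embedding_sketch():
    shared = Embedding(num_embeddings=1000, embedding_dim=8, padding_idx=1)
    pretrained_encoder_embed = shared
    pretrained_decoder_embed = shared
    assert pretrained_encoder_embed.weight is pretrained_decoder_embed.weight
    # the decoder then scores with F.linear(x, shared.weight) instead of fc_out
    return shared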
class LSTMEncoder(FairseqEncoder):
"""LSTM encoder."""
def __init__(
self, dictionary, embed_dim=512, hidden_size=512, num_layers=1,
dropout_in=0.1, dropout_out=0.1, bidirectional=False,
left_pad=True, pretrained_embed=None, padding_value=0.,
):
super().__init__(dictionary)
self.num_layers = num_layers
self.dropout_in = dropout_in
self.dropout_out = dropout_out
self.bidirectional = bidirectional
self.hidden_size = hidden_size
num_embeddings = len(dictionary)
self.padding_idx = dictionary.pad()
if pretrained_embed is None:
self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx)
else:
self.embed_tokens = pretrained_embed
self.lstm = LSTM(
input_size=embed_dim,
hidden_size=hidden_size,
num_layers=num_layers,
dropout=self.dropout_out if num_layers > 1 else 0.,
bidirectional=bidirectional,
)
self.left_pad = left_pad
self.padding_value = padding_value
self.output_units = hidden_size
if bidirectional:
self.output_units *= 2
def forward(self, src_tokens, src_lengths):
if self.left_pad:
# convert left-padding to right-padding
src_tokens = utils.convert_padding_direction(
src_tokens,
self.padding_idx,
left_to_right=True,
)
bsz, seqlen = src_tokens.size()
# embed tokens
x = self.embed_tokens(src_tokens)
x = F.dropout(x, p=self.dropout_in, training=self.training)
# B x T x C -> T x B x C
x = x.transpose(0, 1)
# pack embedded source tokens into a PackedSequence
packed_x = nn.utils.rnn.pack_padded_sequence(x, src_lengths.data.tolist())
# apply LSTM
if self.bidirectional:
state_size = 2 * self.num_layers, bsz, self.hidden_size
else:
state_size = self.num_layers, bsz, self.hidden_size
h0 = x.data.new(*state_size).zero_()
c0 = x.data.new(*state_size).zero_()
packed_outs, (final_hiddens, final_cells) = self.lstm(packed_x, (h0, c0))
# unpack outputs and apply dropout
x, _ = nn.utils.rnn.pad_packed_sequence(packed_outs, padding_value=self.padding_value)
x = F.dropout(x, p=self.dropout_out, training=self.training)
assert list(x.size()) == [seqlen, bsz, self.output_units]
if self.bidirectional:
def combine_bidir(outs):
return torch.cat([
torch.cat([outs[2 * i], outs[2 * i + 1]], dim=0).view(1, bsz, self.output_units)
for i in range(self.num_layers)
], dim=0)
final_hiddens = combine_bidir(final_hiddens)
final_cells = combine_bidir(final_cells)
encoder_padding_mask = src_tokens.eq(self.padding_idx).t()
return {
'encoder_out': (x, final_hiddens, final_cells),
'encoder_padding_mask': encoder_padding_mask if encoder_padding_mask.any() else None
}
def reorder_encoder_out(self, encoder_out, new_order):
encoder_out['encoder_out'] = tuple(
eo.index_select(1, new_order)
for eo in encoder_out['encoder_out']
)
if encoder_out['encoder_padding_mask'] is not None:
encoder_out['encoder_padding_mask'] = \
encoder_out['encoder_padding_mask'].index_select(1, new_order)
return encoder_out
def max_positions(self):
"""Maximum input length supported by the encoder."""
return int(1e5) # an arbitrary large number
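# A standalone sketch (illustration only, never called) of the
# pack -> LSTM -> pad pipeline used in LSTMEncoder.forward above, with toy
# sizes (T=5, B=2). Batches must be sorted by decreasing length, as fairseq
# provides them.
def _pack_pad_sketch():
    T, B, C, H = 5, 2, 4, 3
    x = torch.randn(T, B, C)  # T x B x C, right-padded
    lengths = [5, 3]
    packed = nn.utils.rnn.pack_padded_sequence(x, lengths)
    lstm = nn.LSTM(C, H)
    packed_outs, (h_n, c_n) = lstm(packed)
    out, _ = nn.utils.rnn.pad_packed_sequence(packed_outs, padding_value=0.)
    assert list(out.size()) == [T, B, H]  # steps past each length are zeros
    return out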
class AttentionLayer(nn.Module):
def __init__(self, input_embed_dim, output_embed_dim):
super().__init__()
self.input_proj = Linear(input_embed_dim, output_embed_dim, bias=False)
self.output_proj = Linear(input_embed_dim + output_embed_dim, output_embed_dim, bias=False)
def forward(self, input, source_hids, encoder_padding_mask):
# input: bsz x input_embed_dim
# source_hids: srclen x bsz x output_embed_dim
# x: bsz x output_embed_dim
x = self.input_proj(input)
# compute attention
attn_scores = (source_hids * x.unsqueeze(0)).sum(dim=2)
# don't attend over padding
if encoder_padding_mask is not None:
attn_scores = attn_scores.float().masked_fill_(
encoder_padding_mask,
float('-inf')
).type_as(attn_scores) # FP16 support: cast to float and back
attn_scores = F.softmax(attn_scores, dim=0) # srclen x bsz
# sum weighted sources
x = (attn_scores.unsqueeze(2) * source_hids).sum(dim=0)
x = torch.tanh(self.output_proj(torch.cat((x, input), dim=1)))
return x, attn_scores
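# A shape walk-through of the attention above with toy sizes (srclen=3, bsz=2,
# input dim 4, output dim 5). Illustration only, never called.
def _attention_shape_check():
    attn = AttentionLayer(input_embed_dim=4, output_embed_dim=5)
    query = torch.randn(2, 4)             # bsz x input_embed_dim
    source_hids = torch.randn(3, 2, 5)    # srclen x bsz x output_embed_dim
    out, scores = attn(query, source_hids, encoder_padding_mask=None)
    assert out.size() == (2, 5) and scores.size() == (3, 2)
    # attention weights normalize over the source length dimension
    assert torch.allclose(scores.sum(dim=0), torch.ones(2))
    return out, scores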
class LSTMDecoder(FairseqIncrementalDecoder):
"""LSTM decoder."""
def __init__(
self, dictionary, embed_dim=512, hidden_size=512, out_embed_dim=512,
num_layers=1, dropout_in=0.1, dropout_out=0.1, attention=True,
encoder_embed_dim=512, encoder_output_units=512, pretrained_embed=None,
share_input_output_embed=False,
):
super().__init__(dictionary)
self.dropout_in = dropout_in
self.dropout_out = dropout_out
self.hidden_size = hidden_size
self.share_input_output_embed = share_input_output_embed
self.need_attn = True
num_embeddings = len(dictionary)
padding_idx = dictionary.pad()
if pretrained_embed is None:
self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
else:
self.embed_tokens = pretrained_embed
self.encoder_output_units = encoder_output_units
assert encoder_output_units == hidden_size, \
'encoder_output_units ({}) != hidden_size ({})'.format(encoder_output_units, hidden_size)
# TODO: add another Linear layer to project if the dimensions are not equal
self.layers = nn.ModuleList([
LSTMCell(
input_size=encoder_output_units + embed_dim if layer == 0 else hidden_size,
hidden_size=hidden_size,
)
for layer in range(num_layers)
])
self.attention = AttentionLayer(encoder_output_units, hidden_size) if attention else None
if hidden_size != out_embed_dim:
self.additional_fc = Linear(hidden_size, out_embed_dim)
if not self.share_input_output_embed:
self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
def forward(self, prev_output_tokens, encoder_out_dict, incremental_state=None):
encoder_out = encoder_out_dict['encoder_out']
encoder_padding_mask = encoder_out_dict['encoder_padding_mask']
if incremental_state is not None:
prev_output_tokens = prev_output_tokens[:, -1:]
bsz, seqlen = prev_output_tokens.size()
# get outputs from encoder
encoder_outs, _, _ = encoder_out[:3]
srclen = encoder_outs.size(0)
# embed tokens
x = self.embed_tokens(prev_output_tokens)
x = F.dropout(x, p=self.dropout_in, training=self.training)
# B x T x C -> T x B x C
x = x.transpose(0, 1)
# initialize previous states (or get from cache during incremental generation)
cached_state = utils.get_incremental_state(self, incremental_state, 'cached_state')
if cached_state is not None:
prev_hiddens, prev_cells, input_feed = cached_state
else:
_, encoder_hiddens, encoder_cells = encoder_out[:3]
num_layers = len(self.layers)
prev_hiddens = [encoder_hiddens[i] for i in range(num_layers)]
prev_cells = [encoder_cells[i] for i in range(num_layers)]
input_feed = x.data.new(bsz, self.encoder_output_units).zero_()
attn_scores = x.data.new(srclen, seqlen, bsz).zero_()
outs = []
for j in range(seqlen):
# input feeding: concatenate context vector from previous time step
input = torch.cat((x[j, :, :], input_feed), dim=1)
for i, rnn in enumerate(self.layers):
# recurrent cell
hidden, cell = rnn(input, (prev_hiddens[i], prev_cells[i]))
# hidden state becomes the input to the next layer
input = F.dropout(hidden, p=self.dropout_out, training=self.training)
# save state for next time step
prev_hiddens[i] = hidden
prev_cells[i] = cell
# apply attention using the last layer's hidden state
if self.attention is not None:
out, attn_scores[:, j, :] = self.attention(hidden, encoder_outs, encoder_padding_mask)
else:
out = hidden
out = F.dropout(out, p=self.dropout_out, training=self.training)
# input feeding
input_feed = out
# save final output
outs.append(out)
# cache previous states (no-op except during incremental generation)
utils.set_incremental_state(
self, incremental_state, 'cached_state', (prev_hiddens, prev_cells, input_feed))
# collect outputs across time steps
x = torch.cat(outs, dim=0).view(seqlen, bsz, self.hidden_size)
# T x B x C -> B x T x C
x = x.transpose(1, 0)
# srclen x tgtlen x bsz -> bsz x tgtlen x srclen
if not self.training and self.need_attn:
attn_scores = attn_scores.transpose(0, 2)
else:
attn_scores = None
# project back to size of vocabulary
if hasattr(self, 'additional_fc'):
x = self.additional_fc(x)
x = F.dropout(x, p=self.dropout_out, training=self.training)
if self.share_input_output_embed:
x = F.linear(x, self.embed_tokens.weight)
else:
x = self.fc_out(x)
return x, attn_scores
def reorder_incremental_state(self, incremental_state, new_order):
super().reorder_incremental_state(incremental_state, new_order)
cached_state = utils.get_incremental_state(self, incremental_state, 'cached_state')
if cached_state is None:
return
def reorder_state(state):
if isinstance(state, list):
return [reorder_state(state_i) for state_i in state]
return state.index_select(0, new_order)
new_state = tuple(map(reorder_state, cached_state))
utils.set_incremental_state(self, incremental_state, 'cached_state', new_state)
def max_positions(self):
"""Maximum output length supported by the decoder."""
return int(1e5) # an arbitrary large number
def make_generation_fast_(self, need_attn=False, **kwargs):
self.need_attn = need_attn
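# A toy restatement (illustration only, never called) of the input-feeding
# recurrence implemented in LSTMDecoder.forward above: the context vector from
# step j-1 is concatenated to the token embedding at step j before the first
# cell. Uses a plain nn.LSTMCell and random stand-in embeddings.
def _input_feeding_sketch():
    E, H, B, T = 4, 4, 2, 3
    cell = nn.LSTMCell(E + H, H)          # input = embedding ++ previous context
    h = torch.zeros(B, H)
    c = torch.zeros(B, H)
    input_feed = torch.zeros(B, H)        # context from the previous time step
    for j in range(T):
        emb = torch.randn(B, E)           # stand-in for the embedded token j
        h, c = cell(torch.cat((emb, input_feed), dim=1), (h, c))
        input_feed = h                    # without attention, feed hidden back
    return h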
def Embedding(num_embeddings, embedding_dim, padding_idx):
m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
nn.init.uniform_(m.weight, -0.1, 0.1)
nn.init.constant_(m.weight[padding_idx], 0)
return m
def LSTM(input_size, hidden_size, **kwargs):
m = nn.LSTM(input_size, hidden_size, **kwargs)
for name, param in m.named_parameters():
if 'weight' in name or 'bias' in name:
param.data.uniform_(-0.1, 0.1)
return m
def LSTMCell(input_size, hidden_size, **kwargs):
m = nn.LSTMCell(input_size, hidden_size, **kwargs)
for name, param in m.named_parameters():
if 'weight' in name or 'bias' in name:
param.data.uniform_(-0.1, 0.1)
return m
def Linear(in_features, out_features, bias=True, dropout=0):
"""Linear layer (input: N x T x C)"""
m = nn.Linear(in_features, out_features, bias=bias)
m.weight.data.uniform_(-0.1, 0.1)
if bias:
m.bias.data.uniform_(-0.1, 0.1)
return m
@register_model_architecture('lstm', 'lstm')
def base_architecture(args):
args.dropout = getattr(args, 'dropout', 0.1)
args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512)
args.encoder_embed_path = getattr(args, 'encoder_embed_path', None)
args.encoder_hidden_size = getattr(args, 'encoder_hidden_size', args.encoder_embed_dim)
args.encoder_layers = getattr(args, 'encoder_layers', 1)
args.encoder_bidirectional = getattr(args, 'encoder_bidirectional', False)
args.encoder_dropout_in = getattr(args, 'encoder_dropout_in', args.dropout)
args.encoder_dropout_out = getattr(args, 'encoder_dropout_out', args.dropout)
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.decoder_embed_path = getattr(args, 'decoder_embed_path', None)
args.decoder_hidden_size = getattr(args, 'decoder_hidden_size', args.decoder_embed_dim)
args.decoder_layers = getattr(args, 'decoder_layers', 1)
args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 512)
args.decoder_attention = getattr(args, 'decoder_attention', '1')
args.decoder_dropout_in = getattr(args, 'decoder_dropout_in', args.dropout)
args.decoder_dropout_out = getattr(args, 'decoder_dropout_out', args.dropout)
args.share_decoder_input_output_embed = getattr(args, 'share_decoder_input_output_embed', False)
args.share_all_embeddings = getattr(args, 'share_all_embeddings', False)
@register_model_architecture('lstm', 'lstm_wiseman_iwslt_de_en')
def lstm_wiseman_iwslt_de_en(args):
args.dropout = getattr(args, 'dropout', 0.1)
args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 256)
args.encoder_dropout_in = getattr(args, 'encoder_dropout_in', 0)
args.encoder_dropout_out = getattr(args, 'encoder_dropout_out', 0)
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 256)
args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 256)
args.decoder_dropout_in = getattr(args, 'decoder_dropout_in', 0)
args.decoder_dropout_out = getattr(args, 'decoder_dropout_out', args.dropout)
base_architecture(args)
@register_model_architecture('lstm', 'lstm_luong_wmt_en_de')
def lstm_luong_wmt_en_de(args):
args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 1000)
args.encoder_layers = getattr(args, 'encoder_layers', 4)
args.encoder_dropout_out = getattr(args, 'encoder_dropout_out', 0)
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1000)
args.decoder_layers = getattr(args, 'decoder_layers', 4)
args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 1000)
args.decoder_dropout_out = getattr(args, 'decoder_dropout_out', 0)
base_architecture(args)
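# A hypothetical example of extending the registry above with a custom preset;
# the name 'lstm_tiny_example' and its sizes are made up for illustration.
@register_model_architecture('lstm', 'lstm_tiny_example')
def lstm_tiny_example(args):
    args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 128)
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
    args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 128)
    base_architecture(args)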
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
torch.set_printoptions(threshold=500000000, linewidth=1024)
from fairseq import options
from fairseq import utils
from apex.contrib.multihead_attn import SelfMultiheadAttn
from apex.contrib.multihead_attn import EncdecMultiheadAttn
from fairseq.modules import (
AdaptiveSoftmax, LearnedPositionalEmbedding, SinusoidalPositionalEmbedding,
)
from . import (
FairseqIncrementalDecoder, FairseqDecoder, FairseqEncoder, FairseqLanguageModel, FairseqModel, register_model,
register_model_architecture,
)
from apex.normalization.fused_layer_norm import FusedLayerNorm
@torch.jit.script
def jit_dropout_add(x, residual, prob, is_training):
# type: (Tensor, Tensor, float, bool) -> Tensor
# TODO: Bug in PyTorch 19.07 prevents propagation of the "training" argument
#out = F.dropout(x, p=prob, training=is_training)
out = F.dropout(x, p=prob, training=True)
out = residual + out
return out
# TODO: Remove this workaround (added to enable ReLU+dropout fusion)
# once the underlying bug is fixed
if hasattr(torch._C, '_jit_set_profiling_executor'):
torch._C._jit_set_profiling_executor(False)
if hasattr(torch._C, '_jit_set_profiling_mode'):
torch._C._jit_set_profiling_mode(False)
@torch.jit.script
def jit_relu_dropout(x, prob, is_training):
# type: (Tensor, float, bool) -> Tensor
out = F.threshold(x, 0., 0.)
out = F.dropout(out, p=prob, training=True)
return out
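# A sanity-check sketch for the two scripted helpers above, relying only on
# this file's imports. With p=0 both reduce to their unfused equivalents, so
# the asserts hold regardless of the dropout RNG. Never called; illustration
# only.
def _check_jit_helpers():
    x = torch.randn(8, 16)
    residual = torch.randn(8, 16)
    out = jit_dropout_add(x, residual, 0.0, True)   # p=0 -> plain residual add
    assert torch.allclose(out, x + residual)
    out = jit_relu_dropout(x, 0.0, True)            # p=0 -> plain ReLU
    assert torch.allclose(out, F.threshold(x, 0., 0.))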
@register_model('transformer')
class TransformerModel(FairseqModel):
def __init__(self, encoder, decoder):
super().__init__(encoder, decoder)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
parser.add_argument('--dropout', type=float, metavar='D',
help='dropout probability')
parser.add_argument('--attention-dropout', type=float, metavar='D',
help='dropout probability for attention weights')
parser.add_argument('--relu-dropout', type=float, metavar='D',
help='dropout probability after ReLU in FFN')
parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
help='path to pre-trained encoder embedding')
parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
help='encoder embedding dimension')
parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
help='encoder embedding dimension for FFN')
parser.add_argument('--encoder-layers', type=int, metavar='N',
help='num encoder layers')
parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
help='num encoder attention heads')
parser.add_argument('--encoder-normalize-before', action='store_true',
help='apply layernorm before each encoder block')
parser.add_argument('--encoder-learned-pos', action='store_true',
help='use learned positional embeddings in the encoder')
parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
help='path to pre-trained decoder embedding')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
help='decoder embedding dimension for FFN')
parser.add_argument('--decoder-layers', type=int, metavar='N',
help='num decoder layers')
parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
help='num decoder attention heads')
parser.add_argument('--decoder-learned-pos', action='store_true',
help='use learned positional embeddings in the decoder')
parser.add_argument('--decoder-normalize-before', action='store_true',
help='apply layernorm before each decoder block')
parser.add_argument('--share-decoder-input-output-embed', action='store_true',
help='share decoder input and output embeddings')
parser.add_argument('--share-all-embeddings', action='store_true',
help='share encoder, decoder and output embeddings'
' (requires shared dictionary and embed dim)')
parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
help='comma separated list of adaptive softmax cutoff points. '
'Must be used with adaptive_loss criterion')
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure all arguments are present in older models
base_architecture(args)
if not hasattr(args, 'max_source_positions'):
args.max_source_positions = 256
if not hasattr(args, 'max_target_positions'):
args.max_target_positions = 256
src_dict, tgt_dict = task.source_dictionary, task.target_dictionary
def build_embedding(dictionary, embed_dim, path=None):
num_embeddings = len(dictionary)
padding_idx = dictionary.pad()
emb = Embedding(num_embeddings, embed_dim, padding_idx)
# if provided, load from preloaded dictionaries
if path:
embed_dict = utils.parse_embedding(path)
utils.load_embedding(embed_dict, dictionary, emb)
return emb
if args.share_all_embeddings:
if src_dict != tgt_dict:
raise RuntimeError('--share-all-embeddings requires a joined dictionary')
if args.encoder_embed_dim != args.decoder_embed_dim:
raise RuntimeError(
'--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim')
if args.decoder_embed_path and (
args.decoder_embed_path != args.encoder_embed_path):
raise RuntimeError('--share-all-embeddings not compatible with --decoder-embed-path')
encoder_embed_tokens = build_embedding(
src_dict, args.encoder_embed_dim, args.encoder_embed_path
)
decoder_embed_tokens = encoder_embed_tokens
args.share_decoder_input_output_embed = True
else:
encoder_embed_tokens = build_embedding(
src_dict, args.encoder_embed_dim, args.encoder_embed_path
)
decoder_embed_tokens = build_embedding(
tgt_dict, args.decoder_embed_dim, args.decoder_embed_path
)
encoder = TransformerEncoder(args, src_dict, encoder_embed_tokens)
decoder = TransformerDecoder(args, tgt_dict, decoder_embed_tokens)
return TransformerModel(encoder, decoder)
@register_model('transformer_lm')
class TransformerLanguageModel(FairseqLanguageModel):
def __init__(self, decoder):
super().__init__(decoder)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
help='dropout probability')
parser.add_argument('--attention-dropout', default=0., type=float, metavar='D',
help='dropout probability for attention weights')
parser.add_argument('--relu-dropout', default=0., type=float, metavar='D',
help='dropout probability after ReLU in FFN')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
help='decoder embedding dimension for FFN')
parser.add_argument('--decoder-layers', type=int, metavar='N',
help='num decoder layers')
parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
help='num decoder attention heads')
parser.add_argument('--decoder-normalize-before', default=False, action='store_true',
help='apply layernorm before each decoder block')
parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
help='comma separated list of adaptive softmax cutoff points. '
'Must be used with adaptive_loss criterion')
parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
help='if set, disables positional embeddings (outside self attention)')
parser.add_argument('--share-decoder-input-output-embed', default=False, action='store_true',
help='share decoder input and output embeddings')
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure all arguments are present in older models
base_lm_architecture(args)
if not hasattr(args, 'max_source_positions'):
args.max_source_positions = args.tokens_per_sample
if not hasattr(args, 'max_target_positions'):
args.max_target_positions = args.tokens_per_sample
embed_tokens = Embedding(len(task.dictionary), args.decoder_embed_dim, task.dictionary.pad())
decoder = TransformerDecoder(args, task.dictionary, embed_tokens, no_encoder_attn=True)
return TransformerLanguageModel(decoder)
class TransformerEncoder(FairseqEncoder):
"""Transformer encoder."""
def __init__(self, args, dictionary, embed_tokens, left_pad=True):
super().__init__(dictionary)
self.dropout = args.dropout
embed_dim = embed_tokens.embedding_dim
self.padding_idx = embed_tokens.padding_idx
self.max_source_positions = args.max_source_positions
self.embed_tokens = embed_tokens
self.embed_scale = math.sqrt(embed_dim)
self.embed_positions = PositionalEmbedding(
args.max_source_positions, embed_dim, self.padding_idx,
left_pad=left_pad,
learned=args.encoder_learned_pos,
) if not args.no_token_positional_embeddings else None
self.layers = nn.ModuleList([])
self.layers.extend([
TransformerEncoderLayer(args)
for i in range(args.encoder_layers)
])
self.normalize = args.encoder_normalize_before
if self.normalize:
self.layer_norm = FusedLayerNorm(embed_dim)
def forward(self, src_tokens, src_lengths):
# embed tokens and positions
x = self.embed_scale * self.embed_tokens(src_tokens)
if self.embed_positions is not None:
x += self.embed_positions(src_tokens)
x = F.dropout(x, p=self.dropout, training=self.training)
# B x T x C -> T x B x C
# The tensor needs to be copied after the transpose because
# fused dropout cannot handle strided data
x = x.transpose(0, 1)
# TODO: KJS Remove this contiguous() call once the C++ multihead attention can accommodate strided inputs
# Is it a bug that .contiguous() doesn't normalize the strides when the tensor is already contiguous?
if x.size(1) == 1 :
if x.is_contiguous() :
x = x.view(x.size(0), x.size(1), x.size(2))
else :
x = x.contiguous()
else :
x = x.contiguous()
# compute padding mask
encoder_padding_mask = src_tokens.eq(self.padding_idx).to(torch.uint8)
# encoder layers
for layer in self.layers:
x = layer(x, encoder_padding_mask)
if self.normalize:
x = self.layer_norm(x)
return {
'encoder_out': x, # T x B x C
'encoder_padding_mask': encoder_padding_mask, # B x T
}
def reorder_encoder_out(self, encoder_out, new_order):
if encoder_out['encoder_out'] is not None:
encoder_out['encoder_out'] = \
encoder_out['encoder_out'].index_select(1, new_order)
if encoder_out['encoder_padding_mask'] is not None:
encoder_out['encoder_padding_mask'] = \
encoder_out['encoder_padding_mask'].index_select(0, new_order)
return encoder_out
def max_positions(self):
"""Maximum input length supported by the encoder."""
if self.embed_positions is None:
return self.max_source_positions
return min(self.max_source_positions, self.embed_positions.max_positions())
def upgrade_state_dict(self, state_dict):
if isinstance(self.embed_positions, SinusoidalPositionalEmbedding):
if 'encoder.embed_positions.weights' in state_dict:
del state_dict['encoder.embed_positions.weights']
state_dict['encoder.embed_positions._float_tensor'] = torch.FloatTensor(1)
return state_dict
#class TransformerDecoder(FairseqIncrementalDecoder):
class TransformerDecoder(FairseqDecoder):
"""Transformer decoder."""
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, left_pad=False):
super().__init__(dictionary)
self.dropout = args.dropout
self.share_input_output_embed = args.share_decoder_input_output_embed
embed_dim = embed_tokens.embedding_dim
padding_idx = embed_tokens.padding_idx
self.max_target_positions = args.max_target_positions
self.embed_tokens = embed_tokens
self.embed_scale = math.sqrt(embed_dim)
self.embed_positions = PositionalEmbedding(
args.max_target_positions, embed_dim, padding_idx,
left_pad=left_pad,
learned=args.decoder_learned_pos,
) if not args.no_token_positional_embeddings else None
self.layers = nn.ModuleList([])
self.layers.extend([
TransformerDecoderLayer(args, no_encoder_attn)
for _ in range(args.decoder_layers)
])
self.adaptive_softmax = None
if args.adaptive_softmax_cutoff is not None:
self.adaptive_softmax = AdaptiveSoftmax(
len(dictionary), args.decoder_embed_dim,
options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
dropout=args.dropout
)
elif not self.share_input_output_embed:
self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), embed_dim))
nn.init.normal_(self.embed_out, mean=0, std=embed_dim ** -0.5)
self.normalize = args.decoder_normalize_before
if self.normalize:
self.layer_norm = FusedLayerNorm(embed_dim)
def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None):
# embed positions
positions = self.embed_positions(
prev_output_tokens,
incremental_state=incremental_state,
) if self.embed_positions is not None else None
if incremental_state is not None:
prev_output_tokens = prev_output_tokens[:, -1:]
if positions is not None:
positions = positions[:, -1:]
# embed tokens and positions
x = self.embed_scale * self.embed_tokens(prev_output_tokens)
if positions is not None:
x += positions
x = F.dropout(x, p=self.dropout, training=self.training)
# B x T x C -> T x B x C
# The tensor needs to be copied after the transpose because
# fused dropout cannot handle strided data
x = x.transpose(0, 1)
# TODO: KJS Remove this contiguous() call once the C++ multihead attention can accommodate strided inputs
# Is it a bug that .contiguous() doesn't normalize the strides when the tensor is already contiguous?
if x.size(1) == 1 :
if x.is_contiguous() :
x = x.view(x.size(0), x.size(1), x.size(2))
else :
x = x.contiguous()
else :
x = x.contiguous()
attn = None
# decoder layers
for layer in self.layers:
x, attn = layer(
x,
encoder_out['encoder_out'] if encoder_out is not None else None,
torch.triu(torch.ones(x.size(0), x.size(0), device=x.device, dtype=torch.uint8), 1),
encoder_out['encoder_padding_mask'] if encoder_out is not None else None,
incremental_state,
)
if self.normalize:
x = self.layer_norm(x)
# T x B x C -> B x T x C
x = x.transpose(0, 1)
if self.adaptive_softmax is None:
# project back to size of vocabulary
if self.share_input_output_embed:
x = F.linear(x, self.embed_tokens.weight)
else:
x = F.linear(x, self.embed_out)
return x, attn
def max_positions(self):
"""Maximum output length supported by the decoder."""
if self.embed_positions is None:
return self.max_target_positions
return min(self.max_target_positions, self.embed_positions.max_positions())
def upgrade_state_dict(self, state_dict):
if isinstance(self.embed_positions, SinusoidalPositionalEmbedding):
if 'decoder.embed_positions.weights' in state_dict:
del state_dict['decoder.embed_positions.weights']
state_dict['decoder.embed_positions._float_tensor'] = torch.FloatTensor(1)
for i in range(len(self.layers)):
# update layer norms
layer_norm_map = {
'0': 'self_attn_layer_norm',
'1': 'encoder_attn_layer_norm',
'2': 'final_layer_norm'
}
for old, new in layer_norm_map.items():
for m in ('weight', 'bias'):
k = 'decoder.layers.{}.layer_norms.{}.{}'.format(i, old, m)
if k in state_dict:
state_dict['decoder.layers.{}.{}.{}'.format(i, new, m)] = state_dict[k]
del state_dict[k]
return state_dict
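# Illustration (never called) of the causal self-attention mask that
# TransformerDecoder.forward builds inline above with torch.triu(..., 1);
# ones mark the future positions that self-attention must not see.
def _causal_mask_example(T=4):
    mask = torch.triu(torch.ones(T, T, dtype=torch.uint8), 1)
    # tensor([[0, 1, 1, 1],
    #         [0, 0, 1, 1],
    #         [0, 0, 0, 1],
    #         [0, 0, 0, 0]], dtype=torch.uint8)
    return mask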
class TransformerEncoderLayer(nn.Module):
"""Encoder layer block.
In the original paper each operation (multi-head attention or FFN) is
postprocessed with: dropout -> add residual -> layernorm.
In the tensor2tensor code they suggest that learning is more robust when
preprocessing each layer with layernorm and postprocessing with:
dropout -> add residual.
We default to the approach in the paper, but the tensor2tensor approach can
be enabled by setting `normalize_before=True`.
"""
def __init__(self, args):
super().__init__()
self.embed_dim = args.encoder_embed_dim
self.multihead_attn_impl = args.multihead_attn_impl
if args.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd' :
self.self_attn = SelfMultiheadAttn(
self.embed_dim, args.encoder_attention_heads,
dropout=args.attention_dropout,
bias=False,
include_norm_add=True,
impl='fast',
)
elif args.multihead_attn_impl == 'fast' :
self.self_attn = SelfMultiheadAttn(
self.embed_dim, args.encoder_attention_heads,
dropout=args.attention_dropout,
bias=False,
include_norm_add=False,
impl='fast',
)
else :
self.self_attn = SelfMultiheadAttn(
self.embed_dim, args.encoder_attention_heads,
dropout=args.attention_dropout,
bias=False,
include_norm_add=False,
impl='default',
)
# in_proj_weight has shape [3 * hidden, hidden] but it should be
# initialized like a [hidden, hidden] matrix.
# sqrt(6 / (hidden + hidden)) / sqrt(6 / (3 * hidden + hidden)) = sqrt(2)
# therefore xavier_uniform gain should be set to sqrt(2).
torch.nn.init.xavier_uniform_(self.self_attn.in_proj_weight, gain=math.sqrt(2))
self.dropout = args.dropout
self.relu_dropout = args.relu_dropout
self.normalize_before = args.encoder_normalize_before
self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
if args.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd' :
self.layer_norms = nn.ModuleList([FusedLayerNorm(self.embed_dim) for i in range(1)])
else :
self.layer_norms = nn.ModuleList([FusedLayerNorm(self.embed_dim) for i in range(2)])
def forward(self, x, encoder_padding_mask):
if self.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd' :
x, _ = self.self_attn(
query=x,
key=x,
value=x,
key_padding_mask=encoder_padding_mask,
need_weights=False,
attn_mask=None,
is_training=self.training,
)
else :
residual = x
x = self.maybe_layer_norm(0, x, before=True)
x, _ = self.self_attn(
query=x,
key=x,
value=x,
key_padding_mask=encoder_padding_mask,
need_weights=False,
attn_mask=None,
is_training=self.training,
)
# TODO: Bug in PyTorch 19.07 prevents propagation of the "training" argument
if self.training :
x = jit_dropout_add(x, residual, self.dropout, self.training)
else :
x = F.dropout(x, p=self.dropout, training=self.training)
x = residual + x
x = self.maybe_layer_norm(0, x, after=True)
residual = x
if self.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd' :
x = self.maybe_layer_norm(0, x, before=True)
else :
x = self.maybe_layer_norm(1, x, before=True)
# Fuse Bias to GEMMs by making Tensors 2D
sent_len, sents, hid_dim = x.size()
x = x.view(sent_len * sents, hid_dim)
if self.training :
x = jit_relu_dropout(self.fc1(x), self.relu_dropout, self.training)
else :
x = self.fc1(x)
x = F.threshold(x, 0., 0.)
x = F.dropout(x, p=self.relu_dropout, training=False)
#x = F.threshold(self.fc1(x),0,0)
#x = F.dropout(x, p=self.relu_dropout, training=self.training)
x = self.fc2(x)
x = x.view(sent_len, sents, hid_dim)
# TODO: Bug in PyTorch 19.07 prevents propagation of the "training" argument
if self.training :
x = jit_dropout_add(x, residual, self.dropout, self.training)
else :
x = F.dropout(x, p=self.dropout, training=self.training)
x = residual + x
if self.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd' :
x = self.maybe_layer_norm(0, x, after=True)
else :
x = self.maybe_layer_norm(1, x, after=True)
return x
def maybe_layer_norm(self, i, x, before=False, after=False):
assert before ^ after
if after ^ self.normalize_before:
return self.layer_norms[i](x)
else:
return x
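# A framework-free sketch of the two residual orderings that the
# TransformerEncoderLayer docstring contrasts, written with plain callables
# instead of this layer's fused kernels. Illustration only, never called.
def _post_norm_block(x, sublayer, norm, p, training):
    # paper ordering: sublayer -> dropout -> residual add -> layernorm
    return norm(x + F.dropout(sublayer(x), p=p, training=training))
def _pre_norm_block(x, sublayer, norm, p, training):
    # tensor2tensor ordering: layernorm -> sublayer -> dropout -> residual add
    return x + F.dropout(sublayer(norm(x)), p=p, training=training)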
class TransformerDecoderLayer(nn.Module):
"""Decoder layer block."""
def __init__(self, args, no_encoder_attn=False):
super().__init__()
self.embed_dim = args.decoder_embed_dim
self.multihead_attn_impl = args.multihead_attn_impl
if args.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd' :
self.self_attn = SelfMultiheadAttn(
self.embed_dim, args.decoder_attention_heads,
dropout=args.attention_dropout,
bias=False,
include_norm_add=True,
impl='fast',
)
elif args.multihead_attn_impl == 'fast' :
self.self_attn = SelfMultiheadAttn(
self.embed_dim, args.decoder_attention_heads,
dropout=args.attention_dropout,
bias=False,
include_norm_add=False,
impl='fast',
)
else :
self.self_attn = SelfMultiheadAttn(
self.embed_dim, args.decoder_attention_heads,
dropout=args.attention_dropout,
bias=False,
include_norm_add=False,
impl='default',
)
# in_proj_weight has shape [3 * hidden, hidden] but it should be
# initialized like a [hidden, hidden] matrix.
# sqrt(6 / (hidden + hidden)) / sqrt(6 / (3 * hidden + hidden)) = sqrt(2)
# therefore xavier_uniform gain should be set to sqrt(2).
torch.nn.init.xavier_uniform_(self.self_attn.in_proj_weight, gain=math.sqrt(2))
self.dropout = args.dropout
self.relu_dropout = args.relu_dropout
self.normalize_before = args.decoder_normalize_before
if not (args.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd') :
self.self_attn_layer_norm = FusedLayerNorm(self.embed_dim)
if no_encoder_attn:
self.encoder_attn = None
if not (args.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd') :
self.encoder_attn_layer_norm = None
else:
if args.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd' :
self.encoder_attn = EncdecMultiheadAttn(
self.embed_dim, args.decoder_attention_heads,
dropout=args.attention_dropout,
bias=False,
include_norm_add=True,
impl='fast',
)
elif args.multihead_attn_impl == 'fast' :
self.encoder_attn = EncdecMultiheadAttn(
self.embed_dim, args.decoder_attention_heads,
dropout=args.attention_dropout,
bias=False,
include_norm_add=False,
impl='fast',
)
else :
self.encoder_attn = EncdecMultiheadAttn(
self.embed_dim, args.decoder_attention_heads,
dropout=args.attention_dropout,
bias=False,
include_norm_add=False,
impl='default',
)
# in_proj_weight_kv has shape [2 * hidden, hidden] but it should be
# initialized like a [hidden, hidden] matrix.
# sqrt(6 / (hidden + hidden)) / sqrt(6 / (2 * hidden + hidden)) = sqrt(1.5)
# therefore xavier_uniform gain should be set to sqrt(1.5).
torch.nn.init.xavier_uniform_(self.encoder_attn.in_proj_weight_kv, gain=math.sqrt(1.5))
if not (args.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd') :
self.encoder_attn_layer_norm = FusedLayerNorm(self.embed_dim)
self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
self.final_layer_norm = FusedLayerNorm(self.embed_dim)
self.need_attn = True
def forward(self, x, encoder_out, time_padding_mask, encoder_padding_mask, incremental_state):
if self.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd' :
x, _ = self.self_attn(
query=x,
key=x,
value=x,
key_padding_mask=None,
need_weights=False,
attn_mask=time_padding_mask,
is_training=self.training,
)
else :
residual = x
x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True)
x, _ = self.self_attn(
query=x,
key=x,
value=x,
key_padding_mask=None,
need_weights=False,
attn_mask=time_padding_mask,
is_training=self.training,
)
# TODO: Bug in PyTorch 19.07 prevents propagation of the "training" argument
if self.training :
x = jit_dropout_add(x, residual, self.dropout, self.training)
else :
x = F.dropout(x, p=self.dropout, training=self.training)
x = residual + x
x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True)
attn = None
if self.encoder_attn is not None:
if self.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd' :
x, attn = self.encoder_attn(
query=x,
key=encoder_out,
value=encoder_out,
key_padding_mask=encoder_padding_mask,
need_weights=False,
attn_mask=None,
is_training=self.training,
)
else :
residual = x
x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, before=True)
x, attn = self.encoder_attn(
query=x,
key=encoder_out,
value=encoder_out,
key_padding_mask=encoder_padding_mask,
need_weights=False,
attn_mask=None,
is_training=self.training,
)
# TODO: Bug in PyTorch 19.07 prevents propagation of the "training" argument
if self.training :
x = jit_dropout_add(x, residual, self.dropout, self.training)
else :
x = F.dropout(x, p=self.dropout, training=self.training)
x = residual + x
x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, after=True)
residual = x
x = self.maybe_layer_norm(self.final_layer_norm, x, before=True)
# Fuse Bias to GEMMs by making Tensors 2D
sent_len, sents, hid_dim = x.size()
x = x.view(sent_len * sents, hid_dim)
if self.training :
x = jit_relu_dropout(self.fc1(x), self.relu_dropout, self.training)
else :
x = self.fc1(x)
x = F.threshold(x, 0., 0.)
x = F.dropout(x, p=self.relu_dropout, training=False)
#x = F.threshold(self.fc1(x),0,0)
#x = F.dropout(x, p=self.relu_dropout, training=self.training)
x = self.fc2(x)
x = x.view(sent_len, sents, hid_dim)
# TODO: Bug in PyTorch 19.07 prevents propagation of the "training" argument
if self.training :
x = jit_dropout_add(x, residual, self.dropout, self.training)
else :
x = F.dropout(x, p=self.dropout, training=self.training)
x = residual + x
x = self.maybe_layer_norm(self.final_layer_norm, x, after=True)
return x, attn
def maybe_layer_norm(self, layer_norm, x, before=False, after=False):
assert before ^ after
if after ^ self.normalize_before:
return layer_norm(x)
else:
return x
def make_generation_fast_(self, need_attn=False, **kwargs):
self.need_attn = need_attn
def Embedding(num_embeddings, embedding_dim, padding_idx):
m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
nn.init.constant_(m.weight[padding_idx], 0)
return m
def LayerNorm(embedding_dim):
m = FusedLayerNorm(embedding_dim)
return m
def Linear(in_features, out_features, bias=True):
m = nn.Linear(in_features, out_features, bias)
nn.init.xavier_uniform_(m.weight)
nn.init.constant_(m.bias, 0.)
return m
def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx, left_pad, learned=False):
if learned:
m = LearnedPositionalEmbedding(num_embeddings + padding_idx + 1, embedding_dim, padding_idx, left_pad)
nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
nn.init.constant_(m.weight[padding_idx], 0)
else:
m = SinusoidalPositionalEmbedding(embedding_dim, padding_idx, left_pad, num_embeddings + padding_idx + 1)
return m
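# Note on the sizing above: the "num_embeddings + padding_idx + 1" reserves
# rows so that real positions can be numbered starting at padding_idx + 1,
# while index padding_idx stays a dedicated (zeroed) pad slot. With
# padding_idx=1, token positions map to embedding rows 2, 3, 4, ... and pads
# map to row 1.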
@register_model_architecture('transformer_lm', 'transformer_lm')
def base_lm_architecture(args):
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 2048)
args.decoder_layers = getattr(args, 'decoder_layers', 6)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False)
# The model training is not stable without this
args.decoder_normalize_before = True
@register_model_architecture('transformer_lm', 'transformer_lm_big')
def transformer_lm_big(args):
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
base_lm_architecture(args)
@register_model_architecture('transformer_lm', 'transformer_lm_wiki103')
def transformer_lm_wiki103(args):
args.dropout = getattr(args, 'dropout', 0.3)
base_lm_architecture(args)
@register_model_architecture('transformer_lm', 'transformer_lm_gbw')
def transformer_lm_gbw(args):
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.dropout = getattr(args, 'dropout', 0.1)
args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
transformer_lm_big(args)
@register_model_architecture('transformer', 'transformer')
def base_architecture(args):
args.encoder_embed_path = getattr(args, 'encoder_embed_path', None)
args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512)
args.encoder_ffn_embed_dim = getattr(args, 'encoder_ffn_embed_dim', 2048)
args.encoder_layers = getattr(args, 'encoder_layers', 6)
args.encoder_attention_heads = getattr(args, 'encoder_attention_heads', 8)
args.encoder_normalize_before = getattr(args, 'encoder_normalize_before', False)
args.encoder_learned_pos = getattr(args, 'encoder_learned_pos', False)
args.decoder_embed_path = getattr(args, 'decoder_embed_path', None)
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', args.encoder_embed_dim)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', args.encoder_ffn_embed_dim)
args.decoder_layers = getattr(args, 'decoder_layers', 6)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
args.decoder_normalize_before = getattr(args, 'decoder_normalize_before', False)
args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False)
args.attention_dropout = getattr(args, 'attention_dropout', 0.)
args.relu_dropout = getattr(args, 'relu_dropout', 0.)
args.dropout = getattr(args, 'dropout', 0.1)
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
args.share_decoder_input_output_embed = getattr(args, 'share_decoder_input_output_embed', False)
args.share_all_embeddings = getattr(args, 'share_all_embeddings', False)
args.no_token_positional_embeddings = getattr(args, 'no_token_positional_embeddings', False)
@register_model_architecture('transformer', 'transformer_iwslt_de_en')
def transformer_iwslt_de_en(args):
args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512)
args.encoder_ffn_embed_dim = getattr(args, 'encoder_ffn_embed_dim', 1024)
args.encoder_attention_heads = getattr(args, 'encoder_attention_heads', 4)
args.encoder_layers = getattr(args, 'encoder_layers', 6)
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 1024)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 4)
args.decoder_layers = getattr(args, 'decoder_layers', 6)
base_architecture(args)
@register_model_architecture('transformer', 'transformer_wmt_en_de')
def transformer_wmt_en_de(args):
base_architecture(args)
# parameters used in the "Attention Is All You Need" paper (Vaswani, et al, 2017)
@register_model_architecture('transformer', 'transformer_vaswani_wmt_en_de_big')
def transformer_vaswani_wmt_en_de_big(args):
args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 1024)
args.encoder_ffn_embed_dim = getattr(args, 'encoder_ffn_embed_dim', 4096)
args.encoder_attention_heads = getattr(args, 'encoder_attention_heads', 16)
args.encoder_normalize_before = getattr(args, 'encoder_normalize_before', False)
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
args.dropout = getattr(args, 'dropout', 0.3)
base_architecture(args)
@register_model_architecture('transformer', 'transformer_vaswani_wmt_en_fr_big')
def transformer_vaswani_wmt_en_fr_big(args):
args.dropout = getattr(args, 'dropout', 0.1)
transformer_vaswani_wmt_en_de_big(args)
@register_model_architecture('transformer', 'transformer_wmt_en_de_big')
def transformer_wmt_en_de_big(args):
args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
transformer_vaswani_wmt_en_de_big(args)
# default parameters used in tensor2tensor implementation
@register_model_architecture('transformer', 'transformer_wmt_en_de_big_t2t')
def transformer_wmt_en_de_big_t2t(args):
args.encoder_normalize_before = getattr(args, 'encoder_normalize_before', True)
args.decoder_normalize_before = getattr(args, 'decoder_normalize_before', True)
args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
args.relu_dropout = getattr(args, 'relu_dropout', 0.1)
transformer_vaswani_wmt_en_de_big(args)
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
from .adaptive_softmax import AdaptiveSoftmax
from .beamable_mm import BeamableMM
from .conv_tbc import ConvTBC
from .downsampled_multihead_attention import DownsampledMultiHeadAttention
from .grad_multiply import GradMultiply
from .learned_positional_embedding import LearnedPositionalEmbedding
from .linearized_convolution import LinearizedConvolution
from .scalar_bias import ScalarBias
from .sinusoidal_positional_embedding import SinusoidalPositionalEmbedding
__all__ = [
'AdaptiveSoftmax',
'BeamableMM',
'ConvTBC',
'DownsampledMultiHeadAttention',
'GradMultiply',
'LearnedPositionalEmbedding',
'LinearizedConvolution',
'ScalarBias',
'SinusoidalPositionalEmbedding',
]