Commit e734b0fa authored by Sergey Edunov

Initial commit
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
from .fconv import *
__all__ = [
'fconv', 'fconv_iwslt_de_en', 'fconv_wmt_en_ro', 'fconv_wmt_en_de',
'fconv_wmt_en_fr',
]
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from fairseq.modules import BeamableMM, LinearizedConvolution
class FConvModel(nn.Module):
def __init__(self, encoder, decoder, padding_idx=1):
super(FConvModel, self).__init__()
self.encoder = encoder
self.decoder = decoder
self.encoder.num_attention_layers = sum([layer is not None for layer in decoder.attention])
self.padding_idx = padding_idx
self._is_generation_fast = False
def forward(self, src_tokens, src_positions, input_tokens, input_positions):
encoder_out = self.encoder(src_tokens, src_positions)
decoder_out = self.decoder(input_tokens, input_positions, encoder_out)
return decoder_out.view(-1, decoder_out.size(-1))
def make_generation_fast_(self, beam_size, use_beamable_mm=False):
"""Optimize model for faster generation.
Optimizations include:
- remove WeightNorm
- (optionally) use BeamableMM in attention layers
The optimized model should not be used again for training.
Note: this can be combined with incremental inference in the Decoder for
even faster generation.
"""
if self._is_generation_fast:
return # only apply once
self._is_generation_fast = True
# remove weight norm from all modules in the network
def remove_weight_norm(m):
try:
nn.utils.remove_weight_norm(m)
except ValueError: # this module didn't have weight norm
return
self.apply(remove_weight_norm)
# use BeamableMM in attention layers
if use_beamable_mm:
self.decoder._use_beamable_mm(beam_size)
        def train(mode=True):
if mode:
raise RuntimeError('cannot train after make_generation_fast')
# this model should no longer be used for training
self.eval()
self.train = train
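
# --- Added usage sketch (exposition only, not part of the original file) ---
# Assuming `model` is a trained FConvModel, the optimized generation mode
# described in make_generation_fast_ would typically be enabled once before
# beam search:
#
#     model.make_generation_fast_(beam_size=5, use_beamable_mm=True)
#     # weight norm is removed, attention uses BeamableMM instead of torch.bmm,
#     # and any later call to model.train(True) raises RuntimeError
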
class Encoder(nn.Module):
"""Convolutional encoder"""
def __init__(self, num_embeddings, embed_dim=512, max_positions=1024,
convolutions=((512, 3),) * 20, dropout=0.1, padding_idx=1):
super(Encoder, self).__init__()
self.dropout = dropout
self.num_attention_layers = None
self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
self.embed_positions = Embedding(max_positions, embed_dim, padding_idx)
in_channels = convolutions[0][0]
self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
self.projections = nn.ModuleList()
self.convolutions = nn.ModuleList()
for (out_channels, kernel_size) in convolutions:
pad = (kernel_size - 1) // 2
self.projections.append(Linear(in_channels, out_channels)
if in_channels != out_channels else None)
self.convolutions.append(
ConvTBC(in_channels, out_channels * 2, kernel_size, padding=pad,
dropout=dropout))
in_channels = out_channels
self.fc2 = Linear(in_channels, embed_dim)
def forward(self, tokens, positions):
# embed tokens and positions
x = self.embed_tokens(tokens) + self.embed_positions(positions)
x = F.dropout(x, p=self.dropout, training=self.training)
input_embedding = x
# project to size of convolution
x = self.fc1(x)
# B x T x C -> T x B x C
x = x.transpose(0, 1)
# temporal convolutions
for proj, conv in zip(self.projections, self.convolutions):
residual = x if proj is None else proj(x)
x = F.dropout(x, p=self.dropout, training=self.training)
x = conv(x)
x = F.glu(x, dim=-1)
x = (x + residual) * math.sqrt(0.5)
# T x B x C -> B x T x C
x = x.transpose(1, 0)
# project back to size of embedding
x = self.fc2(x)
# scale gradients (this only affects backward, not forward)
x = grad_multiply(x, 1.0 / (2.0 * self.num_attention_layers))
# add output to input embedding for attention
y = (x + input_embedding) * math.sqrt(0.5)
return x, y
class AttentionLayer(nn.Module):
def __init__(self, conv_channels, embed_dim, bmm=None):
super(AttentionLayer, self).__init__()
# projects from output of convolution to embedding dimension
self.in_projection = Linear(conv_channels, embed_dim)
# projects from embedding dimension to convolution size
self.out_projection = Linear(embed_dim, conv_channels)
self.bmm = bmm if bmm is not None else torch.bmm
def forward(self, x, target_embedding, encoder_out):
residual = x
# attention
x = (self.in_projection(x) + target_embedding) * math.sqrt(0.5)
x = self.bmm(x, encoder_out[0])
# softmax over last dim
sz = x.size()
x = F.softmax(x.view(sz[0] * sz[1], sz[2]))
x = x.view(sz)
attn_scores = x
x = self.bmm(x, encoder_out[1])
# scale attention output
s = encoder_out[1].size(1)
x = x * (s * math.sqrt(1.0 / s))
# project back
x = (self.out_projection(x) + residual) * math.sqrt(0.5)
return x, attn_scores
class Decoder(nn.Module):
"""Convolutional decoder"""
def __init__(self, num_embeddings, embed_dim=512, out_embed_dim=256,
max_positions=1024, convolutions=((512, 3),) * 20,
attention=True, dropout=0.1, padding_idx=1):
super(Decoder, self).__init__()
self.dropout = dropout
in_channels = convolutions[0][0]
if isinstance(attention, bool):
# expand True into [True, True, ...] and do the same with False
attention = [attention] * len(convolutions)
self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
self.embed_positions = Embedding(max_positions, embed_dim, padding_idx)
self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
self.projections = nn.ModuleList()
self.convolutions = nn.ModuleList()
self.attention = nn.ModuleList()
for i, (out_channels, kernel_size) in enumerate(convolutions):
pad = kernel_size - 1
self.projections.append(Linear(in_channels, out_channels)
if in_channels != out_channels else None)
self.convolutions.append(
LinearizedConv1d(in_channels, out_channels * 2, kernel_size,
padding=pad, dropout=dropout))
self.attention.append(AttentionLayer(out_channels, embed_dim)
if attention[i] else None)
in_channels = out_channels
self.fc2 = Linear(in_channels, out_embed_dim)
self.fc3 = Linear(out_embed_dim, num_embeddings, dropout=dropout)
self._is_inference_incremental = False
def forward(self, tokens, positions, encoder_out):
# embed tokens and positions
x = self.embed_tokens(tokens) + self.embed_positions(positions)
x = F.dropout(x, p=self.dropout, training=self.training)
target_embedding = x
# project to size of convolution
x = self.fc1(x)
# transpose only once to speed up attention layers
encoder_a, encoder_b = encoder_out
encoder_a = encoder_a.transpose(1, 2).contiguous()
# B x T x C -> T x B x C
x = x.transpose(0, 1)
# temporal convolutions
for proj, conv, attention in zip(self.projections, self.convolutions, self.attention):
residual = x if proj is None else proj(x)
x = F.dropout(x, p=self.dropout, training=self.training)
x = conv(x)
x = conv.remove_future_timesteps(x)
x = F.glu(x)
# attention
if attention is not None:
x = x.transpose(1, 0)
x, _ = attention(x, target_embedding, (encoder_a, encoder_b))
x = x.transpose(1, 0)
# residual
x = (x + residual) * math.sqrt(0.5)
# T x B x C -> B x T x C
x = x.transpose(1, 0)
# project back to size of vocabulary
x = self.fc2(x)
x = F.dropout(x, p=self.dropout, training=self.training)
x = self.fc3(x)
return x
def context_size(self):
"""Maximum number of input elements each output element depends on"""
context = 1
for conv in self.convolutions:
context += conv.kernel_size[0] - 1
return context
def incremental_inference(self):
"""Context manager for incremental inference.
This provides an optimized forward pass for incremental inference
(i.e., it predicts one time step at a time). If the input order changes
between time steps, call model.decoder.reorder_incremental_state to
update the relevant buffers. To generate a fresh sequence, first call
model.decoder.clear_incremental_state.
Usage:
```
with model.decoder.incremental_inference():
for step in range(maxlen):
out = model.decoder(tokens[:, :step], positions[:, :step],
encoder_out)
probs = F.log_softmax(out[:, -1, :])
```
"""
class IncrementalInference(object):
def __init__(self, decoder):
self.decoder = decoder
def __enter__(self):
self.decoder._start_incremental_inference()
def __exit__(self, *args):
self.decoder._stop_incremental_inference()
return IncrementalInference(self)
def _start_incremental_inference(self):
assert not self._is_inference_incremental, \
'already performing incremental inference'
self._is_inference_incremental = True
# save original forward and convolution layers
self._orig_forward = self.forward
self._orig_conv = self.convolutions
# switch to incremental forward
self.forward = self._incremental_forward
# start a fresh sequence
self.clear_incremental_state()
def _stop_incremental_inference(self):
# restore original forward and convolution layers
self.forward = self._orig_forward
self.convolutions = self._orig_conv
self._is_inference_incremental = False
def _incremental_forward(self, tokens, positions, encoder_out):
assert self._is_inference_incremental
# setup initial state
if self.prev_state is None:
# transpose encoder output once to speed up attention layers
encoder_a, encoder_b = encoder_out
encoder_a = encoder_a.transpose(1, 2).contiguous()
self.prev_state = {
'encoder_out': (encoder_a, encoder_b),
}
# load previous state
encoder_a, encoder_b = self.prev_state['encoder_out']
# keep only the last token for incremental forward pass
tokens = tokens[:, -1:]
positions = positions[:, -1:]
# embed tokens and positions
x = self.embed_tokens(tokens) + self.embed_positions(positions)
target_embedding = x
# project to size of convolution
x = self.fc1(x)
# temporal convolutions
avg_attn_scores = None
num_attn_layers = len(self.attention)
for proj, conv, attention in zip(self.projections, self.convolutions, self.attention):
residual = x if proj is None else proj(x)
x = conv.incremental_forward(x)
x = F.glu(x)
# attention
if attention is not None:
x, attn_scores = attention(x, target_embedding, (encoder_a, encoder_b))
attn_scores = attn_scores / num_attn_layers
if avg_attn_scores is None:
avg_attn_scores = attn_scores
else:
avg_attn_scores += attn_scores
# residual
x = (x + residual) * math.sqrt(0.5)
# project back to size of vocabulary
x = self.fc2(x)
x = self.fc3(x)
return x, avg_attn_scores
def clear_incremental_state(self):
"""Clear all state used for incremental generation.
**For incremental inference only**
This should be called before generating a fresh sequence.
"""
if self._is_inference_incremental:
self.prev_state = None
for conv in self.convolutions:
conv.clear_buffer()
def reorder_incremental_state(self, new_order):
"""Reorder buffered internal state (for incremental generation).
**For incremental inference only**
This should be called when the order of the input has changed from the
previous time step. A typical use case is beam search, where the input
order changes between time steps based on the choice of beams.
"""
if self._is_inference_incremental:
for conv in self.convolutions:
conv.reorder_buffer(new_order)
def _use_beamable_mm(self, beam_size):
"""Replace torch.bmm with BeamableMM in attention layers."""
beamable_mm = BeamableMM(beam_size)
for attn in self.attention:
attn.bmm = beamable_mm
def Embedding(num_embeddings, embedding_dim, padding_idx):
m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
m.weight.data.normal_(0, 0.1)
return m
def Linear(in_features, out_features, dropout=0):
"""Weight-normalized Linear layer (input: N x T x C)"""
m = nn.Linear(in_features, out_features)
m.weight.data.normal_(mean=0, std=math.sqrt((1 - dropout) / in_features))
m.bias.data.zero_()
return nn.utils.weight_norm(m)
def LinearizedConv1d(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
"""Weight-normalized Conv1d layer optimized for decoding"""
m = LinearizedConvolution(in_channels, out_channels, kernel_size, **kwargs)
std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
m.weight.data.normal_(mean=0, std=std)
m.bias.data.zero_()
return nn.utils.weight_norm(m)
def ConvTBC(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
"""Weight-normalized Conv1d layer"""
from fairseq.modules import ConvTBC
m = ConvTBC(in_channels, out_channels, kernel_size, **kwargs)
std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
m.weight.data.normal_(mean=0, std=std)
m.bias.data.zero_()
return nn.utils.weight_norm(m, dim=2)
def grad_multiply(x, scale):
return GradMultiply.apply(x, scale)
class GradMultiply(torch.autograd.Function):
@staticmethod
def forward(ctx, x, scale):
ctx.scale = scale
res = x.new(x)
ctx.mark_shared_storage((x, res))
return res
@staticmethod
def backward(ctx, grad):
return grad * ctx.scale, None
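
# Added example (illustrative sketch, not part of the original file). It checks
# the behaviour the Encoder relies on: grad_multiply passes values through
# unchanged in the forward pass and scales gradients in the backward pass.
# Uses the Variable API targeted by this code base.
def _example_grad_multiply():
    from torch.autograd import Variable
    x = Variable(torch.ones(2, 3), requires_grad=True)
    y = grad_multiply(x, 0.5)
    y.sum().backward()
    # forward output is identical to the input
    assert (y.data - x.data).abs().max() == 0
    # the gradient of sum() is 1 everywhere, scaled down to 0.5
    assert (x.grad.data - 0.5).abs().max() < 1e-6
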
def fconv_iwslt_de_en(dataset, dropout, **kwargs):
encoder_convs = [(256, 3)] * 4
decoder_convs = [(256, 3)] * 3
return fconv(dataset, dropout, 256, encoder_convs, 256, decoder_convs, **kwargs)
def fconv_wmt_en_ro(dataset, dropout, **kwargs):
convs = [(512, 3)] * 20
return fconv(dataset, dropout, 512, convs, 512, convs, **kwargs)
def fconv_wmt_en_de(dataset, dropout, **kwargs):
    convs = [(512, 3)] * 9    # first 9 layers have 512 units
    convs += [(1024, 3)] * 4  # next 4 layers have 1024 units
    convs += [(2048, 1)] * 2  # final 2 layers have 2048 units and kernel width 1
return fconv(dataset, dropout, 768, convs, 768, convs,
decoder_out_embed_dim=512,
**kwargs)
def fconv_wmt_en_fr(dataset, dropout, **kwargs):
    convs = [(512, 3)] * 6    # first 6 layers have 512 units
    convs += [(768, 3)] * 4   # next 4 layers have 768 units
    convs += [(1024, 3)] * 3  # next 3 layers have 1024 units
    convs += [(2048, 1)] * 1  # one layer with 2048 units and kernel width 1
    convs += [(4096, 1)] * 1  # final layer has 4096 units and kernel width 1
return fconv(dataset, dropout, 768, convs, 768, convs,
decoder_out_embed_dim=512,
**kwargs)
def fconv(dataset, dropout, encoder_embed_dim, encoder_convolutions,
decoder_embed_dim, decoder_convolutions, attention=True,
decoder_out_embed_dim=256, max_positions=1024):
padding_idx = dataset.dst_dict.pad()
encoder = Encoder(
len(dataset.src_dict),
embed_dim=encoder_embed_dim,
convolutions=encoder_convolutions,
dropout=dropout,
padding_idx=padding_idx,
max_positions=max_positions)
decoder = Decoder(
len(dataset.dst_dict),
embed_dim=decoder_embed_dim,
convolutions=decoder_convolutions,
out_embed_dim=decoder_out_embed_dim,
attention=attention,
dropout=dropout,
padding_idx=padding_idx,
max_positions=max_positions)
return FConvModel(encoder, decoder, padding_idx)
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
from .beamable_mm import *
from .linearized_convolution import *
from .conv_tbc import ConvTBC
__all__ = [
'BeamableMM', 'LinearizedConvolution', 'ConvTBC',
]
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import torch
import torch.nn as nn
class BeamableMM(nn.Module):
"""This module provides an optimized MM for beam decoding with attention.
It leverage the fact that the source-side of the input is replicated beam
times and the target-side of the input is of width one. This layer speeds up
inference by replacing the inputs {(bsz x 1 x nhu), (bsz x sz2 x nhu)}
with smaller inputs {(bsz/beam x beam x nhu), (bsz/beam x sz2 x nhu)}.
"""
def __init__(self, beam_size):
super(BeamableMM, self).__init__()
self.beam_size = beam_size
def forward(self, input1, input2):
if (
not self.training and # test mode
self.beam_size > 0 and # beam size is set
input1.dim() == 3 and # only support batched input
input1.size(1) == 1 # single time step update
):
bsz, beam = input1.size(0), self.beam_size
# bsz x 1 x nhu --> bsz/beam x beam x nhu
input1 = input1[:, 0, :].unfold(0, beam, beam).transpose(2, 1)
# bsz x sz2 x nhu --> bsz/beam x sz2 x nhu
input2 = input2.unfold(0, beam, beam)[:, :, :, 0]
# use non batched operation if bsz = beam
if input1.size(0) == 1:
output = torch.mm(input1[0, :, :], input2[0, :, :])
else:
output = input1.bmm(input2)
return output.view(bsz, 1, -1)
else:
return input1.bmm(input2)
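
# Added example (illustrative sketch, not part of the original file). It mirrors
# how the sequence generator expands encoder outputs: the source-side input is
# replicated beam_size times along the batch dimension, so BeamableMM in eval
# mode must match a plain batched matmul.
def _example_beamable_mm():
    beam, sents, nhu, src_len = 5, 2, 8, 7
    bsz = sents * beam
    input1 = torch.randn(bsz, 1, nhu)  # one decoder step per hypothesis
    # beam identical copies of each sentence, laid out consecutively
    input2 = torch.randn(sents, nhu, src_len).repeat(1, beam, 1).view(bsz, nhu, src_len)
    mm = BeamableMM(beam)
    mm.eval()  # the optimized path is only taken at test time
    out = mm(input1, input2)
    ref = input1.bmm(input2)
    assert (out - ref).abs().max() < 1e-5
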
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import torch
from torch.autograd import Variable, Function
from torch.nn.modules.utils import _single
try:
from fairseq import temporal_convolution_tbc
except ImportError as e:
import sys
sys.stderr.write('ERROR: missing temporal_convolution_tbc, run `python setup.py install`\n')
raise e
class ConvTBC(torch.nn.Module):
"""1D convolution over an input of shape (time x batch x channel)
The implementation uses gemm to perform the convolution. This implementation
is faster than cuDNN for small kernel sizes.
"""
def __init__(self, in_channels, out_channels, kernel_size, stride=1,
padding=0):
super(ConvTBC, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = _single(kernel_size)
self.stride = _single(stride)
self.padding = _single(padding)
assert self.stride == (1,)
self.weight = torch.nn.Parameter(torch.Tensor(
self.kernel_size[0], in_channels, out_channels))
self.bias = torch.nn.Parameter(torch.Tensor(out_channels))
def forward(self, input):
return ConvTBCFunction.apply(
input.contiguous(), self.weight, self.bias, self.padding[0])
def __repr__(self):
s = ('{name}({in_channels}, {out_channels}, kernel_size={kernel_size}'
', padding={padding}')
if self.bias is None:
s += ', bias=False'
s += ')'
return s.format(name=self.__class__.__name__, **self.__dict__)
class ConvTBCFunction(Function):
@staticmethod
def forward(ctx, input, weight, bias, pad):
input_size = input.size()
weight_size = weight.size()
kernel_size = weight_size[0]
output = input.new(
input_size[0] - kernel_size + 1 + pad * 2,
input_size[1],
weight_size[2])
ctx.input_size = input_size
ctx.weight_size = weight_size
ctx.save_for_backward(input, weight)
temporal_convolution_tbc.TemporalConvolutionTBC_forward(
input.type().encode('utf-8'),
input,
output,
weight,
bias)
return output
@staticmethod
def backward(ctx, grad_output):
input, weight = ctx.saved_tensors
grad_output = grad_output.data.contiguous()
grad_input = grad_output.new(ctx.input_size).zero_()
grad_weight = grad_output.new(ctx.weight_size).zero_()
grad_bias = grad_output.new(ctx.weight_size[2])
temporal_convolution_tbc.TemporalConvolutionTBC_backward(
input.type().encode('utf-8'),
grad_output,
grad_input,
grad_weight,
grad_bias,
input,
weight)
grad_input = Variable(grad_input, volatile=True)
grad_weight = Variable(grad_weight, volatile=True)
grad_bias = Variable(grad_bias, volatile=True)
return grad_input, grad_weight, grad_bias, None
def conv_tbc(input, weight, bias=None, stride=1, padding=0):
    return ConvTBCFunction.apply(
        input.contiguous(), weight, bias, _single(padding)[0])
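
# Added example (illustrative sketch, not part of the original file; it assumes
# the compiled temporal_convolution_tbc extension is installed). ConvTBC
# consumes input laid out as (time, batch, channel), unlike nn.Conv1d, which
# expects (batch, channel, time).
def _example_conv_tbc():
    conv = ConvTBC(4, 6, kernel_size=3, padding=1)
    conv.weight.data.normal_(0, 0.1)
    conv.bias.data.zero_()
    x = Variable(torch.randn(10, 2, 4))  # T x B x C_in
    y = conv(x)                          # T x B x C_out
    # with padding = (kernel_size - 1) // 2 the time dimension is preserved
    assert y.size() == (10, 2, 6)
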
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import torch
import torch.nn.functional as F
from .conv_tbc import ConvTBC
class LinearizedConvolution(ConvTBC):
"""An optimized version of nn.Conv1d.
This module replaces convolutions with linear layers as appropriate
and supports optimizations for incremental inference.
"""
def __init__(self, in_channels, out_channels, kernel_size, **kwargs):
super().__init__(in_channels, out_channels, kernel_size, **kwargs)
self.clear_buffer()
self._linearized_weight = None
self.register_backward_hook(self._clear_linearized_weight)
def remove_future_timesteps(self, x):
"""Remove future time steps created by padding."""
if self.kernel_size[0] > 1 and self.padding[0] > 0:
x = x[:-self.padding[0], :, :]
return x
def incremental_forward(self, input):
"""Forward convolution one time step at a time.
This function maintains an internal state to buffer signal and
accepts a single frame as input. If the input order changes
between time steps, call reorder_buffer. To apply to fresh
inputs, call clear_buffer.
"""
if self.training:
raise RuntimeError('LinearizedConvolution only supports inference')
# run forward pre hooks (e.g., weight norm)
for hook in self._forward_pre_hooks.values():
hook(self, input)
# reshape weight
weight = self._get_linearized_weight()
kw = self.kernel_size[0]
bsz = input.size(0) # input: bsz x len x dim
if kw > 1:
input = input.data
if self.input_buffer is None:
self.input_buffer = input.new(bsz, kw, input.size(2))
self.input_buffer.zero_()
else:
# shift buffer
self.input_buffer[:, :-1, :] = self.input_buffer[:, 1:, :].clone()
# append next input
self.input_buffer[:, -1, :] = input[:, -1, :]
input = torch.autograd.Variable(self.input_buffer, volatile=True)
output = F.linear(input.view(bsz, -1), weight, self.bias)
return output.view(bsz, 1, -1)
def clear_buffer(self):
self.input_buffer = None
def reorder_buffer(self, new_order):
if self.input_buffer is not None:
self.input_buffer = self.input_buffer.index_select(0, new_order)
def _get_linearized_weight(self):
if self._linearized_weight is None:
kw = self.kernel_size[0]
weight = self.weight.transpose(2, 1).transpose(1, 0).contiguous()
assert weight.size() == (self.out_channels, kw, self.in_channels)
self._linearized_weight = weight.view(self.out_channels, -1)
return self._linearized_weight
def _clear_linearized_weight(self, *args):
self._linearized_weight = None
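
# Added example (illustrative sketch, not part of the original file; like the
# rest of this module it needs the compiled ConvTBC extension importable).
# incremental_forward buffers the last kernel_size input frames and applies a
# single F.linear with the flattened kernel, so each new frame costs one matrix
# multiply instead of re-running the convolution over the whole prefix.
def _example_linearized_convolution():
    conv = LinearizedConvolution(4, 6, kernel_size=3, padding=2)
    conv.weight.data.normal_(0, 0.1)
    conv.bias.data.zero_()
    conv.eval()  # incremental_forward is inference-only
    for step in range(5):
        frame = torch.autograd.Variable(torch.randn(2, 1, 4), volatile=True)
        out = conv.incremental_forward(frame)  # bsz x 1 x out_channels
        assert out.size() == (2, 1, 6)
    conv.clear_buffer()  # call before starting a fresh sequence
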
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import os
import signal
import threading
from torch import multiprocessing
class MultiprocessingEventLoop(object):
"""Start a multiprocessing event loop."""
def __init__(self, device_ids=None, multiprocessing_method='spawn'):
super().__init__()
self.device_ids = tuple(device_ids)
self.num_replicas = len(device_ids)
self.rank = None
self._mp = multiprocessing.get_context(multiprocessing_method)
self._start_error_handler()
self._start_multiprocessing()
def call_async(self, rank, action, **kwargs):
"""Asynchronously call a function in each child process.
Call a function named `action` on the rank'th process and return
a Future with the result.
"""
def result_generator():
yield self.return_pipes[rank].recv()
assert not self.return_pipes[rank].poll(), \
'return pipe must be consumed before calling another function'
self.input_pipes[rank].send((action, kwargs))
return Future(result_generator())
def stop(self, interrupt_children=False):
"""Stop multiprocessing."""
for rank in range(self.num_replicas):
self.input_pipes[rank].close()
self.return_pipes[rank].close()
if interrupt_children:
# send KeyboardInterrupt to children
os.kill(self.procs[rank].pid, signal.SIGINT)
else:
self.procs[rank].join()
self.error_queue.put((None, None)) # poison pill
def _start_error_handler(self):
"""Error handler to catch exceptions in child processes."""
# create a thread to listen for errors in the child processes
self.error_queue = self._mp.SimpleQueue()
error_thread = threading.Thread(target=self._error_listener,
daemon=True)
error_thread.start()
# create signal handler that executes in the main process/thread and
# handles errors from child processes
signal.signal(signal.SIGUSR1, self._signal_handler)
def _error_listener(self):
"""A thread that listens for errors in the child processes.
Errors are handled in a signal handler in the main thread.
"""
(rank, original_trace) = self.error_queue.get()
if rank is None: # poison pill, return
return
# requeue error and switch to main thread for handling the error
self.error_queue.put((rank, original_trace))
os.kill(os.getpid(), signal.SIGUSR1)
def _signal_handler(self, signal, frame):
"""Signal handler that handles errors from child processes.
        This signal handler executes in the main process/thread.
"""
self.stop(interrupt_children=True)
(rank, original_trace) = self.error_queue.get()
msg = "\n\n-- Tracebacks above this line can probably be ignored --\n\n"
msg += original_trace
raise Exception(msg)
def _start_multiprocessing(self):
"""Create child processes to run async event loop.
Each process reads input from a Pipe, performs some computation,
and returns its output to another Pipe.
"""
# create child processes
input_pipes = []
return_pipes = []
procs = []
for rank, id in enumerate(self.device_ids):
recv_input_pipe, send_input_pipe = self._mp.Pipe(duplex=False)
recv_return_pipe, send_return_pipe = self._mp.Pipe(duplex=False)
proc = self._mp.Process(
target=self._process_event_loop,
args=(rank, id, recv_input_pipe, send_return_pipe),
daemon=True)
proc.start()
input_pipes.append(send_input_pipe)
return_pipes.append(recv_return_pipe)
procs.append(proc)
self.input_pipes = input_pipes
self.return_pipes = return_pipes
self.procs = procs
def _process_event_loop(self, rank, device_id, input_pipe, return_pipe):
"""Event loop that runs in each child process.
Event loop:
- take an action from the input pipe
- call the corresponding function in this process
- put the return value in the return pipe
Any exceptions are put in the error queue.
"""
self.rank = rank
try:
# event loop
while True:
action, kwargs = input_pipe.recv()
action_fn = getattr(self, action)
return_pipe.send(action_fn(rank, device_id, **kwargs))
except EOFError:
# input pipe was closed, do nothing
pass
except KeyboardInterrupt:
# killed by parent, do nothing
pass
except Exception:
# propagate exception from child to parent process, keeping
# original traceback
import traceback
self.error_queue.put((rank, traceback.format_exc()))
finally:
# cleanup pipes
input_pipe.close()
return_pipe.close()
class Future(object):
"""A wrapper around a Python generator, with syntactic sugar."""
def __init__(self, generator):
self.generator = generator
def gen(self):
return next(self.generator)
@staticmethod
def gen_list(gens):
return [g.gen() for g in gens]
@staticmethod
def gen_tuple_list(gens):
list = [g.gen() for g in gens]
return zip(*list)
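
# Added usage sketch (exposition only, not part of the original file). A
# subclass defines `_async_*` methods taking (rank, device_id, **kwargs); the
# parent process dispatches them with call_async and collects the results
# through Futures, e.g. (mirroring MultiprocessingTrainer):
#
#     futures = [loop.call_async(rank, '_async_train_step', sample=samples[rank])
#                for rank in range(loop.num_replicas)]
#     results = Future.gen_list(futures)  # blocks until every rank has replied
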
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import multiprocessing
import os
import pdb
import sys
class MultiprocessingPdb(pdb.Pdb):
"""A Pdb wrapper that works in a multiprocessing environment.
Usage: `from fairseq import pdb; pdb.set_trace()`
"""
_stdin_fd = sys.stdin.fileno()
_stdin = None
_stdin_lock = multiprocessing.Lock()
def __init__(self):
pdb.Pdb.__init__(self, nosigint=True)
def _cmdloop(self):
stdin_bak = sys.stdin
with self._stdin_lock:
try:
if not self._stdin:
self._stdin = os.fdopen(self._stdin_fd)
sys.stdin = self._stdin
self.cmdloop()
finally:
sys.stdin = stdin_bak
pdb = MultiprocessingPdb()
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
"""
Train a network on multiple GPUs using multiprocessing.
"""
import torch
from torch.optim.lr_scheduler import LambdaLR, ReduceLROnPlateau
from fairseq import nccl, utils
from fairseq.criterions import FairseqCriterion
from fairseq.multiprocessing_event_loop import MultiprocessingEventLoop, Future
from fairseq.nag import NAG
class MultiprocessingTrainer(MultiprocessingEventLoop):
"""Main class for multi-GPU training.
Each GPU has a full copy of the model and is assigned to its own Python
process. Gradients are accumulated with all-reduce and all model replicas
are updated synchronously after each batch.
The methods in this class are divided into synchronous functions, which
prepare and dispatch the input to each process, and asynchronous functions
(prefixed with `_async_`), which run on each process in parallel.
"""
def __init__(self, args, model, device_ids=None,
multiprocessing_method='spawn'):
if device_ids is None:
device_ids = tuple(range(torch.cuda.device_count()))
super().__init__(device_ids, multiprocessing_method)
if not torch.cuda.is_available():
raise NotImplementedError('Training on CPU is not supported')
model = model.share_memory()
nccl_uid = nccl.get_unique_id()
Future.gen_list([
self.call_async(rank, '_async_init', args=args, model=model,
nccl_uid=nccl_uid)
for rank in range(self.num_replicas)
])
def _async_init(self, rank, device_id, args, model, nccl_uid):
"""Initialize child processes."""
self.args = args
# set torch.seed in this process
torch.manual_seed(args.seed)
# set CUDA device
torch.cuda.set_device(device_id)
# initialize NCCL
nccl.initialize(self.num_replicas, nccl_uid, device_id)
# copy model to current device
self.model = model.cuda()
# initialize optimizer
self.optimizer = NAG(self.model.parameters(), lr=self.args.lr,
momentum=self.args.momentum,
weight_decay=self.args.weight_decay)
self.flat_grads = None
# initialize LR scheduler
self.lr_scheduler = self._build_lr_scheduler()
def _build_lr_scheduler(self):
if self.args.force_anneal > 0:
def anneal(e):
if e < self.args.force_anneal:
return 1
else:
return self.args.lrshrink ** (e + 1 - self.args.force_anneal)
lr_scheduler = LambdaLR(self.optimizer, anneal)
lr_scheduler.best = None
else:
# decay the LR by 0.1 every time the validation loss plateaus
lr_scheduler = ReduceLROnPlateau(self.optimizer, patience=0)
return lr_scheduler
def get_model(self):
"""Get one of the model replicas."""
# just return the first model, since all replicas are the same
return self.call_async(0, '_async_get_model').gen()
def _async_get_model(self, rank, device_id):
return self.model
def save_checkpoint(self, args, epoch, batch_offset, val_loss=None):
"""Save a checkpoint for the current model."""
self.call_async(0, '_async_save_checkpoint', args=args, epoch=epoch,
batch_offset=batch_offset, val_loss=val_loss).gen()
def _async_save_checkpoint(self, rank, device_id, args, epoch, batch_offset, val_loss):
utils.save_checkpoint(args, epoch, batch_offset, self.model,
self.optimizer, self.lr_scheduler, val_loss)
def load_checkpoint(self, filename):
"""Load a checkpoint into the model replicas in each process."""
results = Future.gen_list([
self.call_async(rank, '_async_load_checkpoint', filename=filename)
for rank in range(self.num_replicas)
])
epoch, batch_offset = results[0]
return epoch, batch_offset
def _async_load_checkpoint(self, rank, device_id, filename):
return utils.load_checkpoint(filename, self.model, self.optimizer,
self.lr_scheduler, cuda_device=device_id)
def train_step(self, samples, criterion):
"""Do forward, backward and gradient step in parallel."""
assert isinstance(criterion, FairseqCriterion)
# scatter sample across GPUs
samples, data_events = self._scatter_samples(samples)
criterion.prepare(samples)
# forward pass, backward pass and gradient step
losses = [
self.call_async(rank, '_async_train_step', sample=samples[rank],
criterion=criterion, data_event=event)
for rank, event in enumerate(data_events)
]
# aggregate losses and gradient norms
losses, grad_norms = Future.gen_tuple_list(losses)
loss = criterion.aggregate(losses)
return loss, grad_norms[0]
def _async_train_step(self, rank, device_id, sample, criterion, data_event):
data_event.wait()
self.model.train()
# zero grads even if net_input is None, since we will all-reduce them
self.optimizer.zero_grad()
# calculate loss and grads
loss = 0
if sample is not None:
net_output = self.model(**sample['net_input'])
loss_ = criterion(net_output, sample)
loss_.backward()
loss = loss_.data[0]
# flatten grads into a contiguous block of memory
if self.flat_grads is None:
self.flat_grads = self._flatten_grads_(self.model)
# all-reduce grads
nccl.all_reduce(self.flat_grads)
# clip grads
grad_norm = self._clip_grads_(self.flat_grads, self.args.clip_norm)
# take an optimization step
self.optimizer.step()
return loss, grad_norm
def _flatten_grads_(self, model):
num_params = sum(p.data.numel() for p in model.parameters())
flat_grads = next(model.parameters()).data.new(num_params)
offset = 0
for p in model.parameters():
grad = p.grad.data
numel, sz = grad.numel(), grad.size()
flat_grads[offset:offset+numel] = grad.view(-1)
grad.set_(flat_grads[offset:offset+numel])
grad.resize_(sz) # preserve original shape
offset += numel
return flat_grads
def _clip_grads_(self, flat_grads, clipv):
norm = flat_grads.norm()
if clipv > 0 and norm > clipv:
coef = max(norm, 1e-6) / clipv
flat_grads.div_(coef)
return norm
def valid_step(self, samples, criterion):
"""Do forward pass in parallel."""
# scatter sample across GPUs
samples, data_events = self._scatter_samples(samples, volatile=True)
criterion.prepare(samples)
# forward pass
losses = [
self.call_async(rank, '_async_valid_step', sample=samples[rank],
criterion=criterion, data_event=event)
for rank, event in enumerate(data_events)
]
# aggregate losses
loss = criterion.aggregate(Future.gen_list(losses))
return loss
def _async_valid_step(self, rank, device_id, sample, criterion, data_event):
if sample is None:
return 0
data_event.wait()
self.model.eval()
net_output = self.model(**sample['net_input'])
loss = criterion(net_output, sample)
return loss.data[0]
def get_lr(self):
"""Get the current learning rate."""
return self.call_async(0, '_async_get_lr').gen()
def _async_get_lr(self, rank, device_id):
return self.optimizer.param_groups[0]['lr']
def lr_step(self, val_loss=None, epoch=None):
"""Adjust the learning rate depending on the validation loss."""
lr = Future.gen_list([
self.call_async(rank, '_async_lr_step', val_loss=val_loss, epoch=epoch)
for rank in range(self.num_replicas)
])
return lr[0]
def _async_lr_step(self, rank, device_id, epoch, val_loss):
# update the learning rate
if self.args.force_anneal > 0:
self.lr_scheduler.step(epoch)
else:
self.lr_scheduler.step(val_loss, epoch)
return self.optimizer.param_groups[0]['lr']
def _scatter_samples(self, samples, volatile=False):
"""Split and distribute a sample across GPUs."""
res = [utils.prepare_sample(sample, volatile=volatile,
cuda_device=device_id)
for sample, device_id in zip(samples, self.device_ids)]
# Pad with None until its size is equal to the number of replicas.
res = res + [None]*(self.num_replicas - len(samples))
# Synchronize GPU devices after data is sent to prevent
# race conditions.
events = []
for d in self.device_ids:
with torch.cuda.device(d):
event = torch.cuda.Event(interprocess=True)
event.record()
events.append(event)
return res, events
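
# Added usage sketch (exposition only, not part of the original file). A typical
# training loop feeds the trainer one sample per GPU and lets it handle
# scattering, all-reduce and the optimizer step; the names below
# (`train_batches`, `valid_batches`, `criterion`) are placeholders:
#
#     trainer = MultiprocessingTrainer(args, model)
#     for samples in train_batches:            # one sample per GPU
#         loss, grad_norm = trainer.train_step(samples, criterion)
#     for samples in valid_batches:
#         val_loss = trainer.valid_step(samples, criterion)
#     trainer.lr_step(val_loss, epoch)
#     trainer.save_checkpoint(args, epoch, batch_offset=0, val_loss=val_loss)
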
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
from torch.optim.optimizer import Optimizer, required
class NAG(Optimizer):
def __init__(self, params, lr=required, momentum=0, weight_decay=0):
defaults = dict(lr=lr, momentum=momentum, weight_decay=weight_decay)
super(NAG, self).__init__(params, defaults)
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
weight_decay = group['weight_decay']
momentum = group['momentum']
lr = group['lr']
for p in group['params']:
if p.grad is None:
continue
d_p = p.grad.data
if weight_decay != 0:
d_p.add_(weight_decay, p.data)
param_state = self.state[p]
if 'momentum_buffer' not in param_state:
param_state['momentum_buffer'] = d_p.clone().zero_()
buf = param_state['momentum_buffer']
p.data.add_(momentum * momentum, buf)
p.data.add_(-(1 + momentum) * lr, d_p)
buf.mul_(momentum).add_(-lr, d_p)
return loss
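
# Added example (illustrative sketch, not part of the original file). With
# momentum m, buffer v and gradient g, step() applies
#     p <- p + m*m*v - (1 + m)*lr*g    followed by    v <- m*v - lr*g,
# which is one common way of writing Nesterov accelerated gradient directly on
# the parameters.
def _example_nag():
    import torch
    from torch.autograd import Variable
    p = Variable(torch.Tensor([1.0]), requires_grad=True)
    opt = NAG([p], lr=0.1, momentum=0.9)
    (p * p).sum().backward()  # gradient is 2*p = 2.0
    opt.step()
    # first step: v is 0, so p <- 1.0 - (1 + 0.9) * 0.1 * 2.0 = 0.62
    assert abs(p.data[0] - 0.62) < 1e-6
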
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
"""
A modified version of torch.cuda.nccl.all_reduce for launching kernels on each
GPU separately.
"""
import ctypes
import warnings
lib = None
_uid = None
_rank = None
_num_devices = None
_comm = None
__all__ = ['all_reduce', 'initialize', 'get_unique_id']
def _libnccl():
global lib
if lib is None:
lib = ctypes.cdll.LoadLibrary(None)
if hasattr(lib, 'ncclCommDestroy'):
lib.ncclCommDestroy.restype = None
lib.ncclGetErrorString.restype = ctypes.c_char_p
else:
lib = None
return lib
def is_available(tensors):
devices = set()
for tensor in tensors:
if not tensor.is_contiguous():
return False
if not tensor.is_cuda:
return False
device = tensor.get_device()
if device in devices:
return False
devices.add(device)
if _libnccl() is None:
warnings.warn('NCCL library not found. Check your LD_LIBRARY_PATH')
return False
return True
_communicators = {}
# ncclDataType_t
ncclChar = 0
ncclInt = 1
ncclHalf = 2
ncclFloat = 3
ncclDouble = 4
ncclInt64 = 5
ncclUint64 = 6
# ncclRedOp_t
SUM = 0
PROD = 1
MAX = 2
MIN = 3
nccl_types = {
'torch.cuda.ByteTensor': ncclChar,
'torch.cuda.CharTensor': ncclChar,
'torch.cuda.IntTensor': ncclInt,
'torch.cuda.HalfTensor': ncclHalf,
'torch.cuda.FloatTensor': ncclFloat,
'torch.cuda.DoubleTensor': ncclDouble,
'torch.cuda.LongTensor': ncclInt64,
}
class NcclError(RuntimeError):
def __init__(self, status):
self.status = status
msg = '{0} ({1})'.format(lib.ncclGetErrorString(status), status)
super(NcclError, self).__init__(msg)
class NcclComm(ctypes.c_void_p):
def __del__(self):
lib.ncclCommDestroy(self)
class NcclUniqueId(ctypes.Structure):
_fields_ = [
('internal', ctypes.c_uint8 * 128)
]
def check_error(status):
if status != 0:
raise NcclError(status)
_uids = []
def get_unique_id():
if _libnccl() is None:
raise RuntimeError('Unable to load NCCL library')
uid = NcclUniqueId()
check_error(lib.ncclGetUniqueId(ctypes.byref(uid)))
_uids.append(uid) # Don't allow UIDs to be collected
return uid
def initialize(num_devices, uid, rank):
global _num_devices, _uid, _rank
if _libnccl() is None:
raise RuntimeError('Unable to load NCCL library')
_num_devices = num_devices
if rank != 0:
_uid = NcclUniqueId.from_buffer_copy(uid)
else:
_uid = uid
_rank = rank
def communicator():
global _comm
if _uid is None:
raise RuntimeError('NCCL not initialized')
if _comm is None:
comm = ctypes.c_void_p()
check_error(lib.ncclCommInitRank(
ctypes.byref(comm),
ctypes.c_int(_num_devices),
_uid,
ctypes.c_int(_rank)))
_comm = comm
return _comm
def all_reduce(input, output=None, op=SUM, stream=None):
comm = communicator()
if output is None:
output = input
if stream is not None:
stream = stream.cuda_stream
data_type = nccl_types[input.type()]
check_error(lib.ncclAllReduce(
ctypes.c_void_p(input.data_ptr()),
ctypes.c_void_p(output.data_ptr()),
ctypes.c_size_t(input.numel()),
data_type,
op,
comm,
ctypes.c_void_p(stream)))
return output
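
# Added usage sketch (exposition only, not part of the original file), mirroring
# how multiprocessing_trainer.py drives this module:
#
#     uid = get_unique_id()               # once, in the parent process
#     initialize(num_devices, uid, rank)  # once per child process / GPU
#     all_reduce(flat_grads)              # in-place sum across all GPUs
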
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import argparse
from fairseq import models
def get_parser(desc):
parser = argparse.ArgumentParser(
description='Facebook AI Research Sequence-to-Sequence Toolkit -- ' + desc)
parser.add_argument('--no-progress-bar', action='store_true', help='disable progress bar')
parser.add_argument('--log-interval', type=int, default=1000, metavar='N',
help='log progress every N updates (when progress bar is disabled)')
parser.add_argument('--seed', default=1, type=int, metavar='N',
help='pseudo random number generator seed')
return parser
def add_dataset_args(parser):
group = parser.add_argument_group('Dataset and data loading')
group.add_argument('data', metavar='DIR',
help='path to data directory')
group.add_argument('-s', '--source-lang', default=None, metavar='SRC',
help='source language')
group.add_argument('-t', '--target-lang', default=None, metavar='TARGET',
help='target language')
group.add_argument('-j', '--workers', default=1, type=int, metavar='N',
help='number of data loading workers (default: 1)')
group.add_argument('--max-positions', default=1024, type=int, metavar='N',
help='max number of tokens in the sequence')
return group
def add_optimization_args(parser):
group = parser.add_argument_group('Optimization')
group.add_argument('--lr', '--learning-rate', default=0.25, type=float, metavar='LR',
help='initial learning rate')
group.add_argument('--min-lr', metavar='LR', default=1e-5, type=float,
help='minimum learning rate')
group.add_argument('--force-anneal', '--fa', default=0, type=int, metavar='N',
help='force annealing at specified epoch')
group.add_argument('--max-epoch', '--me', default=0, type=int, metavar='N',
help='force stop training at specified epoch')
group.add_argument('--lrshrink', default=0.1, type=float, metavar='LS',
help='learning rate shrink factor for annealing, lr_new = (lr * lrshrink)')
group.add_argument('--momentum', default=0.99, type=float, metavar='M',
help='momentum factor')
group.add_argument('--clip-norm', default=25, type=float, metavar='NORM',
help='clip threshold of gradients')
group.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
help='weight decay')
group.add_argument('--sample-without-replacement', default=0, type=int, metavar='N',
help='If bigger than 0, use that number of mini-batches for each epoch,'
                            ' where each sample is drawn randomly without replacement from the'
' dataset')
return group
def add_checkpoint_args(parser):
group = parser.add_argument_group('Checkpointing')
group.add_argument('--save-dir', metavar='DIR', default='checkpoints',
help='path to save checkpoints')
group.add_argument('--restore-file', default='checkpoint_last.pt',
help='filename in save-dir from which to load checkpoint')
group.add_argument('--save-interval', type=int, default=-1,
help='checkpoint every this many batches')
group.add_argument('--no-save', action='store_true',
help='don\'t save models and checkpoints')
group.add_argument('--no-epoch-checkpoints', action='store_true',
help='only store last and best checkpoints')
return group
def add_generation_args(parser):
group = parser.add_argument_group('Generation')
group.add_argument('--beam', default=5, type=int, metavar='N',
help='beam size')
group.add_argument('--nbest', default=1, type=int, metavar='N',
help='number of hypotheses to output')
group.add_argument('--max-len-a', default=0, type=int, metavar='N',
help=('generate sequence of maximum length ax + b, '
'where x is the source length'))
group.add_argument('--max-len-b', default=200, type=int, metavar='N',
help=('generate sequence of maximum length ax + b, '
'where x is the source length'))
group.add_argument('--remove-bpe', action='store_true',
help='remove BPE tokens before scoring')
group.add_argument('--no-early-stop', action='store_true',
help=('continue searching even after finalizing k=beam '
'hypotheses; this is more correct, but increases '
'generation time by 50%%'))
group.add_argument('--unnormalized', action='store_true',
help='compare unnormalized hypothesis scores')
group.add_argument('--cpu', action='store_true', help='generate on CPU')
group.add_argument('--no-beamable-mm', action='store_true',
help='don\'t use BeamableMM in attention layers')
group.add_argument('--lenpen', default=1, type=float,
help='length penalty: <1.0 favors shorter, >1.0 favors longer sentences')
group.add_argument('--unk-replace-dict', default='', type=str,
help='performs unk word replacement')
return group
def add_model_args(parser):
group = parser.add_argument_group('Model configuration')
group.add_argument('--arch', '-a', default='fconv', metavar='ARCH',
choices=models.__all__,
help='model architecture ({})'.format(', '.join(models.__all__)))
group.add_argument('--encoder-embed-dim', default=512, type=int, metavar='N',
help='encoder embedding dimension')
group.add_argument('--encoder-layers', default='[(512, 3)] * 20', type=str, metavar='EXPR',
help='encoder layers [(dim, kernel_size), ...]')
group.add_argument('--decoder-embed-dim', default=512, type=int, metavar='N',
help='decoder embedding dimension')
group.add_argument('--decoder-layers', default='[(512, 3)] * 20', type=str, metavar='EXPR',
help='decoder layers [(dim, kernel_size), ...]')
group.add_argument('--decoder-attention', default='True', type=str, metavar='EXPR',
help='decoder attention [True, ...]')
group.add_argument('--decoder-out-embed-dim', default=256, type=int, metavar='N',
help='decoder output embedding dimension')
group.add_argument('--dropout', default=0.1, type=float, metavar='D',
help='dropout probability')
group.add_argument('--label-smoothing', default=0, type=float, metavar='D',
help='epsilon for label smoothing, 0 means no label smoothing')
return group
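
# Added example (illustrative sketch, not part of the original file). The
# helpers above compose a single argparse parser; calling scripts pick the
# groups they need. The data directory below is just a placeholder.
def _example_parser():
    parser = get_parser('Trainer')
    add_dataset_args(parser)
    add_optimization_args(parser)
    add_checkpoint_args(parser)
    add_model_args(parser)
    args = parser.parse_args(['data-bin', '--arch', 'fconv_iwslt_de_en', '--lr', '0.25'])
    print(args.arch, args.lr, args.max_positions)
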
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
"""
Progress bar wrapper around tqdm that handles non-tty outputs.
"""
import sys
from tqdm import tqdm
class progress_bar(tqdm):
enabled = sys.stderr.isatty()
print_interval = 1000
def __new__(cls, *args, **kwargs):
if cls.enabled:
return tqdm(*args, **kwargs)
else:
return simple_progress_bar(cls.print_interval, *args, **kwargs)
class simple_progress_bar(tqdm):
def __init__(self, print_interval, *args, **kwargs):
super(simple_progress_bar, self).__init__(*args, **kwargs)
self.print_interval = print_interval
def __iter__(self):
size = len(self.iterable)
for i, obj in enumerate(self.iterable):
yield obj
if i > 0 and i % self.print_interval == 0:
msg = '{} {:5d} / {:d} {}\n'.format(self.desc, i, size, self.postfix)
sys.stdout.write(msg)
sys.stdout.flush()
@classmethod
def write(cls, s, file=None, end="\n"):
fp = file if file is not None else sys.stdout
fp.write(s)
fp.write(end)
fp.flush()
@staticmethod
def status_printer(file):
def print_status(s):
pass
return print_status
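
# Added example (illustrative sketch, not part of the original file). The
# wrapper is a drop-in replacement for tqdm: on a tty it returns a real tqdm
# bar, otherwise it falls back to simple_progress_bar, which prints a status
# line every print_interval iterations instead of redrawing a bar.
def _example_progress_bar():
    for batch in progress_bar(range(2000), desc='epoch 1'):
        pass  # train on batch
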
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
from contextlib import ExitStack
import math
import torch
import torch.nn.functional as F
from torch.autograd import Variable
from fairseq import utils
class SequenceGenerator(object):
def __init__(self, models, dst_dict, beam_size=1, minlen=1, maxlen=200,
stop_early=True, normalize_scores=True, len_penalty=1):
"""Generates translations of a given source sentence.
Args:
min/maxlen: The length of the generated output will be bounded by
minlen and maxlen (not including the end-of-sentence marker).
stop_early: Stop generation immediately after we finalize beam_size
hypotheses, even though longer hypotheses might have better
normalized scores.
normalize_scores: Normalize scores by the length of the output.
"""
self.models = models
self.dict = dst_dict
self.pad = dst_dict.pad()
self.eos = dst_dict.eos()
self.vocab_size = len(dst_dict)
self.beam_size = beam_size
self.minlen = minlen
self.maxlen = maxlen
self.positions = torch.LongTensor(range(self.pad + 1, self.pad + maxlen + 2))
self.decoder_context = models[0].decoder.context_size()
self.stop_early = stop_early
self.normalize_scores = normalize_scores
self.len_penalty = len_penalty
def cuda(self):
for model in self.models:
model.cuda()
self.positions = self.positions.cuda()
return self
def generate_batched_itr(self, data_itr, maxlen_a=0, maxlen_b=200,
cuda_device=None, timer=None):
"""Iterate over a batched dataset and yield individual translations.
Args:
maxlen_a/b: generate sequences of maximum length ax + b,
where x is the source sentence length.
cuda_device: GPU on which to do generation.
timer: StopwatchMeter for timing generations.
"""
def lstrip_pad(tensor):
return tensor[tensor.eq(self.pad).sum():]
for sample in data_itr:
s = utils.prepare_sample(sample, volatile=True, cuda_device=cuda_device)
input = s['net_input']
srclen = input['src_tokens'].size(1)
if timer is not None:
timer.start()
hypos = self.generate(input['src_tokens'], input['src_positions'],
maxlen=(maxlen_a*srclen + maxlen_b))
if timer is not None:
timer.stop(s['ntokens'])
for i, id in enumerate(s['id']):
src = input['src_tokens'].data[i, :]
# remove padding from ref, which appears at the beginning
ref = lstrip_pad(s['target'].data[i, :])
yield id, src, ref, hypos[i]
def generate(self, src_tokens, src_positions, beam_size=None, maxlen=None):
"""Generate a batch of translations."""
with ExitStack() as stack:
for model in self.models:
stack.enter_context(model.decoder.incremental_inference())
return self._generate(src_tokens, src_positions, beam_size, maxlen)
def _generate(self, src_tokens, src_positions, beam_size=None, maxlen=None):
bsz = src_tokens.size(0)
beam_size = beam_size if beam_size is not None else self.beam_size
maxlen = min(maxlen, self.maxlen) if maxlen is not None else self.maxlen
encoder_outs = []
for model in self.models:
model.eval()
model.decoder.clear_incremental_state() # start a fresh sequence
# compute the encoder output and expand to beam size
encoder_out = model.encoder(src_tokens, src_positions)
encoder_out = self._expand_encoder_out(encoder_out, beam_size)
encoder_outs.append(encoder_out)
# initialize buffers
scores = encoder_outs[0][0].data.new(bsz * beam_size).fill_(0)
tokens = src_tokens.data.new(bsz * beam_size, maxlen + 2).fill_(self.pad)
tokens_buf = tokens.clone()
tokens[:, 0] = self.eos
align = src_tokens.data.new(bsz * beam_size, maxlen + 2).fill_(-1)
align_buf = align.clone()
# list of completed sentences
finalized = [[] for i in range(bsz)]
finished = [False for i in range(bsz)]
worst_finalized = [{'idx': None, 'score': float('Inf')} for i in range(bsz)]
num_remaining_sent = bsz
# number of candidate hypos per step
cand_size = 2 * beam_size # 2 x beam size in case half are EOS
# offset arrays for converting between different indexing schemes
bbsz_offsets = (torch.arange(0, bsz)*beam_size).unsqueeze(1).type_as(tokens)
cand_offsets = torch.arange(0, cand_size).type_as(tokens)
# helper function for allocating buffers on the fly
buffers = {}
def buffer(name, type_of=tokens):
if name not in buffers:
buffers[name] = type_of.new()
return buffers[name]
def is_finished(sent):
"""
Check whether we've finished generation for a given sentence, by
comparing the worst score among finalized hypotheses to the best
possible score among unfinalized hypotheses.
"""
assert len(finalized[sent]) <= beam_size
if len(finalized[sent]) == beam_size:
if self.stop_early:
return True
# stop if the best unfinalized score is worse than the worst
# finalized one
bbsz = sent*beam_size
best_unfinalized_score = scores[bbsz:bbsz+beam_size].max()
if self.normalize_scores:
best_unfinalized_score /= maxlen
if worst_finalized[sent]['score'] >= best_unfinalized_score:
return True
return False
def finalize_hypos(step, bbsz_idx, scores):
"""
Finalize the given hypotheses at this step, while keeping the total
number of finalized hypotheses per sentence <= beam_size.
Note: the input must be in the desired finalization order, so that
hypotheses that appear earlier in the input are preferred to those
that appear later.
Args:
step: current time step
bbsz_idx: A vector of indices in the range [0, bsz*beam_size),
indicating which hypotheses to finalize
scores: A vector of the same size as bbsz_idx containing scores
for each hypothesis
"""
assert bbsz_idx.numel() == scores.numel()
norm_scores = scores/math.pow(step+1, self.len_penalty) if self.normalize_scores else scores
sents_seen = set()
for idx, score in zip(bbsz_idx.cpu(), norm_scores.cpu()):
sent = idx // beam_size
sents_seen.add(sent)
def get_hypo():
hypo = tokens[idx, 1:step+2].clone()
hypo[step] = self.eos
alignment = align[idx, 1:step+2].clone()
return {
'tokens': hypo,
'score': score,
'alignment': alignment,
}
if len(finalized[sent]) < beam_size:
finalized[sent].append(get_hypo())
elif score > worst_finalized[sent]['score']:
# replace worst hypo for this sentence with new/better one
worst_idx = worst_finalized[sent]['idx']
finalized[sent][worst_idx] = get_hypo()
# find new worst finalized hypo for this sentence
idx, s = min(enumerate(finalized[sent]), key=lambda r: r[1]['score'])
worst_finalized[sent] = {
'score': s['score'],
'idx': idx,
}
# return number of hypotheses finished this step
num_finished = 0
for sent in sents_seen:
# check termination conditions for this sentence
if not finished[sent] and is_finished(sent):
finished[sent] = True
num_finished += 1
return num_finished
reorder_state = None
for step in range(maxlen + 1): # one extra step for EOS marker
# reorder decoder internal states based on the prev choice of beams
if reorder_state is not None:
for model in self.models:
model.decoder.reorder_incremental_state(reorder_state)
probs, avg_attn_scores = self._decode(tokens[:, :step+1], encoder_outs)
if step == 0:
# at the first step all hypotheses are equally likely, so use
# only the first beam
probs = probs.unfold(0, 1, beam_size).squeeze(2).contiguous()
else:
# make probs contain cumulative scores for each hypothesis
probs.add_(scores.view(-1, 1))
# record alignment to source tokens, based on attention
_ignore_scores = buffer('_ignore_scores', type_of=scores)
avg_attn_scores.topk(1, out=(_ignore_scores, align[:, step+1].unsqueeze(1)))
# take the best 2 x beam_size predictions. We'll choose the first
# beam_size of these which don't predict eos to continue with.
cand_scores = buffer('cand_scores', type_of=scores)
cand_indices = buffer('cand_indices')
cand_beams = buffer('cand_beams')
probs.view(bsz, -1).topk(cand_size, out=(cand_scores, cand_indices))
torch.div(cand_indices, self.vocab_size, out=cand_beams)
cand_indices.fmod_(self.vocab_size)
# cand_bbsz_idx contains beam indices for the top candidate
# hypotheses, with a range of values: [0, bsz*beam_size),
# and dimensions: [bsz, cand_size]
cand_bbsz_idx = cand_beams.add_(bbsz_offsets)
# finalize hypotheses that end in eos
eos_mask = cand_indices.eq(self.eos)
if step >= self.minlen:
eos_bbsz_idx = buffer('eos_bbsz_idx')
cand_bbsz_idx.masked_select(eos_mask, out=eos_bbsz_idx)
if eos_bbsz_idx.numel() > 0:
eos_scores = buffer('eos_scores', type_of=scores)
cand_scores.masked_select(eos_mask, out=eos_scores)
num_remaining_sent -= finalize_hypos(step, eos_bbsz_idx, eos_scores)
assert num_remaining_sent >= 0
if num_remaining_sent == 0:
break
# set active_mask so that values > cand_size indicate eos hypos
# and values < cand_size indicate candidate active hypos.
# After, the min values per row are the top candidate active hypos
active_mask = buffer('active_mask')
torch.add((eos_mask*cand_size).type_as(cand_offsets), cand_offsets,
out=active_mask)
# get the top beam_size active hypotheses, which are just the hypos
# with the smallest values in active_mask
active_hypos, _ignore = buffer('active_hypos'), buffer('_ignore')
active_mask.topk(beam_size, 1, largest=False, out=(_ignore, active_hypos))
active_bbsz_idx = buffer('active_bbsz_idx')
cand_bbsz_idx.gather(1, active_hypos, out=active_bbsz_idx)
active_scores = cand_scores.gather(1, active_hypos,
out=scores.view(bsz, beam_size))
active_bbsz_idx = active_bbsz_idx.view(-1)
active_scores = active_scores.view(-1)
# finalize all active hypotheses once we hit maxlen
# finalize_hypos will take care of adding the EOS markers
if step == maxlen:
num_remaining_sent -= finalize_hypos(step, active_bbsz_idx, active_scores)
assert num_remaining_sent == 0
break
# copy tokens for active hypotheses
torch.index_select(tokens[:, :step+1], dim=0, index=active_bbsz_idx,
out=tokens_buf[:, :step+1])
cand_indices.gather(1, active_hypos,
out=tokens_buf.view(bsz, beam_size, -1)[:, :, step+1])
# copy attention/alignment for active hypotheses
torch.index_select(align[:, :step+2], dim=0, index=active_bbsz_idx,
out=align_buf[:, :step+2])
# swap buffers
old_tokens = tokens
tokens = tokens_buf
tokens_buf = old_tokens
old_align = align
align = align_buf
align_buf = old_align
# reorder incremental state in decoder
reorder_state = active_bbsz_idx
# sort by score descending
for sent in range(bsz):
finalized[sent] = sorted(finalized[sent], key=lambda r: r['score'], reverse=True)
return finalized
def _decode(self, tokens, encoder_outs):
length = tokens.size(1)
# repeat the first length positions to fill batch
        positions = self.positions[:length].view(1, length).expand_as(tokens)
# wrap in Variables
tokens = Variable(tokens, volatile=True)
positions = Variable(positions, volatile=True)
avg_probs = None
avg_attn = None
for model, encoder_out in zip(self.models, encoder_outs):
decoder_out, attn = model.decoder(tokens, positions, encoder_out)
probs = F.softmax(decoder_out[:, -1, :]).data
attn = attn[:, -1, :].data
if avg_probs is None or avg_attn is None:
avg_probs = probs
avg_attn = attn
else:
avg_probs.add_(probs)
avg_attn.add_(attn)
avg_probs.div_(len(self.models))
avg_probs.log_()
avg_attn.div_(len(self.models))
return avg_probs, avg_attn
def _expand_encoder_out(self, encoder_out, beam_size):
res = []
for tensor in encoder_out:
res.append(
# repeat beam_size times along second dimension
                tensor.repeat(1, beam_size, *[1 for i in range(tensor.dim() - 2)])
                # then collapse into [bsz*beam, ...original dims...]
                .view(-1, *tensor.size()[1:])
)
return tuple(res)
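# A minimal standalone sketch (toy shapes, illustrative only) of the repeat-then-view
# expansion performed in _expand_encoder_out above: each batch row is tiled beam_size
# times along the batch dimension so that every beam hypothesis sees its own copy of
# the encoder output.
if __name__ == '__main__':
    bsz, src_len, dim, beam_size = 2, 5, 4, 3
    enc = torch.randn(bsz, src_len, dim)
    expanded = enc.repeat(1, beam_size, 1).view(bsz * beam_size, src_len, dim)
    assert torch.equal(expanded[0], enc[0])          # copies of sentence 0 come first ...
    assert torch.equal(expanded[beam_size], enc[1])  # ... followed by copies of sentence 1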
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import re
import torch
from fairseq import dictionary
def tokenize_line(line):
line = re.sub(r"\t", "", line)
line = re.sub(r"^\s+", "", line)
line = re.sub(r"\s+$", "", line)
line = re.sub(r"\s+", " ", line)
return line.split()
class Tokenizer:
@staticmethod
def build_dictionary(filename, tokenize=tokenize_line):
dict = dictionary.Dictionary()
Tokenizer.add_file_to_dictionary(filename, dict, tokenize)
dict.finalize()
return dict
@staticmethod
def add_file_to_dictionary(filename, dict, tokenize):
with open(filename, 'r') as f:
for line in f.readlines():
for word in tokenize(line):
dict.add_symbol(word)
dict.add_symbol(dict.eos_word)
@staticmethod
def binarize(filename, dict, consumer, tokenize=tokenize_line):
nseq, ntok, nunk = 0, 0, 0
replaced = {}
with open(filename, 'r') as f:
for line in f.readlines():
words = tokenize(line)
nwords = len(words)
ids = torch.IntTensor(nwords + 1)
nseq = nseq + 1
for i in range(0, len(words)):
word = words[i]
idx = dict.index(word)
if idx == dict.unk_index and word != dict.unk_word:
nunk = nunk + 1
if word in replaced:
replaced[word] = replaced[word] + 1
else:
replaced[word] = 1
ids[i] = idx
ids[nwords] = dict.eos_index
consumer(ids)
ntok = ntok + len(ids)
return {'nseq': nseq, 'nunk': nunk, 'ntok': ntok, 'replaced': len(replaced)}
@staticmethod
def tokenize(line, dict, tokenize=tokenize_line, add_if_not_exist=True):
words = tokenize(line)
nwords = len(words)
ids = torch.IntTensor(nwords + 1)
for i in range(0, len(words)):
if add_if_not_exist:
ids[i] = dict.add_symbol(words[i])
else:
ids[i] = dict.index(words[i])
ids[nwords] = dict.eos_index
return ids
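# A small usage sketch (illustrative only; the input string is made up): tokenize_line
# normalizes whitespace, and Tokenizer.tokenize maps each word to a dictionary index
# and appends EOS, so the result has one more entry than there are words.
if __name__ == '__main__':
    d = dictionary.Dictionary()
    words = tokenize_line('hello   world \t !')
    ids = Tokenizer.tokenize('hello   world \t !', d, add_if_not_exist=True)
    assert len(ids) == len(words) + 1       # trailing slot holds EOS
    assert ids[len(words)] == d.eos_index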
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import os
import torch
from torch.autograd import Variable
from torch.serialization import default_restore_location
from fairseq import criterions, data, models
def build_model(args, dataset):
if args.arch == 'fconv':
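        # the layer/attention arguments arrive as Python-literal strings (e.g.
        # '[(512, 3)] * 20' for a list of (out_channels, kernel_size) tuples) and are
        # evaluated here into the structures expected by models.fconv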
encoder_layers = eval(args.encoder_layers)
decoder_layers = eval(args.decoder_layers)
decoder_attention = eval(args.decoder_attention)
model = models.fconv(
dataset, args.dropout, args.encoder_embed_dim, encoder_layers,
args.decoder_embed_dim, decoder_layers, decoder_attention,
decoder_out_embed_dim=args.decoder_out_embed_dim,
max_positions=args.max_positions)
else:
model = models.__dict__[args.arch](dataset, args.dropout,
max_positions=args.max_positions)
return model
def build_criterion(args, dataset):
padding_idx = dataset.dst_dict.pad()
if args.label_smoothing > 0:
return criterions.LabelSmoothedCrossEntropyCriterion(args.label_smoothing, padding_idx)
else:
return criterions.CrossEntropyCriterion(padding_idx)
def torch_persistent_save(*args, **kwargs):
    # retry a few times to survive transient I/O failures; re-raise on the final attempt
    for i in range(3):
        try:
            return torch.save(*args, **kwargs)
        except Exception:
            if i == 2:
                raise
def save_checkpoint(args, epoch, batch_offset, model, optimizer, lr_scheduler, val_loss=None):
state_dict = {
'args': args,
'epoch': epoch,
'batch_offset': batch_offset,
'model': model.state_dict(),
'optimizer': optimizer.state_dict(),
'best_loss': lr_scheduler.best,
'val_loss': val_loss,
}
if batch_offset == 0:
if not args.no_epoch_checkpoints:
epoch_filename = os.path.join(args.save_dir, 'checkpoint{}.pt'.format(epoch))
torch_persistent_save(state_dict, epoch_filename)
assert val_loss is not None
if not hasattr(save_checkpoint, 'best') or val_loss < save_checkpoint.best:
save_checkpoint.best = val_loss
best_filename = os.path.join(args.save_dir, 'checkpoint_best.pt')
torch_persistent_save(state_dict, best_filename)
last_filename = os.path.join(args.save_dir, 'checkpoint_last.pt')
torch_persistent_save(state_dict, last_filename)
def load_checkpoint(filename, model, optimizer, lr_scheduler, cuda_device=None):
if not os.path.exists(filename):
return 1, 0
if cuda_device is None:
state = torch.load(filename)
else:
state = torch.load(
filename,
map_location=lambda s, l: default_restore_location(s, 'cuda:{}'.format(cuda_device))
)
model.load_state_dict(state['model'])
optimizer.load_state_dict(state['optimizer'])
lr_scheduler.best = state['best_loss']
epoch = state['epoch'] + 1
batch_offset = state['batch_offset']
gpu_str = ' on GPU #{}'.format(cuda_device) if cuda_device is not None else ''
print('| loaded checkpoint {} (epoch {}){}'.format(filename, epoch, gpu_str))
return epoch, batch_offset
def load_ensemble_for_inference(models, data_path):
# load model architectures and weights
states = []
for model in models:
if not os.path.exists(model):
raise IOError('Model file not found: ' + model)
states.append(
torch.load(model, map_location=lambda s, l: default_restore_location(s, 'cpu'))
)
# load dataset
args = states[0]['args']
dataset = data.load(data_path, args.source_lang, args.target_lang)
# build models
models = []
for state in states:
model = build_model(args, dataset)
model.load_state_dict(state['model'])
models.append(model)
return models, dataset
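# For reference, the checkpoints written by save_checkpoint (and read back by
# load_checkpoint and load_ensemble_for_inference) are ordinary torch.save dicts with
# the keys below; the scalar values shown are illustrative:
#
#     {
#         'args': args,                         # argparse.Namespace with model settings
#         'epoch': 3,
#         'batch_offset': 0,
#         'model': model.state_dict(),
#         'optimizer': optimizer.state_dict(),
#         'best_loss': lr_scheduler.best,
#         'val_loss': 1.23,
#     }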
def prepare_sample(sample, volatile=False, cuda_device=None):
"""Wrap input tensors in Variable class."""
def make_variable(tensor):
if cuda_device is not None and torch.cuda.is_available():
tensor = tensor.cuda(async=True, device=cuda_device)
return Variable(tensor, volatile=volatile)
return {
'id': sample['id'],
'ntokens': sample['ntokens'],
'target': make_variable(sample['target']),
'net_input': {
key: make_variable(sample[key])
for key in ['src_tokens', 'src_positions', 'input_tokens', 'input_positions']
},
}
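# Illustrative call sites for prepare_sample (variable names are hypothetical):
# volatile=True disables autograd history and suits validation/generation, while
# training keeps the default volatile=False:
#
#     sample = prepare_sample(batch, volatile=False, cuda_device=0)      # training on GPU 0
#     sample = prepare_sample(batch, volatile=True, cuda_device=None)    # evaluation on CPU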
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import sys
import torch
from torch.autograd import Variable
from fairseq import bleu, options, utils, tokenizer
from fairseq.meters import StopwatchMeter, TimeMeter
from fairseq.progress_bar import progress_bar
from fairseq.sequence_generator import SequenceGenerator
def main():
parser = options.get_parser('Generation')
parser.add_argument('--path', metavar='FILE', required=True, action='append',
help='path(s) to model file(s)')
dataset_args = options.add_dataset_args(parser)
dataset_args.add_argument('-i', '--interactive', action='store_true',
help='generate translations in interactive mode')
dataset_args.add_argument('--batch-size', default=32, type=int, metavar='N',
help='batch size')
dataset_args.add_argument('--gen-subset', default='test', metavar='SPLIT',
help='data subset to generate (train, valid, test)')
options.add_generation_args(parser)
args = parser.parse_args()
print(args)
if args.no_progress_bar:
progress_bar.enabled = False
use_cuda = torch.cuda.is_available() and not args.cpu
# Load model and dataset
print('| loading model(s) from {}'.format(', '.join(args.path)))
models, dataset = utils.load_ensemble_for_inference(args.path, args.data)
print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
if not args.interactive:
print('| {} {} {} examples'.format(args.data, args.gen_subset, len(dataset.splits[args.gen_subset])))
# Optimize model for generation
for model in models:
model.make_generation_fast_(args.beam, not args.no_beamable_mm)
# Initialize generator
translator = SequenceGenerator(models, dataset.dst_dict, beam_size=args.beam,
stop_early=(not args.no_early_stop),
normalize_scores=(not args.unnormalized),
len_penalty=args.lenpen)
align_dict = {}
if args.unk_replace_dict != '':
        assert args.interactive, "Unknown word replacement requires access to the original source " \
                                 "and is only supported in interactive mode"
with open(args.unk_replace_dict, 'r') as f:
for line in f:
l = line.split()
align_dict[l[0]] = l[1]
def replace_unk(hypo_str, align_str, src, unk):
hypo_tokens = hypo_str.split()
src_tokens = tokenizer.tokenize_line(src)
align_idx = [int(i) for i in align_str.split()]
for i, ht in enumerate(hypo_tokens):
if ht == unk:
src_token = src_tokens[align_idx[i]]
if src_token in align_dict:
hypo_tokens[i] = align_dict[src_token]
else:
hypo_tokens[i] = src_token
return ' '.join(hypo_tokens)
if use_cuda:
translator.cuda()
bpe_symbol = '@@ ' if args.remove_bpe else None
def display_hypotheses(id, src, orig, ref, hypos):
id_str = '' if id is None else '-{}'.format(id)
src_str = to_sentence(dataset.src_dict, src, bpe_symbol)
print('S{}\t{}'.format(id_str, src_str))
if orig is not None:
print('O{}\t{}'.format(id_str, orig.strip()))
if ref is not None:
print('T{}\t{}'.format(id_str, to_sentence(dataset.dst_dict, ref, bpe_symbol, ref_unk=True)))
for hypo in hypos:
hypo_str = to_sentence(dataset.dst_dict, hypo['tokens'], bpe_symbol)
align_str = ' '.join(map(str, hypo['alignment']))
if args.unk_replace_dict != '':
hypo_str = replace_unk(hypo_str, align_str, orig, unk_symbol(dataset.dst_dict))
print('H{}\t{}\t{}'.format(
id_str, hypo['score'], hypo_str))
print('A{}\t{}'.format(id_str, align_str))
if args.interactive:
for line in sys.stdin:
tokens = tokenizer.Tokenizer.tokenize(line, dataset.src_dict, add_if_not_exist=False).long()
start = dataset.src_dict.pad() + 1
positions = torch.arange(start, start + len(tokens)).type_as(tokens)
if use_cuda:
positions = positions.cuda()
tokens = tokens.cuda()
translations = translator.generate(Variable(tokens.view(1, -1)), Variable(positions.view(1, -1)))
hypos = translations[0]
display_hypotheses(None, tokens, line, None, hypos[:min(len(hypos), args.nbest)])
else:
non_bpe_dict = {}
def maybe_remove_bpe_and_reindex(tokens):
"""Helper for removing BPE symbols from a tensor of indices.
If BPE removal is enabled, the returned tensor is reindexed
using a new dictionary that is created on-the-fly."""
if not args.remove_bpe:
return tokens
assert (tokens == dataset.dst_dict.pad()).sum() == 0
return torch.IntTensor([
non_bpe_dict.setdefault(w, len(non_bpe_dict))
for w in to_sentence(dataset.dst_dict, tokens, bpe_symbol).split(' ')
])
# Generate and compute BLEU score
scorer = bleu.Scorer(
dataset.dst_dict.pad() if not args.remove_bpe else -1,
dataset.dst_dict.eos() if not args.remove_bpe else -1,
dataset.dst_dict.unk())
itr = dataset.dataloader(args.gen_subset, batch_size=args.batch_size, max_positions=args.max_positions)
num_sentences = 0
with progress_bar(itr, smoothing=0, leave=False) as t:
wps_meter = TimeMeter()
gen_timer = StopwatchMeter()
translations = translator.generate_batched_itr(
t, maxlen_a=args.max_len_a, maxlen_b=args.max_len_b,
cuda_device=0 if use_cuda else None, timer=gen_timer)
for id, src, ref, hypos in translations:
ref = ref.int().cpu()
top_hypo = hypos[0]['tokens'].int().cpu()
scorer.add(maybe_remove_bpe_and_reindex(ref), maybe_remove_bpe_and_reindex(top_hypo))
display_hypotheses(id, src, None, ref, hypos[:min(len(hypos), args.nbest)])
wps_meter.update(src.size(0))
t.set_postfix(wps='{:5d}'.format(round(wps_meter.avg)))
num_sentences += 1
print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} tokens/s)'.format(
num_sentences, gen_timer.n, gen_timer.sum, 1. / gen_timer.avg))
print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam, scorer.result_string()))
def to_token(dict, i, runk):
return runk if i == dict.unk() else dict[i]
def unk_symbol(dict, ref_unk=False):
return '<{}>'.format(dict.unk_word) if ref_unk else dict.unk_word
def to_sentence(dict, tokens, bpe_symbol=None, ref_unk=False):
if torch.is_tensor(tokens) and tokens.dim() == 2:
        sentences = [to_sentence(dict, token, bpe_symbol, ref_unk) for token in tokens]
return '\n'.join(sentences)
eos = dict.eos()
runk = unk_symbol(dict, ref_unk=ref_unk)
sent = ' '.join([to_token(dict, i, runk) for i in tokens if i != eos])
if bpe_symbol is not None:
sent = sent.replace(bpe_symbol, '')
return sent
if __name__ == '__main__':
main()
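# Example invocation (paths are hypothetical; the data-directory and beam-size options
# come from options.add_dataset_args / options.add_generation_args, which are defined
# elsewhere, so their exact flag names are assumed here):
#
#     python generate.py data-bin/iwslt14.de-en \
#         --path checkpoints/checkpoint_best.pt \
#         --gen-subset test --batch-size 32 --beam 5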
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import argparse
import os
from itertools import zip_longest
from fairseq import dictionary, indexed_dataset
from fairseq.tokenizer import Tokenizer
def main():
parser = argparse.ArgumentParser(
description='Data pre-processing: Create dictionary and store data in binary format')
parser.add_argument('-s', '--source-lang', default=None, metavar='SRC', help='source language')
parser.add_argument('-t', '--target-lang', default=None, metavar='TARGET', help='target language')
    parser.add_argument('--trainpref', metavar='FP', default='train', help='train file prefix')
    parser.add_argument('--validpref', metavar='FP', default='valid', help='comma-separated valid file prefixes')
    parser.add_argument('--testpref', metavar='FP', default='test', help='comma-separated test file prefixes')
parser.add_argument('--destdir', metavar='DIR', default='data-bin', help='destination dir')
parser.add_argument('--thresholdtgt', metavar='N', default=0, type=int,
help='map words appearing less than threshold times to unknown')
parser.add_argument('--thresholdsrc', metavar='N', default=0, type=int,
help='map words appearing less than threshold times to unknown')
parser.add_argument('--nwordstgt', metavar='N', default=-1, type=int, help='number of target words to retain')
parser.add_argument('--nwordssrc', metavar='N', default=-1, type=int, help='number of source words to retain')
parser.add_argument('--alignfile', metavar='ALIGN', default=None, help='an alignment file (optional)')
args = parser.parse_args()
print(args)
os.makedirs(args.destdir, exist_ok=True)
src_dict = Tokenizer.build_dictionary(filename='{}.{}'.format(args.trainpref, args.source_lang))
src_dict.save(os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang)),
threshold=args.thresholdsrc, nwords=args.nwordssrc)
tgt_dict = Tokenizer.build_dictionary(filename='{}.{}'.format(args.trainpref, args.target_lang))
tgt_dict.save(os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang)),
threshold=args.thresholdtgt, nwords=args.nwordstgt)
def make_dataset(input_prefix, output_prefix, lang):
dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(lang)))
print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
ds = indexed_dataset.IndexedDatasetBuilder(
'{}/{}.{}-{}.{}.bin'.format(args.destdir, output_prefix, args.source_lang,
args.target_lang, lang)
)
def consumer(tensor):
ds.add_item(tensor)
input_file = '{}.{}'.format(input_prefix, lang)
res = Tokenizer.binarize(input_file, dict, consumer)
print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
lang, input_file, res['nseq'], res['ntok'],
100 * res['nunk'] / res['ntok'], dict.unk_word))
ds.finalize('{}/{}.{}-{}.{}.idx'.format(
args.destdir, output_prefix,
args.source_lang, args.target_lang, lang))
make_dataset(args.trainpref, 'train', args.source_lang)
make_dataset(args.trainpref, 'train', args.target_lang)
for k, validpref in enumerate(args.validpref.split(',')):
outprefix = 'valid{}'.format(k) if k > 0 else 'valid'
make_dataset(validpref, outprefix, args.source_lang)
make_dataset(validpref, outprefix, args.target_lang)
for k, testpref in enumerate(args.testpref.split(',')):
outprefix = 'test{}'.format(k) if k > 0 else 'test'
make_dataset(testpref, outprefix, args.source_lang)
make_dataset(testpref, outprefix, args.target_lang)
print('| Wrote preprocessed data to {}'.format(args.destdir))
if args.alignfile:
src_file_name = '{}.{}'.format(args.trainpref, args.source_lang)
tgt_file_name = '{}.{}'.format(args.trainpref, args.target_lang)
src_dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang)))
tgt_dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang)))
freq_map = {}
with open(args.alignfile, 'r') as align_file:
with open(src_file_name, 'r') as src_file:
with open(tgt_file_name, 'r') as tgt_file:
for a, s, t in zip_longest(align_file, src_file, tgt_file):
si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False)
ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False)
ai = list(map(lambda x: tuple(x.split('-')), a.split()))
for sai, tai in ai:
srcidx = si[int(sai)]
tgtidx = ti[int(tai)]
if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
assert srcidx != src_dict.pad()
assert srcidx != src_dict.eos()
assert tgtidx != tgt_dict.pad()
assert tgtidx != tgt_dict.eos()
if srcidx not in freq_map:
freq_map[srcidx] = {}
if tgtidx not in freq_map[srcidx]:
freq_map[srcidx][tgtidx] = 1
else:
freq_map[srcidx][tgtidx] += 1
align_dict = {}
for srcidx in freq_map.keys():
align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get)
with open(os.path.join(args.destdir, 'alignment.{}-{}.txt'.format(
args.source_lang, args.target_lang)), 'w') as f:
for k, v in align_dict.items():
print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
if __name__ == '__main__':
main()
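# For a hypothetical run with --source-lang de --target-lang en, the script writes the
# following files into --destdir:
#
#     dict.de.txt, dict.en.txt                            # source / target dictionaries
#     train.de-en.de.bin / .idx, train.de-en.en.bin / .idx
#     valid.de-en.{de,en}.bin / .idx, test.de-en.{de,en}.bin / .idx
#     alignment.de-en.txt                                 # only when --alignfile is given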
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import argparse
import os
import sys
from fairseq import bleu, dictionary, tokenizer
def main():
parser = argparse.ArgumentParser(description='Command-line script for BLEU scoring.')
parser.add_argument('-s', '--sys', default='-', help='system output')
parser.add_argument('-r', '--ref', default='', help='references')
parser.add_argument('-o', '--order', default=4, metavar='N',
type=int, help='consider ngrams up to this order')
parser.add_argument('--ignore-case', action='store_true',
help='case-insensitive scoring')
args = parser.parse_args()
print(args)
assert args.sys == '-' or os.path.exists(args.sys), \
"System output file {} does not exist".format(args.sys)
assert os.path.exists(args.ref), \
"Reference file {} does not exist".format(args.ref)
dict = dictionary.Dictionary()
    def readlines(fd):
        for line in fd.readlines():
            if args.ignore_case:
                yield line.lower()
            else:
                yield line
def score(fdsys):
with open(args.ref) as fdref:
scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())
for sys_tok, ref_tok in zip(readlines(fdsys), readlines(fdref)):
sys_tok = tokenizer.Tokenizer.tokenize(sys_tok, dict)
ref_tok = tokenizer.Tokenizer.tokenize(ref_tok, dict)
scorer.add(ref_tok, sys_tok)
print(scorer.result_string(args.order))
if args.sys == '-':
score(sys.stdin)
else:
with open(args.sys, 'r') as f:
score(f)
if __name__ == '__main__':
main()
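# Example invocations (the script and file names are hypothetical):
#
#     python score.py --sys generate.sys.txt --ref generate.ref.txt
#     cat generate.sys.txt | python score.py --ref generate.ref.txt --ignore-case --order 4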