initial commit

18d27e00 · wangwei990215 · 541f4c7a · 18d27e00 · 18d27e00 · 18d27e00
Commit 18d27e00 authored Aug 27, 2024 by wangwei990215
20 changed files
--- a/fairseq/examples/linformer/linformer_src/modules/__pycache__/linformer_sentence_encoder.cpython-38.pyc
+++ b/fairseq/examples/linformer/linformer_src/modules/__pycache__/linformer_sentence_encoder.cpython-38.pyc
--- a/fairseq/examples/linformer/linformer_src/modules/__pycache__/linformer_sentence_encoder_layer.cpython-38.pyc
+++ b/fairseq/examples/linformer/linformer_src/modules/__pycache__/linformer_sentence_encoder_layer.cpython-38.pyc
--- a/fairseq/examples/linformer/linformer_src/modules/__pycache__/multihead_linear_attention.cpython-38.pyc
+++ b/fairseq/examples/linformer/linformer_src/modules/__pycache__/multihead_linear_attention.cpython-38.pyc
--- a/fairseq/examples/linformer/linformer_src/modules/linformer_sentence_encoder.py
+++ b/fairseq/examples/linformer/linformer_src/modules/linformer_sentence_encoder.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch.nn as nn
+from fairseq.modules import TransformerSentenceEncoder
+
+from .linformer_sentence_encoder_layer import LinformerSentenceEncoderLayer
+
+
+class LinformerSentenceEncoder(TransformerSentenceEncoder):
+    """
+    Implementation for a Bi-directional Linformer based Sentence Encoder used
+    in BERT/XLM style pre-trained models.
+
+    This first computes the token embedding using the token embedding matrix,
+    position embeddings (if specified) and segment embeddings
+    (if specified). After applying the specified number of
+    LinformerEncoderLayers, it outputs all the internal states of the
+    encoder as well as the final representation associated with the first
+    token (usually CLS token).
+
+    Input:
+        - tokens: B x T matrix representing sentences
+        - segment_labels: B x T matrix representing segment label for tokens
+
+    Output:
+        - a tuple of the following:
+            - a list of internal model states used to compute the
+              predictions where each tensor has shape T x B x C
+            - sentence representation associated with first input token
+              in format B x C.
+
+    Linformer-specific arguments (all other arguments are forwarded
+    unchanged to ``TransformerSentenceEncoder``):
+        - compressed: divisor applied to ``max_seq_len``; the shared
+          compression projection maps ``max_seq_len`` positions to
+          ``max_seq_len // compressed``
+        - shared_kv_compressed: forwarded to every layer (in the attention
+          module a value of 1 reuses the key projection for the values)
+        - shared_layer_kv_compressed: when 1, one compression projection is
+          created by the encoder and handed to every layer
+        - freeze_compress: when 1, the weight of the shared compression
+          projection is excluded from gradient updates; the flag is also
+          forwarded to every layer
+    """
+
+    def __init__(
+        self,
+        padding_idx: int,
+        vocab_size: int,
+        num_encoder_layers: int = 6,
+        embedding_dim: int = 768,
+        ffn_embedding_dim: int = 3072,
+        num_attention_heads: int = 8,
+        dropout: float = 0.1,
+        attention_dropout: float = 0.1,
+        activation_dropout: float = 0.1,
+        layerdrop: float = 0.0,
+        max_seq_len: int = 256,
+        num_segments: int = 2,
+        use_position_embeddings: bool = True,
+        offset_positions_by_padding: bool = True,
+        encoder_normalize_before: bool = False,
+        apply_bert_init: bool = False,
+        activation_fn: str = "relu",
+        learned_pos_embedding: bool = True,
+        embed_scale: float = None,
+        freeze_embeddings: bool = False,
+        n_trans_layers_to_freeze: int = 0,
+        export: bool = False,
+        traceable: bool = False,
+        q_noise: float = 0.0,
+        qn_block_size: int = 8,
+        compressed: int = 4,
+        shared_kv_compressed: int = 0,
+        shared_layer_kv_compressed: int = 0,
+        freeze_compress: int = 0,
+    ) -> None:
+
+        # Initialize linformer parameters.
+        # These are stored *before* super().__init__() because
+        # build_transformer_sentence_encoder_layer() reads them; presumably the
+        # parent constructor is what calls that builder -- confirm against
+        # fairseq's TransformerSentenceEncoder.
+        self.compressed = compressed
+        self.shared_kv_compressed = shared_kv_compressed
+        self.shared_layer_kv_compressed = shared_layer_kv_compressed
+        # Created lazily by the first layer that needs it.
+        self.compress_layer = None
+        self.freeze_compress = freeze_compress
+
+        super().__init__(
+            padding_idx=padding_idx,
+            vocab_size=vocab_size,
+            num_encoder_layers=num_encoder_layers,
+            embedding_dim=embedding_dim,
+            ffn_embedding_dim=ffn_embedding_dim,
+            num_attention_heads=num_attention_heads,
+            dropout=dropout,
+            attention_dropout=attention_dropout,
+            activation_dropout=activation_dropout,
+            layerdrop=layerdrop,
+            max_seq_len=max_seq_len,
+            num_segments=num_segments,
+            use_position_embeddings=use_position_embeddings,
+            offset_positions_by_padding=offset_positions_by_padding,
+            encoder_normalize_before=encoder_normalize_before,
+            apply_bert_init=apply_bert_init,
+            activation_fn=activation_fn,
+            learned_pos_embedding=learned_pos_embedding,
+            embed_scale=embed_scale,
+            freeze_embeddings=freeze_embeddings,
+            n_trans_layers_to_freeze=n_trans_layers_to_freeze,
+            export=export,
+            traceable=traceable,
+            q_noise=q_noise,
+            qn_block_size=qn_block_size,
+        )
+
+    def build_transformer_sentence_encoder_layer(
+        self,
+        embedding_dim,
+        ffn_embedding_dim,
+        num_attention_heads,
+        dropout,
+        attention_dropout,
+        activation_dropout,
+        activation_fn,
+        export,
+        q_noise,
+        qn_block_size,
+    ):
+        """Build one ``LinformerSentenceEncoderLayer``.
+
+        When ``shared_layer_kv_compressed == 1`` the compression projection is
+        created on the first call only and the same module instance is passed
+        to every layer, so its weight is really shared across layers.
+        """
+        # Only build the projection once: re-creating it on every call would
+        # hand each layer a different nn.Linear and nothing would be shared.
+        if self.shared_layer_kv_compressed == 1 and self.compress_layer is None:
+            compress_layer = nn.Linear(
+                self.max_seq_len, self.max_seq_len // self.compressed
+            )
+            # initialize parameters for compressed layer
+            nn.init.xavier_uniform_(compress_layer.weight, gain=1 / math.sqrt(2))
+            if self.freeze_compress == 1:
+                compress_layer.weight.requires_grad = False
+            self.compress_layer = compress_layer
+
+        return LinformerSentenceEncoderLayer(
+            embedding_dim=embedding_dim,
+            ffn_embedding_dim=ffn_embedding_dim,
+            num_attention_heads=num_attention_heads,
+            dropout=dropout,
+            attention_dropout=attention_dropout,
+            activation_dropout=activation_dropout,
+            activation_fn=activation_fn,
+            export=export,
+            q_noise=q_noise,
+            qn_block_size=qn_block_size,
+            compressed=self.compressed,
+            max_seq_len=self.max_seq_len,
+            shared_kv_compressed=self.shared_kv_compressed,
+            shared_compress_layer=(
+                None if self.shared_layer_kv_compressed == 0 else self.compress_layer
+            ),
+            freeze_compress=self.freeze_compress,
+        )
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """Copy encoder-level ``compress_layer.*`` entries to every layer.
+
+        For each key starting with ``<name>.compress_layer`` an entry
+        ``<name>.layers.<i>.shared_compress_layer.<suffix>`` is written for
+        every layer index ``i``. The original keys are kept. ``state_dict`` is
+        modified in place; nothing is returned. Nothing happens unless
+        ``shared_layer_kv_compressed`` is truthy.
+        """
+        prefix = name + "." if name != "" else ""
+        legacy_prefix = prefix + "compress_layer"
+        items_to_add = {}
+
+        # update key name for shared layer in new version of code
+        if self.shared_layer_kv_compressed:
+            for k in state_dict.keys():
+                if not k.startswith(legacy_prefix):
+                    continue
+                # text after "compress_layer." (e.g. "weight" or "bias")
+                suffix = k[len(legacy_prefix) + 1 :]
+                for layer_idx in range(len(self.layers)):
+                    new_k = prefix + "layers.{0}.shared_compress_layer.{1}".format(
+                        layer_idx,
+                        suffix,
+                    )
+                    items_to_add[new_k] = state_dict[k]
+
+        # Additions are applied after the scan so the dict is never resized
+        # while it is being iterated.
+        state_dict.update(items_to_add)
--- a/fairseq/examples/linformer/linformer_src/modules/linformer_sentence_encoder_layer.py
+++ b/fairseq/examples/linformer/linformer_src/modules/linformer_sentence_encoder_layer.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Callable
+
+from fairseq.modules import TransformerSentenceEncoderLayer
+
+from .multihead_linear_attention import MultiheadLinearAttention
+
+
+class LinformerSentenceEncoderLayer(TransformerSentenceEncoderLayer):
+    """
+    Implements a Linformer Encoder Layer used in BERT/XLM style pre-trained
+    models.
+
+    Compared to the parent layer, self-attention is built as a
+    ``MultiheadLinearAttention`` configured with ``compressed``,
+    ``max_seq_len``, ``shared_kv_compressed``, ``shared_compress_layer`` and
+    ``freeze_compress``.
+    """
+
+    def __init__(
+        self,
+        embedding_dim: int = 768,
+        ffn_embedding_dim: int = 3072,
+        num_attention_heads: int = 8,
+        dropout: float = 0.1,
+        attention_dropout: float = 0.1,
+        activation_dropout: float = 0.1,
+        activation_fn: str = "relu",
+        export: bool = False,
+        q_noise: float = 0.0,
+        qn_block_size: int = 8,
+        init_fn: Callable = None,
+        compressed: int = 1,
+        max_seq_len: int = 256,
+        shared_kv_compressed: int = 0,
+        shared_compress_layer: any = None,
+        freeze_compress: int = 0,
+    ) -> None:
+        """
+        Args:
+            init_fn: optional zero-argument callback; it is invoked right
+                after ``shared_compress_layer`` has been attached to the layer
+            compressed, max_seq_len, shared_kv_compressed, freeze_compress:
+                stored and forwarded to ``MultiheadLinearAttention``
+            shared_compress_layer: compression module shared with other
+                layers, or None to let the attention module create its own
+
+        All remaining arguments are forwarded unchanged to the parent class.
+        """
+
+        # Initialize linformer parameters
+        self.compressed = compressed
+        self.max_seq_len = max_seq_len
+        self.shared_kv_compressed = shared_kv_compressed
+        self.freeze_compress = freeze_compress
+
+        # Keep a handle on the caller's callback: the hook defined below used
+        # to be named ``init_fn`` as well, which shadowed the argument and
+        # silently dropped it.
+        caller_init_fn = init_fn
+
+        def _linformer_init_fn():
+            # This needs to be set after nn.Module.__init__ is called
+            self.shared_compress_layer = shared_compress_layer
+            if caller_init_fn is not None:
+                caller_init_fn()
+
+        super().__init__(
+            embedding_dim=embedding_dim,
+            ffn_embedding_dim=ffn_embedding_dim,
+            num_attention_heads=num_attention_heads,
+            dropout=dropout,
+            attention_dropout=attention_dropout,
+            activation_dropout=activation_dropout,
+            activation_fn=activation_fn,
+            export=export,
+            q_noise=q_noise,
+            qn_block_size=qn_block_size,
+            init_fn=_linformer_init_fn,
+        )
+
+    def build_self_attention(
+        self,
+        embed_dim,
+        num_attention_heads,
+        dropout,
+        self_attention,
+        q_noise,
+        qn_block_size,
+    ):
+        """Return the ``MultiheadLinearAttention`` used as self-attention.
+
+        The ``self_attention`` argument is accepted for signature
+        compatibility but ignored: the module is always built with
+        ``self_attention=True``.
+        """
+        return MultiheadLinearAttention(
+            embed_dim,
+            num_attention_heads,
+            dropout=dropout,
+            self_attention=True,
+            q_noise=q_noise,
+            qn_block_size=qn_block_size,
+            compressed=self.compressed,
+            max_seq_len=self.max_seq_len,
+            shared_kv_compressed=self.shared_kv_compressed,
+            shared_compress_layer=self.shared_compress_layer,
+            freeze_compress=self.freeze_compress,
+        )
--- a/fairseq/examples/linformer/linformer_src/modules/multihead_linear_attention.py
+++ b/fairseq/examples/linformer/linformer_src/modules/multihead_linear_attention.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from typing import Dict, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.incremental_decoding_utils import with_incremental_state
+from fairseq.modules.quant_noise import quant_noise
+from torch import Tensor, nn
+from torch.nn import Parameter
+
+
+@with_incremental_state
+class MultiheadLinearAttention(nn.Module):
+    """Multi-headed linformer attention.
+
+    Projects the key and values down to the compressed dimension, before computing self-attention.
+
+    See "Linformer: Self-Attention with Linear Complexity" for more details.
+    """
+
+    def __init__(
+        self,
+        embed_dim,
+        num_heads,
+        kdim=None,
+        vdim=None,
+        dropout=0.0,
+        bias=True,
+        add_bias_kv=False,
+        add_zero_attn=False,
+        self_attention=False,
+        encoder_decoder_attention=False,
+        q_noise=0.0,
+        qn_block_size=8,
+        compressed=1,
+        max_seq_len=256,
+        shared_kv_compressed=0,
+        shared_compress_layer=None,
+        freeze_compress=0,
+    ):
+        """
+        Args:
+            embed_dim: model dimension; must be divisible by ``num_heads``
+            num_heads: number of attention heads
+            kdim, vdim: input sizes of the key / value projections
+                (default: ``embed_dim``)
+            dropout: dropout probability applied to the attention weights
+            bias: whether the q/k/v/out projections have a bias term
+            add_bias_kv: add a learned bias vector to keys and values
+            add_zero_attn: append an all-zero key/value position
+            self_attention: keys and values are derived from ``query``
+            encoder_decoder_attention: keys and values are derived from ``key``
+            q_noise, qn_block_size: passed to ``quant_noise``
+            compressed: the sequence projection maps ``max_seq_len``
+                positions to ``max_seq_len // compressed``
+            max_seq_len: input width of the sequence projection
+            shared_kv_compressed: 0 creates a separate projection for the
+                values; 1 makes the values reuse the key projection
+            shared_compress_layer: externally owned projection to use instead
+                of creating one; its weights are not re-initialized here
+            freeze_compress: 1 disables gradients for the projection weights
+        """
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.kdim = kdim if kdim is not None else embed_dim
+        self.vdim = vdim if vdim is not None else embed_dim
+        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
+
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        assert (
+            self.head_dim * num_heads == self.embed_dim
+        ), "embed_dim must be divisible by num_heads"
+        self.scaling = self.head_dim ** -0.5
+
+        self.self_attention = self_attention
+        self.encoder_decoder_attention = encoder_decoder_attention
+
+        assert not self.self_attention or self.qkv_same_dim, (
+            "Self-attention requires query, key and " "value to be of the same size"
+        )
+
+        self.k_proj = quant_noise(
+            nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size
+        )
+        self.v_proj = quant_noise(
+            nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size
+        )
+        self.q_proj = quant_noise(
+            nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
+        )
+
+        # used for compress sequence to subsequence
+        if shared_compress_layer is None:
+            # This module owns its projection(s).
+            self.compress_seq_len = max_seq_len // compressed
+            self.compress_k = nn.Linear(max_seq_len, self.compress_seq_len, bias=False)
+            if shared_kv_compressed == 0:
+                self.compress_v = nn.Linear(
+                    max_seq_len, self.compress_seq_len, bias=False
+                )
+            self.layerwise_sharing = False
+        else:
+            # Projection supplied by the caller; the same module is used for
+            # keys and (when not kv-shared) for values.
+            self.compress_k = shared_compress_layer
+            if shared_kv_compressed == 0:
+                self.compress_v = shared_compress_layer
+            self.layerwise_sharing = True
+        self.shared_kv_compressed = shared_kv_compressed
+
+        self.out_proj = quant_noise(
+            nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
+        )
+
+        if add_bias_kv:
+            self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
+            self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
+        else:
+            self.bias_k = self.bias_v = None
+
+        self.add_zero_attn = add_zero_attn
+
+        self.reset_parameters()
+
+        if freeze_compress == 1:
+            self.compress_k.weight.requires_grad = False
+            if shared_kv_compressed == 0:
+                self.compress_v.weight.requires_grad = False
+
+        self.onnx_trace = False
+        self.tpu = False
+
+    def prepare_for_onnx_export_(self):
+        """Switch the module to its ONNX-tracing code paths."""
+        self.onnx_trace = True
+
+    def prepare_for_tpu_(self, **kwargs):
+        """Set the ``tpu`` flag; extra keyword arguments are ignored."""
+        self.tpu = True
+
+    def reset_parameters(self):
+        """(Re-)initialize projection weights and optional k/v biases.
+
+        Compression weights are only initialized when this module owns them
+        (``layerwise_sharing`` is False).
+        """
+        if self.qkv_same_dim:
+            # Empirically observed the convergence to be much better with
+            # the scaled initialization
+            nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
+            nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
+            nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
+            if (
+                not self.layerwise_sharing
+            ):  # otherwise, we already initialize the parameters
+                nn.init.xavier_uniform_(self.compress_k.weight, gain=1 / math.sqrt(2))
+                if self.shared_kv_compressed == 0:
+                    nn.init.xavier_uniform_(
+                        self.compress_v.weight, gain=1 / math.sqrt(2)
+                    )
+        else:
+            nn.init.xavier_uniform_(self.k_proj.weight)
+            nn.init.xavier_uniform_(self.v_proj.weight)
+            nn.init.xavier_uniform_(self.q_proj.weight)
+            if (
+                not self.layerwise_sharing
+            ):  # otherwise, we already initialize the parameters
+                nn.init.xavier_uniform_(self.compress_k.weight)
+                if self.shared_kv_compressed == 0:
+                    nn.init.xavier_uniform_(self.compress_v.weight)
+
+        nn.init.xavier_uniform_(self.out_proj.weight)
+        if self.out_proj.bias is not None:
+            nn.init.constant_(self.out_proj.bias, 0.0)
+        if self.bias_k is not None:
+            nn.init.xavier_normal_(self.bias_k)
+        if self.bias_v is not None:
+            nn.init.xavier_normal_(self.bias_v)
+
+    def forward(
+        self,
+        query,
+        key: Optional[Tensor],
+        value: Optional[Tensor],
+        key_padding_mask: Optional[Tensor] = None,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        need_weights: bool = True,
+        static_kv: bool = False,
+        attn_mask: Optional[Tensor] = None,
+        before_softmax: bool = False,
+        need_head_weights: bool = False,
+    ) -> Tuple[Tensor, Optional[Tensor]]:
+        """Input shape: Time x Batch x Channel
+
+        Args:
+            key_padding_mask (ByteTensor, optional): mask to exclude
+                keys that are pads, of shape `(batch, src_len)`, where
+                padding elements are indicated by 1s.
+            need_weights (bool, optional): return the attention weights,
+                averaged over heads (default: True).
+            attn_mask (ByteTensor, optional): typically used to
+                implement causal attention, where the mask prevents the
+                attention from looking forward in time (default: None).
+            before_softmax (bool, optional): return the raw attention
+                weights and values before the attention softmax.
+            need_head_weights (bool, optional): return the attention
+                weights for each head. Implies *need_weights*. Default:
+                return the average attention weights over all heads.
+
+        Returns:
+            ``(attn, attn_weights)`` where ``attn`` has shape
+            ``(tgt_len, bsz, embed_dim)`` and ``attn_weights`` is None unless
+            weights were requested. With ``before_softmax=True`` the raw
+            scores and the projected values are returned instead.
+
+        NOTE(review): in this method ``key_padding_mask`` is only combined
+        with the cached mask and stored in the incremental state; it is not
+        added to the attention scores here.
+        """
+        if need_head_weights:
+            need_weights = True
+
+        tgt_len, bsz, embed_dim = query.size()
+        assert embed_dim == self.embed_dim
+        assert list(query.size()) == [tgt_len, bsz, embed_dim]
+
+        if incremental_state is not None:
+            saved_state = self._get_input_buffer(incremental_state)
+            if saved_state is not None and "prev_key" in saved_state:
+                # previous time steps are cached - no need to recompute
+                # key and value if they are static
+                if static_kv:
+                    assert self.encoder_decoder_attention and not self.self_attention
+                    key = value = None
+        else:
+            saved_state = None
+
+        if self.self_attention:
+            q = self.q_proj(query)
+
+            # Compress along the time axis: only the first tgt_len input
+            # columns of the projection weight are used, so inputs shorter
+            # than max_seq_len are supported.
+            k_input = query.permute(1, 2, 0).contiguous()  # B * C * T
+            k_input = (
+                F.linear(k_input, self.compress_k.weight[:, 0:tgt_len])
+                .permute(2, 0, 1)
+                .contiguous()
+            )
+            k = self.k_proj(k_input)
+
+            v_input = query.permute(1, 2, 0).contiguous()  # B * C * T
+            if self.shared_kv_compressed == 0:
+                v_input = (
+                    F.linear(v_input, self.compress_v.weight[:, 0:tgt_len])
+                    .permute(2, 0, 1)
+                    .contiguous()
+                )
+            if self.shared_kv_compressed == 1:  # use shared kv compressed linear layer
+                v_input = (
+                    F.linear(v_input, self.compress_k.weight[:, 0:tgt_len])
+                    .permute(2, 0, 1)
+                    .contiguous()
+                )
+            v = self.v_proj(v_input)
+        elif self.encoder_decoder_attention:
+            # encoder-decoder attention
+            q = self.q_proj(query)
+            if key is None:
+                assert value is None
+                k = v = None
+            else:
+                # both keys and values are projected from `key` in this mode
+                k = self.k_proj(key)
+                v = self.v_proj(key)
+
+        else:
+            assert key is not None and value is not None
+            q = self.q_proj(query)
+            k = self.k_proj(key)
+            v = self.v_proj(value)
+        q *= self.scaling
+
+        if self.bias_k is not None:
+            assert self.bias_v is not None
+            k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
+            v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
+            # masks grow by one column to cover the extra bias position
+            if attn_mask is not None:
+                attn_mask = torch.cat(
+                    [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
+                )
+            if key_padding_mask is not None:
+                key_padding_mask = torch.cat(
+                    [
+                        key_padding_mask,
+                        key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
+                    ],
+                    dim=1,
+                )
+
+        # split heads: (len, bsz, embed_dim) -> (bsz * num_heads, len, head_dim)
+        q = (
+            q.contiguous()
+            .view(tgt_len, bsz * self.num_heads, self.head_dim)
+            .transpose(0, 1)
+        )
+        if k is not None:
+            k = (
+                k.contiguous()
+                .view(-1, bsz * self.num_heads, self.head_dim)
+                .transpose(0, 1)
+            )
+        if v is not None:
+            v = (
+                v.contiguous()
+                .view(-1, bsz * self.num_heads, self.head_dim)
+                .transpose(0, 1)
+            )
+
+        if saved_state is not None:
+            # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
+            if "prev_key" in saved_state:
+                _prev_key = saved_state["prev_key"]
+                assert _prev_key is not None
+                prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)
+                if static_kv:
+                    k = prev_key
+                else:
+                    assert k is not None
+                    k = torch.cat([prev_key, k], dim=1)
+            if "prev_value" in saved_state:
+                _prev_value = saved_state["prev_value"]
+                assert _prev_value is not None
+                prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)
+                if static_kv:
+                    v = prev_value
+                else:
+                    assert v is not None
+                    v = torch.cat([prev_value, v], dim=1)
+            prev_key_padding_mask: Optional[Tensor] = None
+            if "prev_key_padding_mask" in saved_state:
+                prev_key_padding_mask = saved_state["prev_key_padding_mask"]
+            assert k is not None and v is not None
+            key_padding_mask = MultiheadLinearAttention._append_prev_key_padding_mask(
+                key_padding_mask=key_padding_mask,
+                prev_key_padding_mask=prev_key_padding_mask,
+                batch_size=bsz,
+                src_len=k.size(1),
+                static_kv=static_kv,
+            )
+
+            saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim)
+            saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim)
+            saved_state["prev_key_padding_mask"] = key_padding_mask
+            # In this branch incremental_state is never None
+            assert incremental_state is not None
+            incremental_state = self._set_input_buffer(incremental_state, saved_state)
+        assert k is not None
+        src_len = k.size(1)
+
+        if self.add_zero_attn:
+            assert v is not None
+            src_len += 1
+            k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
+            v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
+            if attn_mask is not None:
+                attn_mask = torch.cat(
+                    [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
+                )
+
+        attn_weights = torch.bmm(q, k.transpose(1, 2))
+        attn_weights = MultiheadLinearAttention.apply_sparse_mask(
+            attn_weights, tgt_len, src_len, bsz
+        )
+
+        assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]
+
+        if attn_mask is not None:
+            attn_mask = attn_mask.unsqueeze(0)
+            if self.onnx_trace:
+                attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1)
+            attn_weights += attn_mask
+
+        if before_softmax:
+            return attn_weights, v
+
+        attn_weights_float = utils.softmax(
+            attn_weights, dim=-1, onnx_trace=self.onnx_trace
+        )
+        attn_weights = attn_weights_float.type_as(attn_weights)
+        attn_probs = F.dropout(
+            attn_weights,
+            p=self.dropout,
+            training=self.training,
+        )
+        assert v is not None
+        attn = torch.bmm(attn_probs, v)
+        assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
+        if self.onnx_trace and attn.size(1) == 1:
+            # when ONNX tracing a single decoder step (sequence length == 1)
+            # the transpose is a no-op copy before view, thus unnecessary
+            attn = attn.contiguous().view(tgt_len, bsz, embed_dim)
+        else:
+            attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
+        attn = self.out_proj(attn)
+        attn_weights: Optional[Tensor] = None
+        if need_weights:
+            attn_weights = attn_weights_float.view(
+                bsz, self.num_heads, tgt_len, src_len
+            ).transpose(1, 0)
+            if not need_head_weights:
+                # average attention weights over heads
+                attn_weights = attn_weights.mean(dim=0)
+
+        return attn, attn_weights
+
+    @staticmethod
+    def _append_prev_key_padding_mask(
+        key_padding_mask: Optional[Tensor],
+        prev_key_padding_mask: Optional[Tensor],
+        batch_size: int,
+        src_len: int,
+        static_kv: bool,
+    ) -> Optional[Tensor]:
+        """Combine the cached key padding mask with the current one.
+
+        Returns the cached mask unchanged for static keys/values, the
+        concatenation (as float) when both masks exist, a zero-padded version
+        when only one exists, and None when neither exists.
+        """
+        # saved key padding masks have shape (bsz, seq_len)
+        if prev_key_padding_mask is not None and static_kv:
+            new_key_padding_mask = prev_key_padding_mask
+        elif prev_key_padding_mask is not None and key_padding_mask is not None:
+            new_key_padding_mask = torch.cat(
+                [prev_key_padding_mask.float(), key_padding_mask.float()], dim=1
+            )
+        # During incremental decoding, as the padding token enters and
+        # leaves the frame, there will be a time when prev or current
+        # is None
+        elif prev_key_padding_mask is not None:
+            filler = torch.zeros(
+                (batch_size, src_len - prev_key_padding_mask.size(1)),
+                device=prev_key_padding_mask.device,
+            )
+            new_key_padding_mask = torch.cat(
+                [prev_key_padding_mask.float(), filler.float()], dim=1
+            )
+        elif key_padding_mask is not None:
+            filler = torch.zeros(
+                (batch_size, src_len - key_padding_mask.size(1)),
+                device=key_padding_mask.device,
+            )
+            new_key_padding_mask = torch.cat(
+                [filler.float(), key_padding_mask.float()], dim=1
+            )
+        else:
+            # both masks are None here, so this evaluates to None
+            new_key_padding_mask = prev_key_padding_mask
+        return new_key_padding_mask
+
+    @torch.jit.export
+    def reorder_incremental_state(
+        self,
+        incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
+        new_order: Tensor,
+    ):
+        """Reorder buffered internal state (for incremental generation)."""
+        input_buffer = self._get_input_buffer(incremental_state)
+        if input_buffer is not None:
+            for k in input_buffer.keys():
+                input_buffer_k = input_buffer[k]
+                if input_buffer_k is not None:
+                    # NOTE(review): for encoder-decoder attention the loop
+                    # stops at the first buffer whose batch dimension already
+                    # equals len(new_order) -- confirm this is intended.
+                    if self.encoder_decoder_attention and input_buffer_k.size(
+                        0
+                    ) == new_order.size(0):
+                        break
+                    input_buffer[k] = input_buffer_k.index_select(0, new_order)
+            incremental_state = self._set_input_buffer(incremental_state, input_buffer)
+        return incremental_state
+
+    def _get_input_buffer(
+        self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
+    ) -> Dict[str, Optional[Tensor]]:
+        """Return the cached ``attn_state`` dict, or an empty dict if unset."""
+        result = self.get_incremental_state(incremental_state, "attn_state")
+        if result is not None:
+            return result
+        else:
+            empty_result: Dict[str, Optional[Tensor]] = {}
+            return empty_result
+
+    def _set_input_buffer(
+        self,
+        incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
+        buffer: Dict[str, Optional[Tensor]],
+    ):
+        """Store ``buffer`` under the ``attn_state`` key."""
+        return self.set_incremental_state(incremental_state, "attn_state", buffer)
+
+    @staticmethod
+    def apply_sparse_mask(attn_weights, tgt_len: int, src_len: int, bsz: int):
+        """Return ``attn_weights`` unchanged.
+
+        Declared static because it takes no ``self`` and is invoked through
+        the class in ``forward``; this also makes instance-level calls work.
+        """
+        return attn_weights
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """Split fused ``in_proj_weight`` / ``in_proj_bias`` entries.
+
+        Fused tensors are cut into three equal parts along dim 0 and stored
+        as ``q_proj``, ``k_proj`` and ``v_proj`` weight/bias; the fused keys
+        are removed. ``state_dict`` is modified in place.
+        """
+        prefix = name + "." if name != "" else ""
+        items_to_add = {}
+        keys_to_remove = []
+        for k in state_dict.keys():
+            if k.endswith(prefix + "in_proj_weight"):
+                # in_proj_weight used to be q + k + v with same dimensions
+                dim = int(state_dict[k].shape[0] / 3)
+                items_to_add[prefix + "q_proj.weight"] = state_dict[k][:dim]
+                items_to_add[prefix + "k_proj.weight"] = state_dict[k][dim : 2 * dim]
+                items_to_add[prefix + "v_proj.weight"] = state_dict[k][2 * dim :]
+
+                keys_to_remove.append(k)
+
+                k_bias = prefix + "in_proj_bias"
+                if k_bias in state_dict.keys():
+                    # the bias is split with the size derived from the weight
+                    dim = int(state_dict[k].shape[0] / 3)
+                    items_to_add[prefix + "q_proj.bias"] = state_dict[k_bias][:dim]
+                    items_to_add[prefix + "k_proj.bias"] = state_dict[k_bias][
+                        dim : 2 * dim
+                    ]
+                    items_to_add[prefix + "v_proj.bias"] = state_dict[k_bias][2 * dim :]
+
+                    keys_to_remove.append(prefix + "in_proj_bias")
+
+        # mutations happen after the scan so the dict is not resized while
+        # it is being iterated
+        for k in keys_to_remove:
+            del state_dict[k]
+
+        for key, value in items_to_add.items():
+            state_dict[key] = value
--- a/fairseq/examples/m2m_100/README.md
+++ b/fairseq/examples/m2m_100/README.md
+# Beyond English-Centric Multilingual Machine Translation
+
+## Introduction
+In this work, we create a true Many-to-Many multilingual translation model that can translate directly between any pair of 100 languages. Our focus on non-English-Centric models brings gains of more than 10 BLEU when directly translating between non-English directions while performing competitively with the best single systems of WMT. 
+
+If you are new to using fairseq, read the following walkthrough. Otherwise, skip to the sections below. 
+
+0. **Generation Data**
+
+To download the generation data, follow the commands below. Note that all datasets need to be detokenized *before* applying SPM in the data preprocessing step. If you use these evaluation datasets, please cite their associated papers.
+```bash
+# WMT - use sacrebleu, example here:
+sacrebleu -t wmt14 -l fr-en --echo src > wmt.test.fr-en.fr
+sacrebleu -t wmt14 -l fr-en --echo ref > wmt.test.fr-en.en
+
+# WAT
+wget http://lotus.kuee.kyoto-u.ac.jp/WAT/my-en-data/wat2019.my-en.zip
+unzip wat2019.my-en.zip
+
+# FLORES
+# download from: https://github.com/facebookresearch/flores
+
+# TED - need to detokenize with Moses!
+# from: https://github.com/neulab/word-embeddings-for-nmt
+wget http://phontron.com/data/ted_talks.tar.gz
+
+# Autshumato
+# request to download: https://repo.sadilar.org/handle/20.500.12185/397
+
+# Tatoeba Challenge
+# available here: https://github.com/Helsinki-NLP/Tatoeba-Challenge
+```
+
+1. **Training Data**
+
+To produce the training data, we use a combination of [CCMatrix](https://arxiv.org/abs/1911.04944) and [CCAligned](https://arxiv.org/abs/1911.06154). Check out the instructions [here](https://github.com/facebookresearch/LASER/tree/master/tasks/CCMatrix) to download the raw data.
+
+2. **Preprocess Data**
+
+After downloading raw data, you will need to postprocess the data, then apply SPM, then binarize. Note that it is very important you run the postprocessing script, because this removes any instance of the evaluation data in the mined training data.
+
+```bash
+# preprocess data
+
+# remove sentences with more than 50% punctuation
+python /path/to/fairseq/examples/m2m_100/process_data/remove_too_much_punc.py 
+
+# deduplicate training data
+paste /path/to/datadir/train.$src /path/to/datadir/train.$tgt | awk '!x[$0]++' > /path/to/datadir/train.dedup
+echo "keeping $(wc -l /path/to/datadir/train.dedup) bitext out of $(wc -l /path/to/datadir/train.$src)"
+cut -f1 /path/to/datadir/train.dedup > /path/to/datadir/train.$src
+cut -f2 /path/to/datadir/train.dedup > /path/to/datadir/train.$tgt
+
+# remove all instances of evaluation data from the training data
+python /path/to/fairseq/examples/m2m_100/process_data/dedup_data.py 
+
+# frequency cleaning
+wget https://dl.fbaipublicfiles.com/m2m_100/histograms.tar.gz 
+tar -xvzf histograms.tar.gz
+python /path/to/fairseq/examples/m2m_100/process_data/clean_histogram.py --src $src --tgt $tgt --src-file /path/to/source/file --tgt-file /path/to/output/file --src-output-file source_output.$src --tgt-output-file target_output.$tgt --histograms /path/to/histograms
+
+# apply SPM
+wget https://dl.fbaipublicfiles.com/m2m_100/spm.128k.model
+python /path/to/fairseq/scripts/spm_encode.py \
+    --model spm.128k.model \
+    --output_format=piece \
+    --inputs=/path/to/input/file/here \
+    --outputs=/path/to/output/file/here
+
+# length ratio cleaning
+perl mosesdecoder/scripts/training/clean-corpus-n.perl --ratio 3 /path/to/training/data/train.spm.$src-$tgt $src $tgt /path/to/output/directory/train.spm.$src-$tgt 1 250
+
+# binarize data
+wget https://dl.fbaipublicfiles.com/m2m_100/data_dict.128k.txt
+fairseq-preprocess \
+    --source-lang $src --target-lang $tgt \
+    --testpref spm.$src.$tgt \
+    --thresholdsrc 0 --thresholdtgt 0 \
+    --destdir data_bin \
+    --srcdict data_dict.128k.txt --tgtdict data_dict.128k.txt
+```
+
+3. **Training Scripts**
+
+To reproduce the training of our models, we train with fairseq-py's multilingual translation [task](https://github.com/pytorch/fairseq/tree/master/examples/multilingual). If you are interested in model parallel training, also check out [fairscale](https://github.com/facebookresearch/fairscale).
+
+4. **Generation**
+
+To generate from our models, follow the commands in the generation section below.
+
+
+If you use any of the resources listed here, please cite:
+```bibtex
+@article{fan2020beyond,
+  title={Beyond English-Centric Multilingual Machine Translation},
+  author={Fan, Angela and Bhosale, Shruti and Schwenk, Holger and Ma, Zhiyi and El-Kishky, Ahmed and Goyal, Siddharth and Baines, Mandeep and Celebi, Onur and Wenzek, Guillaume and Chaudhary, Vishrav and Goyal, Naman and Birch, Tom and Liptchinsky, Vitaliy and Edunov, Sergey and Grave, Edouard and Auli, Michael and Joulin, Armand},
+  journal={arXiv preprint},
+  year={2020}
+}
+
+@article{schwenk2019ccmatrix,
+  title={Ccmatrix: Mining billions of high-quality parallel sentences on the web},
+  author={Schwenk, Holger and Wenzek, Guillaume and Edunov, Sergey and Grave, Edouard and Joulin, Armand},
+  journal={arXiv preprint arXiv:1911.04944},
+  year={2019}
+}
+
+@article{el2019massive,
+  title={A Massive Collection of Cross-Lingual Web-Document Pairs},
+  author={El-Kishky, Ahmed and Chaudhary, Vishrav and Guzman, Francisco and Koehn, Philipp},
+  journal={arXiv preprint arXiv:1911.06154},
+  year={2019}
+}
+```
+
+
+## Trained Models
+
+Looking for other trained models? Check back soon. 
+
+Model | Description | Download
+---|---|---
+`12b_last_checkpoint` | 12B parameter model trained on many-to-many training data for 100 languages | [12b_last_checkpoint](https://dl.fbaipublicfiles.com/m2m_100/12b_last_checkpoint.pt)
+
+
+## SentencePiece Model
+
+```bash
+wget https://dl.fbaipublicfiles.com/m2m_100/spm.128k.model
+```
+
+## Generation with M2M-100
+
+### Encode using our SentencePiece Model
+
+Note: Install SentencePiece from [here](https://github.com/google/sentencepiece)
+
+```bash
+fairseq=/path/to/fairseq
+cd $fairseq
+sacrebleu --echo src -l de-fr -t wmt19 | head -n 20 > raw_input.de-fr.de
+sacrebleu --echo ref -l de-fr -t wmt19 | head -n 20 > raw_input.de-fr.fr
+wget https://dl.fbaipublicfiles.com/m2m_100/spm.128k.model
+for lang in de fr ; do
+    python scripts/spm_encode.py \
+        --model spm.128k.model \
+        --output_format=piece \
+        --inputs=raw_input.de-fr.${lang} \
+        --outputs=spm.de-fr.${lang}
+done
+```
+
+### Binarization
+
+```bash
+wget https://dl.fbaipublicfiles.com/m2m_100/data_dict.128k.txt
+fairseq-preprocess \
+    --source-lang de --target-lang fr \
+    --testpref spm.de-fr \
+    --thresholdsrc 0 --thresholdtgt 0 \
+    --destdir data_bin \
+    --srcdict data_dict.128k.txt --tgtdict data_dict.128k.txt
+```
+
+### Generation on a V100 GPU
+
+```bash
+wget https://dl.fbaipublicfiles.com/m2m_100/model_dict.128k.txt
+wget https://dl.fbaipublicfiles.com/m2m_100/language_pairs.txt
+wget https://dl.fbaipublicfiles.com/m2m_100/12b_last_checkpoint.pt
+fairseq-generate \
+    data_bin \
+    --batch-size 1 \
+    --path 12b_last_checkpoint.pt \
+    --fixed-dictionary model_dict.128k.txt \
+    -s de -t fr \
+    --remove-bpe 'sentencepiece' \
+    --beam 5 \
+    --task translation_multi_simple_epoch \
+    --lang-pairs language_pairs.txt \
+    --decoder-langtok --encoder-langtok src \
+    --gen-subset test \
+    --fp16 \
+    --dataset-impl mmap \
+    --distributed-world-size 1 --distributed-no-spawn \
+    --pipeline-model-parallel \
+    --pipeline-chunks 1 \
+    --pipeline-encoder-balance '[26]' \
+    --pipeline-encoder-devices '[0]' \
+    --pipeline-decoder-balance '[1,24,1]' \
+    --pipeline-decoder-devices '[0,1,0]' > gen_out
+```
+## Evaluation with M2M-100
+
+### Tokenization
+
+Note: Refer to tokenizers/README.md for more details on tokenization.
+
+```bash
+cd ${fairseq}/examples/m2m_100
+cat ${fairseq}/gen_out | grep -P "^H" | sort -V | cut -f 3- | sh tok.sh fr > hyp
+cat ${fairseq}/raw_input.de-fr.fr | sh tok.sh fr > ref
+```
+
+### BLEU
+
+```bash
+sacrebleu -tok 'none' ref < hyp
+```
--- a/fairseq/examples/m2m_100/install_dependecies.sh
+++ b/fairseq/examples/m2m_100/install_dependecies.sh
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+CWD=`pwd`
+INSTALL_PATH=$CWD/tokenizers/thirdparty
+
+MOSES=$INSTALL_PATH/mosesdecoder
+if [ ! -d $MOSES ]; then
+    echo 'Cloning Moses github repository (for tokenization scripts)...'
+    git clone https://github.com/moses-smt/mosesdecoder.git $MOSES
+    cd $MOSES
+    # To deal with differences in handling ' vs "
+    git checkout 03578921cc1a03402
+    cd -
+fi
+
+WMT16_SCRIPTS=$INSTALL_PATH/wmt16-scripts
+if [ ! -d $WMT16_SCRIPTS ]; then
+    echo 'Cloning Romanian tokenization scripts'
+    git clone https://github.com/rsennrich/wmt16-scripts.git $WMT16_SCRIPTS
+fi
+
+KYTEA=$INSTALL_PATH/kytea
+if [ ! -f $KYTEA/bin/kytea ]; then
+    git clone https://github.com/neubig/kytea.git $KYTEA
+    cd $KYTEA
+    autoreconf -i
+    ./configure --prefix=`pwd`
+    make
+    make install
+    cd ..
+fi
+
+export MECAB=$INSTALL_PATH/mecab-0.996-ko-0.9.2
+if [ ! -f $MECAB/bin/mecab ]; then
+    cd $INSTALL_PATH
+    curl -LO https://bitbucket.org/eunjeon/mecab-ko/downloads/mecab-0.996-ko-0.9.2.tar.gz
+    tar zxfv mecab-0.996-ko-0.9.2.tar.gz
+    cd mecab-0.996-ko-0.9.2/
+    ./configure --prefix=`pwd`
+    make
+    make install
+
+    cd ..
+    curl -LO https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.1.1-20180720.tar.gz
+    tar zxfv mecab-ko-dic-2.1.1-20180720.tar.gz
+    cd mecab-ko-dic-2.1.1-20180720/
+    ./autogen.sh
+    ./configure --prefix=`pwd` --with-dicdir=$MECAB/lib/mecab/dic/mecab-ko-dic --with-mecab-config=$MECAB/bin/mecab-config
+    make
+    sh -c 'echo "dicdir=$MECAB/lib/mecab/dic/mecab-ko-dic" > $MECAB/etc/mecabrc'
+    make install
+    cd $CWD
+fi
+
+INDIC_RESOURCES_PATH=$INSTALL_PATH/indic_nlp_resources
+if [ ! -d $INDIC_RESOURCES_PATH ]; then
+    echo 'Cloning indic_nlp_resources'
+    git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git $INDIC_RESOURCES_PATH
+fi
+
+
+if [ ! -f $INSTALL_PATH/seg_my.py ]; then
+    cd $INSTALL_PATH
+    wget http://lotus.kuee.kyoto-u.ac.jp/WAT/my-en-data/wat2020.my-en.zip
+    unzip wat2020.my-en.zip
+    # switch to python3
+    cat wat2020.my-en/myseg.py  |sed 's/^sys.std/###sys.std/g' | sed 's/### sys/sys/g' | sed 's/unichr/chr/g' > seg_my.py
+    cd $CWD
+fi
+
+
+pip install pythainlp sacrebleu indic-nlp-library
+
--- a/fairseq/examples/m2m_100/process_data/clean_histogram.py
+++ b/fairseq/examples/m2m_100/process_data/clean_histogram.py
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--src', type=str, help='Source language')
+parser.add_argument('--tgt', type=str, help='Target language')
+parser.add_argument('--src-file', type=str, help='Input source file')
+parser.add_argument('--tgt-file', type=str, help='Input target file')
+parser.add_argument('--src-output-file', type=str, help='Output source file')
+parser.add_argument('--tgt-output-file', type=str, help='Output target file')
+parser.add_argument('--threshold', type=float, default=0.5, help='Threshold')
+parser.add_argument('--threshold-character', type=str, default=']', help='Threshold character')
+parser.add_argument('--histograms', type=str, help='Path to histograms')
+
+args = parser.parse_args()
+
+
+def read_hist(f):
+    ch = []
+    for line in f:
+        c = line[0]
+        if c == args.threshold_character:
+            break
+        ch.append(c)
+    return ch
+
+
+with(open("{}/{}".format(args.histograms, args.src), 'r', encoding='utf8')) as f:
+    ch1 = read_hist(f)
+
+with(open("{}/{}".format(args.histograms, args.tgt), 'r', encoding='utf8')) as f:
+    ch2 = read_hist(f)
+
+print("Accepted characters for {}: {}".format(args.src, ch1))
+print("Accepted characters for {}: {}".format(args.tgt, ch2))
+
+with open(args.src_file, 'r', encoding='utf8') as fs1, open(args.tgt_file, 'r', encoding='utf8') as fs2, open(args.src_output_file, 'w', encoding='utf8') as fos1, open(args.tgt_output_file, 'w', encoding='utf8') as fos2:
+    ls1 = fs1.readline()
+    ls2 = fs2.readline()
+
+    while ls1 or ls2:
+        cnt1 = len([c for c in ls1.strip() if c in ch1])
+        cnt2 = len([c for c in ls2.strip() if c in ch2])
+
+        if cnt1 / len(ls1) > args.threshold and cnt2 / len(ls2) > args.threshold:
+            fos1.write(ls1)
+            fos2.write(ls2)
+        else:
+            print("{} {} {} \n{} {} {}".format(args.src, cnt1 / len(ls1), ls1.strip(), args.tgt, cnt2 / len(ls2), ls2.strip()))
+
+        ls1 = fs1.readline()
+        ls2 = fs2.readline()
+        
\ No newline at end of file
--- a/fairseq/examples/m2m_100/process_data/dedup_data.py
+++ b/fairseq/examples/m2m_100/process_data/dedup_data.py
+import argparse
+from collections import namedtuple
+import os
+
+DATADIR = "/path/to/train_data"
+DEDUP_FROM_DIR = "/path/to/eval/data"
+OUTPUT_DIR = "/path/to/output/data"
+
+
+def main(args):
+    languages = set()
+    for language_directory in os.listdir(DATADIR):
+        if "_" in language_directory:
+            src, tgt = language_directory.split("_")
+            languages.add(LanguagePair(src=src, tgt=tgt))
+
+    data = existing_data()
+    train_languages = sorted(languages)
+    for language_pair in train_languages[args.start_index:args.start_index + args.size]:
+        print(language_pair)
+        dedup(language_pair, data)
+
+
+LanguagePair = namedtuple("LanguagePair", ["src", "tgt"])
+
+
+def existing_data():
+    data = set()
+    for file in os.listdir(DEDUP_FROM_DIR):
+        with open(os.path.join(DEDUP_FROM_DIR, file)) as f:
+            data |= set(f.readlines())
+    return data
+ 
+def dedup(language_pair, data, verbose=True, output=True):
+    train_filenames = LanguagePair(
+            src=f"{DATADIR}/{language_pair.src}_{language_pair.tgt}/train.{language_pair.src}",
+            tgt=f"{DATADIR}/{language_pair.src}_{language_pair.tgt}/train.{language_pair.tgt}",
+        )
+
+    output_filenames = LanguagePair(
+        src=f"{OUTPUT_DIR}/train.dedup.{language_pair.src}-{language_pair.tgt}.{language_pair.src}",
+        tgt=f"{OUTPUT_DIR}/train.dedup.{language_pair.src}-{language_pair.tgt}.{language_pair.tgt}"
+    )
+
+    # If output exists, skip this pair. It has already been done.
+    if (os.path.exists(output_filenames.src) and
+        os.path.exists(output_filenames.tgt)):
+        if verbose:
+            print(f"{language_pair.src}-{language_pair.tgt} already done.")
+        return
+
+    if verbose:
+        print(f"{language_pair.src}-{language_pair.tgt} ready, will check dups.")
+
+    # If there is no output, no need to actually do the loop.
+    if not output:
+        return
+
+    if os.path.exists(train_filenames.src) and os.path.exists(train_filenames.tgt):
+        with open(train_filenames.src) as f:
+            train_source = f.readlines()
+
+        with open(train_filenames.tgt) as f:
+            train_target = f.readlines()
+
+        # do dedup
+        new_train_source = []
+        new_train_target = []
+        for i, train_line in enumerate(train_source):
+            if train_line not in data and train_target[i] not in data:
+                new_train_source.append(train_line)
+                new_train_target.append(train_target[i])
+
+        assert len(train_source) == len(train_target)
+        assert len(new_train_source) == len(new_train_target)
+        assert len(new_train_source) <= len(train_source)
+
+        with open(output_filenames.src, "w") as o:
+            for line in new_train_source:
+                o.write(line)
+
+        with open(output_filenames.tgt, "w") as o:
+            for line in new_train_target:
+                o.write(line)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-s", "--start-index", required=True, type=int)
+    parser.add_argument("-n", "--size", required=True, type=int)
+    main(parser.parse_args())
--- a/fairseq/examples/m2m_100/process_data/remove_too_much_punc.py
+++ b/fairseq/examples/m2m_100/process_data/remove_too_much_punc.py
+import gzip
+import argparse
+from string import punctuation
+
+def len_no_punc(s, punc):
+    return len([ch for ch in s if ch in punc])
+
+def filter_overpunc(len_npunc, len_sen):
+    return len_npunc < 0.5*len_sen
+
+def main(args):
+    punc = punctuation + "—|–"
+    print('Processing file {}'.format(args.input))
+    with gzip.open(args.input, 'rt', encoding=args.encoding) as tsv:
+        with open(args.bitext + '.' + args.src_lang, 'wt', encoding=args.encoding) as fsrc:
+            with open(args.bitext + '.' + args.tgt_lang, 'wt', encoding=args.encoding) as ftgt:
+                line = tsv.readline()
+                fields = line.split('\t')
+
+                src, tgt = fields[1], fields[2]
+
+                nchar_npunc_src = len_no_punc(src, punc)
+                nchar_npunc_tgt = len_no_punc(tgt, punc)
+
+                if filter_overpunc(nchar_npunc_src, len(src)) and filter_overpunc(nchar_npunc_tgt, len(tgt)):
+                    fsrc.write(src.strip() + '\n')
+                    ftgt.write(tgt.strip() + '\n')
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input", required=True, type=str)
+    parser.add_argument('--encoding', default='utf-8', help='character encoding for input/output')
+    parser.add_argument('--bitext', type=str, required=True, help='language direction')
+    parser.add_argument('--src-lang', type=str, required=True, help='Source language')
+    parser.add_argument('--tgt-lang', type=str, required=True, help='Target language')
+    main(parser.parse_args())
--- a/fairseq/examples/m2m_100/tok.sh
+++ b/fairseq/examples/m2m_100/tok.sh
+#!/usr/bin/env bash
+# Copyright (c) 2019-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+set -e
+
+TOKENIZERS_SCRIPTS=tokenizers
+INSTALL_PATH=$TOKENIZERS_SCRIPTS/thirdparty
+
+N_THREADS=8
+
+lg=$1
+
+MOSES=$INSTALL_PATH/mosesdecoder
+REPLACE_UNICODE_PUNCT=$MOSES/scripts/tokenizer/replace-unicode-punctuation.perl
+NORM_PUNC=$MOSES/scripts/tokenizer/normalize-punctuation.perl
+REM_NON_PRINT_CHAR=$MOSES/scripts/tokenizer/remove-non-printing-char.perl
+TOKENIZER=$MOSES/scripts/tokenizer/tokenizer.perl
+
+# special tokenization for Romanian
+WMT16_SCRIPTS=$INSTALL_PATH/wmt16-scripts
+
+NORMALIZE_ROMANIAN=$WMT16_SCRIPTS/preprocess/normalise-romanian.py
+REMOVE_DIACRITICS=$WMT16_SCRIPTS/preprocess/remove-diacritics.py
+
+# Burmese
+MY_SEGMENT=$INSTALL_PATH/seg_my.py
+
+# Arabic
+AR_TOKENIZER=$TOKENIZERS_SCRIPTS/tokenizer_ar.sh
+
+# Korean
+KO_SEGMENT=$TOKENIZERS_SCRIPTS/seg_ko.sh
+
+# Japanese
+JA_SEGMENT=$TOKENIZERS_SCRIPTS/seg_ja.sh
+
+# Indic
+IN_TOKENIZER=$TOKENIZERS_SCRIPTS/tokenize_indic.py
+INDIC_RESOURCES_PATH=$INSTALL_PATH/indic_nlp_resources
+
+# Thai
+THAI_TOKENIZER=$TOKENIZERS_SCRIPTS/tokenize_thai.py
+
+# Chinese
+CHINESE_TOKENIZER=$TOKENIZERS_SCRIPTS/tokenize_zh.py
+
+# Chinese
+if [ "$lg" = "zh" ]; then
+  cat - | $REPLACE_UNICODE_PUNCT | $NORM_PUNC -l $lg | $REM_NON_PRINT_CHAR | python $CHINESE_TOKENIZER
+# Thai
+elif [ "$lg" = "th" ]; then
+  cat - | python $THAI_TOKENIZER
+# Japanese
+elif [ "$lg" = "ja" ]; then
+  cat - | $REPLACE_UNICODE_PUNCT | $NORM_PUNC -l $lg | $REM_NON_PRINT_CHAR | ${JA_SEGMENT}
+# Korean
+elif [ "$lg" = "ko" ]; then
+  cat - | $REM_NON_PRINT_CHAR | ${KO_SEGMENT}
+# Romanian
+elif [ "$lg" = "ro" ]; then
+  cat - | $REPLACE_UNICODE_PUNCT | $NORM_PUNC -l $lg | $REM_NON_PRINT_CHAR | $NORMALIZE_ROMANIAN | $REMOVE_DIACRITICS | $TOKENIZER -no-escape -threads $N_THREADS -l $lg
+# Burmese
+elif [ "$lg" = "my" ]; then
+  cat - | python ${MY_SEGMENT}
+# Arabic
+elif [ "$lg" = "ar" ]; then
+  cat - | ${AR_TOKENIZER}
+# Indic
+elif [ "$lg" = "ne" ]; then
+  cat - | python ${IN_TOKENIZER} $lg
+elif [ "$lg" = "si" ]; then
+  cat - | python ${IN_TOKENIZER} $lg
+elif [ "$lg" = "hi" ]; then
+  cat - | python ${IN_TOKENIZER} $lg
+# other languages
+else
+  cat - | $REPLACE_UNICODE_PUNCT | $NORM_PUNC -l $lg | $REM_NON_PRINT_CHAR | $TOKENIZER -no-escape -threads $N_THREADS -l $lg
+fi
--- a/fairseq/examples/m2m_100/tokenizers/README.md
+++ b/fairseq/examples/m2m_100/tokenizers/README.md
+# M2M-100 Tokenization
+
+We apply different tokenization strategies for different languages following the existing literature. Here we provide `tok.sh`, a tokenizer script that can be used to reproduce our results.
+
+To reproduce the results, follow these steps:
+
+```
+tgt_lang=...
+reference_translation=...
+cat generation_output | grep -P "^H" | sort -V | cut -f 3- | sh tok.sh $tgt_lang > hyp
+cat $reference_translation |sh tok.sh $tgt_lang > ref
+sacrebleu -tok 'none' ref < hyp
+```
+
+## Installation
+
+Tools needed for all the languages except Arabic can be installed by running `install_dependecies.sh` (note the spelling of the file name).
+If you want to evaluate Arabic models, please follow the instructions provided here: http://alt.qcri.org/tools/arabic-normalizer/ to install the required tools.
--- a/fairseq/examples/m2m_100/tokenizers/seg_ja.sh
+++ b/fairseq/examples/m2m_100/tokenizers/seg_ja.sh
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+# Locate the KyTea install relative to this script's own directory, so the
+# script works regardless of the caller's working directory.
+SCRIPT=$(realpath "$0")
+KYTEA="$(dirname "$SCRIPT")/thirdparty/kytea"
+# Append to LD_LIBRARY_PATH without creating an empty leading entry when the
+# variable is unset or empty.
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}$KYTEA/lib:/usr/local/lib"
+export PATH="$PATH:$KYTEA/bin"
+
+# Delete all blank characters from stdin, then pass the text to kytea.
+cat - | tr -d "[:blank:]" | kytea -notags
--- a/fairseq/examples/m2m_100/tokenizers/seg_ko.sh
+++ b/fairseq/examples/m2m_100/tokenizers/seg_ko.sh
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+# Locate the mecab-ko install relative to this script's own directory, so the
+# script works regardless of the caller's working directory.
+SCRIPT=$(realpath "$0")
+MECAB="$(dirname "$SCRIPT")/thirdparty/mecab-0.996-ko-0.9.2"
+
+export PATH="$PATH:$MECAB/bin:$MECAB/lib"
+# Append to LD_LIBRARY_PATH without creating an empty leading entry when the
+# variable is unset or empty.
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}$MECAB/lib"
+
+# Pass stdin to mecab with the "wakati" output format.
+cat - | mecab -O wakati
--- a/fairseq/examples/m2m_100/tokenizers/thirdparty/.gitignore
+++ b/fairseq/examples/m2m_100/tokenizers/thirdparty/.gitignore
+seg_my.py
+indic_nlp_library/
+indic_nlp_resources/
+kytea/
+mecab-0.996-ko-0.9.2.tar.gz
+mecab-0.996-ko-0.9.2/
+mosesdecoder/
+wat2020.my-en.zip
+wat2020.my-en/
+wmt16-scripts/
+mecab-ko-dic-2.1.1-20180720/
+mecab-ko-dic-2.1.1-20180720.tar.gz
\ No newline at end of file
--- a/fairseq/examples/m2m_100/tokenizers/tokenize_indic.py
+++ b/fairseq/examples/m2m_100/tokenizers/tokenize_indic.py
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Use: echo {text} | python tokenize_indic.py {language}
+
+import sys
+
+from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
+from indicnlp.tokenize.indic_tokenize import trivial_tokenize
+
+
+factory = IndicNormalizerFactory()
+normalizer = factory.get_normalizer(
+    sys.argv[1], remove_nuktas=False, nasals_mode="do_nothing"
+)
+
+for line in sys.stdin:
+    normalized_line = normalizer.normalize(line.strip())
+    tokenized_line = " ".join(trivial_tokenize(normalized_line, sys.argv[1]))
+    print(tokenized_line)
--- a/fairseq/examples/m2m_100/tokenizers/tokenize_thai.py
+++ b/fairseq/examples/m2m_100/tokenizers/tokenize_thai.py
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import sys
+
+from pythainlp import word_tokenize
+
+
+for line in sys.stdin:
+    print(" ".join(word_tokenize(line.strip())))
--- a/fairseq/examples/m2m_100/tokenizers/tokenize_zh.py
+++ b/fairseq/examples/m2m_100/tokenizers/tokenize_zh.py
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import fileinput
+
+import sacrebleu
+
+
+for line in fileinput.input():
+    print(sacrebleu.tokenize_zh(line))
--- a/fairseq/examples/m2m_100/tokenizers/tokenizer_ar.sh
+++ b/fairseq/examples/m2m_100/tokenizers/tokenizer_ar.sh
+#!/usr/bin/env sh
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Please follow the instructions here http://alt.qcri.org/tools/arabic-normalizer/
+# to install tools needed for Arabic
+
+echo "Please install Arabic tools: http://alt.qcri.org/tools/arabic-normalizer/"
+echo "Then update environment variables in tokenizer_ar.sh"
+exit 1
+
+SVMTOOL=...
+GOMOSESGO=...
+QCRI_ARABIC_NORMALIZER=...
+
+export PERL5LIB="$SVMTOOL/lib":"$GOMOSESGO/bin/MADA-3.2":$PERL5LIB
+
+
+tempfile=$(mktemp)
+cat - > $tempfile
+
+cd $QCRI_ARABIC_NORMALIZER
+
+bash qcri_normalizer_mada3.2_aramorph1.2.1.sh $tempfile
+cat $tempfile.mada_norm-aramorph.europarl_tok