add fairseq0.10.2

7df61696 · Sugon_ldc · 7df61696 · 7df61696 · 7df61696 · 7df61696
Commit 7df61696 authored Jul 28, 2023 by Sugon_ldc
20 changed files
--- a/fairseq/data/encoders/hf_byte_bpe.py
+++ b/fairseq/data/encoders/hf_byte_bpe.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.data.encoders import register_bpe
+
+
+@register_bpe("hf_byte_bpe")
+class HuggingFaceByteLevelBPE(object):
+    """Byte-level BPE wrapper around the HuggingFace ``tokenizers`` package.
+
+    ``encode`` turns text into a space-separated string of token ids and
+    ``decode`` maps such a string back to text.
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Register this BPE's command-line options on *parser*."""
+        # fmt: off
+        parser.add_argument('--bpe-merges', help='path to merges.txt')
+        parser.add_argument('--bpe-vocab', help='path to vocab.json')
+        parser.add_argument('--bpe-add-prefix-space', action='store_true',
+                            help='add prefix space before encoding')
+        # fmt: on
+
+    def __init__(self, args):
+        """Build the tokenizer from ``args.bpe_vocab`` and ``args.bpe_merges``.
+
+        Raises:
+            ImportError: if the ``tokenizers`` package cannot be imported.
+        """
+        try:
+            from tokenizers import ByteLevelBPETokenizer
+        except ImportError:
+            raise ImportError(
+                "Please install huggingface/tokenizers with: " "pip install tokenizers"
+            )
+
+        vocab_path = args.bpe_vocab
+        merges_path = args.bpe_merges
+        # The prefix-space option may be absent from ``args``; default to False.
+        prefix_space = getattr(args, "bpe_add_prefix_space", False)
+        self.bpe = ByteLevelBPETokenizer(
+            vocab_path, merges_path, add_prefix_space=prefix_space
+        )
+
+    def encode(self, x: str) -> str:
+        """Return the token ids of *x* joined by single spaces."""
+        token_ids = self.bpe.encode(x).ids
+        return " ".join(str(token_id) for token_id in token_ids)
+
+    def decode(self, x: str) -> str:
+        """Decode a whitespace-separated string of token ids.
+
+        The literal tokens ``<unk>`` and ``<mask>`` are handed to the
+        tokenizer unchanged; every other token is converted with ``int``.
+        """
+        passthrough = {"<unk>", "<mask>"}
+        pieces = []
+        for tok in x.split():
+            pieces.append(tok if tok in passthrough else int(tok))
+        return self.bpe.decode(pieces)
+
+    def is_beginning_of_word(self, x: str) -> bool:
+        """Return True when the decoded form of *x* starts with a space."""
+        decoded = self.decode(x)
+        return decoded.startswith(" ")
--- a/fairseq/data/encoders/moses_tokenizer.py
+++ b/fairseq/data/encoders/moses_tokenizer.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.data.encoders import register_tokenizer
+
+
+@register_tokenizer("moses")
+class MosesTokenizer(object):
+    """Tokenizer/detokenizer backed by the ``sacremoses`` package."""
+
+    @staticmethod
+    def add_args(parser):
+        """Register the Moses-specific command-line options on *parser*."""
+        # fmt: off
+        parser.add_argument('--moses-source-lang', metavar='SRC',
+                            help='source language')
+        parser.add_argument('--moses-target-lang', metavar='TARGET',
+                            help='target language')
+        parser.add_argument('--moses-no-dash-splits', action='store_true', default=False,
+                            help='don\'t apply dash split rules')
+        parser.add_argument('--moses-no-escape', action='store_true', default=False,
+                            help='don\'t perform HTML escaping on apostrophy, quotes, etc.')
+        # fmt: on
+
+    def __init__(self, args):
+        """Store *args* and create the sacremoses tokenizer and detokenizer.
+
+        When ``moses_source_lang`` / ``moses_target_lang`` are missing or
+        ``None`` they are filled in on *args* from ``source_lang`` /
+        ``target_lang``, falling back to ``"en"``.
+
+        Raises:
+            ImportError: if ``sacremoses`` cannot be imported.
+        """
+        self.args = args
+
+        for side in ("source", "target"):
+            moses_attr = "moses_" + side + "_lang"
+            if getattr(args, moses_attr, None) is None:
+                setattr(args, moses_attr, getattr(args, side + "_lang", "en"))
+
+        try:
+            from sacremoses import MosesTokenizer, MosesDetokenizer
+
+            self.tok = MosesTokenizer(args.moses_source_lang)
+            self.detok = MosesDetokenizer(args.moses_target_lang)
+        except ImportError:
+            raise ImportError(
+                "Please install Moses tokenizer with: pip install sacremoses"
+            )
+
+    def encode(self, x: str) -> str:
+        """Tokenize *x* and return the tokens as a single string."""
+        split_dashes = not self.args.moses_no_dash_splits
+        do_escape = not self.args.moses_no_escape
+        return self.tok.tokenize(
+            x,
+            aggressive_dash_splits=split_dashes,
+            return_str=True,
+            escape=do_escape,
+        )
+
+    def decode(self, x: str) -> str:
+        """Detokenize the whitespace-separated tokens in *x*."""
+        tokens = x.split()
+        return self.detok.detokenize(tokens)
--- a/fairseq/data/encoders/nltk_tokenizer.py
+++ b/fairseq/data/encoders/nltk_tokenizer.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.data.encoders import register_tokenizer
+
+
+@register_tokenizer("nltk")
+class NLTKTokenizer(object):
+    """Tokenizer that delegates to ``nltk.tokenize.word_tokenize``."""
+
+    def __init__(self, source_lang=None, target_lang=None):
+        """Bind ``word_tokenize``; both language arguments are unused here.
+
+        Raises:
+            ImportError: if ``nltk`` cannot be imported.
+        """
+        try:
+            from nltk.tokenize import word_tokenize
+        except ImportError:
+            raise ImportError("Please install nltk with: pip install nltk")
+        else:
+            self.word_tokenize = word_tokenize
+
+    def encode(self, x: str) -> str:
+        """Return the tokens of *x* joined by single spaces."""
+        words = self.word_tokenize(x)
+        return " ".join(words)
+
+    def decode(self, x: str) -> str:
+        """Return *x* unchanged."""
+        return x
--- a/fairseq/data/encoders/sentencepiece_bpe.py
+++ b/fairseq/data/encoders/sentencepiece_bpe.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq import file_utils
+from fairseq.data.encoders import register_bpe
+
+
+@register_bpe("sentencepiece")
+class SentencepieceBPE(object):
+    """BPE that encodes text into sentencepiece pieces."""
+
+    @staticmethod
+    def add_args(parser):
+        """Register the ``--sentencepiece-model`` option on *parser*."""
+        # fmt: off
+        parser.add_argument('--sentencepiece-model', type=str,
+                            help='path to sentencepiece model')
+        # fmt: on
+
+    def __init__(self, args):
+        """Resolve ``args.sentencepiece_model`` and load it into a processor.
+
+        Raises:
+            ImportError: if ``sentencepiece`` cannot be imported.
+        """
+        model_path = file_utils.cached_path(args.sentencepiece_model)
+        try:
+            import sentencepiece as spm
+
+            processor = spm.SentencePieceProcessor()
+            self.sp = processor
+            self.sp.Load(model_path)
+        except ImportError:
+            raise ImportError(
+                "Please install sentencepiece with: pip install sentencepiece"
+            )
+
+    def encode(self, x: str) -> str:
+        """Return the sentencepiece pieces of *x* joined by single spaces."""
+        pieces = self.sp.EncodeAsPieces(x)
+        return " ".join(pieces)
+
+    def decode(self, x: str) -> str:
+        """Drop spaces, turn ``\\u2581`` markers into spaces, and strip."""
+        glued = x.replace(" ", "")
+        spaced = glued.replace("\u2581", " ")
+        return spaced.strip()
+
+    def is_beginning_of_word(self, x: str) -> bool:
+        """Return True for special tokens and pieces starting with ``\\u2581``."""
+        # special elements are always considered beginnings
+        # HACK: this logic is already present in fairseq/tasks/masked_lm.py
+        # but these special tokens are also contained in the sentencepiece
+        # vocabulary which causes duplicate special tokens. This hack makes
+        # sure that they are all taken into account.
+        special_tokens = ("<unk>", "<s>", "</s>", "<pad>")
+        return x in special_tokens or x.startswith("\u2581")
--- a/fairseq/data/encoders/space_tokenizer.py
+++ b/fairseq/data/encoders/space_tokenizer.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import re
+
+from fairseq.data.encoders import register_tokenizer
+
+
+@register_tokenizer("space")
+class SpaceTokenizer(object):
+    """Tokenizer that only normalises whitespace."""
+
+    def __init__(self, source_lang=None, target_lang=None):
+        """Compile the whitespace pattern; both arguments are unused here."""
+        whitespace_run = r"\s+"
+        self.space_tok = re.compile(whitespace_run)
+
+    def encode(self, x: str) -> str:
+        """Replace every run of whitespace in *x* with a single space."""
+        normalised = self.space_tok.sub(" ", x)
+        return normalised
+
+    def decode(self, x: str) -> str:
+        """Return *x* unchanged."""
+        return x
--- a/fairseq/data/encoders/subword_nmt_bpe.py
+++ b/fairseq/data/encoders/subword_nmt_bpe.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq import file_utils
+from fairseq.data.encoders import register_bpe
+
+
+@register_bpe("subword_nmt")
+class SubwordNMTBPE(object):
+    """BPE backed by the ``subword_nmt`` package."""
+
+    @staticmethod
+    def add_args(parser):
+        """Register the BPE codes and separator options on *parser*."""
+        # fmt: off
+        parser.add_argument('--bpe-codes', type=str,
+                            help='path to subword NMT BPE')
+        parser.add_argument('--bpe-separator', default='@@',
+                            help='BPE separator')
+        # fmt: on
+
+    def __init__(self, args):
+        """Create the ``apply_bpe.BPE`` object from ``args``.
+
+        Raises:
+            ValueError: if ``args.bpe_codes`` is ``None``.
+            ImportError: if ``subword_nmt`` cannot be imported.
+        """
+        if args.bpe_codes is None:
+            raise ValueError("--bpe-codes is required for --bpe=subword_nmt")
+        codes = file_utils.cached_path(args.bpe_codes)
+        # Arguments are routed through subword_nmt's own parser.
+        cli = ["--codes", codes, "--separator", args.bpe_separator]
+        try:
+            from subword_nmt import apply_bpe
+
+            parsed = apply_bpe.create_parser().parse_args(cli)
+            self.bpe = apply_bpe.BPE(
+                parsed.codes,
+                parsed.merges,
+                parsed.separator,
+                None,
+                parsed.glossaries,
+            )
+            self.bpe_symbol = parsed.separator + " "
+        except ImportError:
+            raise ImportError(
+                "Please install subword_nmt with: pip install subword-nmt"
+            )
+
+    def encode(self, x: str) -> str:
+        """Apply BPE to the line *x*."""
+        return self.bpe.process_line(x)
+
+    def decode(self, x: str) -> str:
+        """Remove every ``<separator> `` occurrence from *x*."""
+        # A trailing space is appended so a separator at the very end of the
+        # input is also matched; it is stripped again afterwards.
+        padded = x + " "
+        merged = padded.replace(self.bpe_symbol, "")
+        return merged.rstrip()
--- a/fairseq/data/encoders/utils.py
+++ b/fairseq/data/encoders/utils.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from fairseq.data import encoders
+
+
+def get_whole_word_mask(args, dictionary):
+    """Build a per-symbol mask marking which dictionary entries start a word.
+
+    Returns:
+        A ``torch.ByteTensor`` with one entry per dictionary index, or
+        ``None`` when ``encoders.build_bpe(args)`` returns ``None``.
+    """
+    bpe = encoders.build_bpe(args)
+    if bpe is None:
+        return None
+
+    def _starts_word(idx):
+        # special elements are always considered beginnings
+        if idx < dictionary.nspecial:
+            return True
+        symbol = dictionary[idx]
+        if symbol.startswith("madeupword"):
+            return True
+        try:
+            return bpe.is_beginning_of_word(symbol)
+        except ValueError:
+            # Symbols the BPE cannot interpret are treated as word starts.
+            return True
+
+    flags = [_starts_word(idx) for idx in range(len(dictionary))]
+    return torch.ByteTensor(flags)
--- a/fairseq/data/fairseq_dataset.py
+++ b/fairseq/data/fairseq_dataset.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch.utils.data
+from fairseq.data import data_utils
+
+
+class EpochListening:
+    """Mixin for objects that want to be told when the epoch changes."""
+
+    @property
+    def can_reuse_epoch_itr_across_epochs(self):
+        """
+        Whether the :class:`fairseq.data.EpochBatchIterator` built for this
+        dataset can be reused from one epoch to the next.
+
+        Return ``False`` if sample sizes can change between epochs, since
+        batches may then have to be regenerated at every epoch. A dataset
+        that relies on ``set_epoch`` should consider returning ``False``.
+        """
+        reusable = True
+        return reusable
+
+    def set_epoch(self, epoch):
+        """Will receive the updated epoch number at the beginning of the epoch."""
+        return None
+
+
+class FairseqDataset(torch.utils.data.Dataset, EpochListening):
+    """Dataset base class that adds helpers for batching."""
+
+    def __getitem__(self, index):
+        raise NotImplementedError
+
+    def __len__(self):
+        raise NotImplementedError
+
+    def collater(self, samples):
+        """Collate a list of samples into one mini-batch.
+
+        Args:
+            samples (List[dict]): samples to collate
+
+        Returns:
+            dict: a mini-batch suitable for forwarding with a Model
+        """
+        raise NotImplementedError
+
+    def num_tokens(self, index):
+        """Number of tokens in the sample at *index*; used to enforce
+        ``--max-tokens`` during batching."""
+        raise NotImplementedError
+
+    def size(self, index):
+        """Size of the example at *index* as a float or tuple; used when
+        filtering a dataset with ``--max-positions``."""
+        raise NotImplementedError
+
+    def ordered_indices(self):
+        """Return the indices ``0 .. len(self) - 1`` in order; batches are
+        constructed following this order."""
+        count = len(self)
+        return np.arange(count, dtype=np.int64)
+
+    @property
+    def supports_prefetch(self):
+        """Whether this dataset supports prefetching."""
+        return False
+
+    def attr(self, attr: str, index: int):
+        """Return the attribute named *attr* of this dataset, or ``None``.
+
+        *index* is accepted but not used by this implementation.
+        """
+        return getattr(self, attr, None)
+
+    def prefetch(self, indices):
+        """Prefetch the data required for this epoch."""
+        raise NotImplementedError
+
+    def get_batch_shapes(self):
+        """
+        Return a list of valid batch shapes, for example::
+
+            [(8, 512), (16, 256), (32, 128)]
+
+        The first entry of each tuple is the batch size; it may be ``None``
+        to infer the max batch size from ``--max-tokens``. The second entry
+        is the max supported length as given by
+        :func:`fairseq.data.FairseqDataset.num_tokens`.
+
+        :func:`fairseq.data.FairseqDataset.batch_by_size` uses this to
+        restrict batch shapes, which is useful on TPUs to avoid too many
+        dynamic shapes (and recompilations).
+        """
+        return None
+
+    def batch_by_size(
+        self,
+        indices,
+        max_tokens=None,
+        max_sentences=None,
+        required_batch_size_multiple=1,
+    ):
+        """
+        Batch an ordered set of indices according to *max_tokens*,
+        *max_sentences* and *required_batch_size_multiple*.
+        """
+        from fairseq.data import data_utils
+
+        shapes = self.get_batch_shapes()
+        if shapes is not None:
+
+            def adjust_bsz(bsz, num_tokens):
+                # A missing batch size is derived from the token budget.
+                if bsz is None:
+                    assert max_tokens is not None, "Must specify --max-tokens"
+                    bsz = max_tokens // num_tokens
+                if max_sentences is not None:
+                    return min(bsz, max_sentences)
+                # Otherwise round down to a multiple of the required size.
+                if (
+                    bsz >= required_batch_size_multiple
+                    and bsz % required_batch_size_multiple != 0
+                ):
+                    bsz -= bsz % required_batch_size_multiple
+                return bsz
+
+            adjusted = []
+            for (bsz, num_tokens) in shapes:
+                adjusted.append([adjust_bsz(bsz, num_tokens), num_tokens])
+            shapes = np.array(adjusted)
+
+        return data_utils.batch_by_size(
+            indices,
+            num_tokens_fn=self.num_tokens,
+            max_tokens=max_tokens,
+            max_sentences=max_sentences,
+            required_batch_size_multiple=required_batch_size_multiple,
+            fixed_shapes=shapes,
+        )
+
+    def filter_indices_by_size(self, indices, max_sizes):
+        """
+        Filter a list of sample indices, removing those that are longer
+        than specified in *max_sizes*.
+
+        WARNING: don't update, override method in child classes
+
+        Args:
+            indices (np.array): original array of sample indices
+            max_sizes (int or list[int] or tuple[int]): max sample size,
+                can be defined separately for src and tgt (then list or tuple)
+
+        Returns:
+            np.array: filtered sample array
+            list: list of removed indices
+        """
+        size_table = None
+        have_table = False
+        if isinstance(max_sizes, (float, int)):
+            # A scalar limit can be applied directly to a ``sizes`` array.
+            if hasattr(self, "sizes") and isinstance(self.sizes, np.ndarray):
+                size_table, have_table = self.sizes, True
+            elif (
+                hasattr(self, "sizes")
+                and isinstance(self.sizes, list)
+                and len(self.sizes) == 1
+            ):
+                size_table, have_table = self.sizes[0], True
+
+        if have_table:
+            sample_sizes = size_table[indices]
+            ignored = indices[sample_sizes > max_sizes].tolist()
+            indices = indices[sample_sizes <= max_sizes]
+        else:
+            # Everything else goes through the per-sample ``size`` callback.
+            indices, ignored = data_utils._filter_by_size_dynamic(
+                indices, self.size, max_sizes
+            )
+        return indices, ignored
+
+    @property
+    def supports_fetch_outside_dataloader(self):
+        """Whether this dataset supports fetching outside the workers of the dataloader."""
+        return True
+
+
+class FairseqIterableDataset(torch.utils.data.IterableDataset, EpochListening):
+    """
+    Base class for datasets that have to be read sequentially, usually
+    because the data is being streamed or otherwise can't be manipulated on
+    a single machine.
+    """
+
+    def __iter__(self):
+        # Subclasses supply the actual iteration.
+        raise NotImplementedError
--- a/fairseq/data/fasta_dataset.py
+++ b/fairseq/data/fasta_dataset.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import subprocess
+import threading
+from pathlib import Path
+
+import numpy as np
+import torch
+
+
+def fasta_file_path(prefix_path):
+    """Return *prefix_path* with the ``.fasta`` suffix appended."""
+    suffix = ".fasta"
+    return prefix_path + suffix
+
+
+class FastaDataset(torch.utils.data.Dataset):
+    """
+    For loading protein sequence datasets in the common FASTA data format
+    """
+
+    def __init__(self, path: str, cache_indices=False):
+        """Index the file ``<path>.fasta``.
+
+        Args:
+            path: file prefix; ``.fasta`` is appended to get the data file.
+            cache_indices: when true, the index is loaded from
+                ``<path>.fasta.idx.npy`` if that file exists, and is
+                otherwise built and saved there.
+        """
+        self.fn = fasta_file_path(path)
+        self.threadlocal = threading.local()
+        self.cache = Path(f"{path}.fasta.idx.npy")
+        if cache_indices:
+            if self.cache.exists():
+                self.offsets, self.sizes = np.load(self.cache)
+            else:
+                self.offsets, self.sizes = self._build_index(path)
+                np.save(self.cache, np.stack([self.offsets, self.sizes]))
+        else:
+            self.offsets, self.sizes = self._build_index(path)
+
+    def _get_file(self):
+        """Return this thread's handle on the FASTA file, opening it once."""
+        if not hasattr(self.threadlocal, "f"):
+            self.threadlocal.f = open(self.fn, "r")
+        return self.threadlocal.f
+
+    def __getitem__(self, idx):
+        """Return ``(description_line, sequence)`` for record *idx*."""
+        f = self._get_file()
+        f.seek(self.offsets[idx])
+        desc = f.readline().strip()
+        # Collect sequence lines until the next header (">") or end of file;
+        # joining once avoids repeated string concatenation.
+        chunks = []
+        line = f.readline()
+        while line != "" and line[0] != ">":
+            chunks.append(line.strip())
+            line = f.readline()
+        return desc, "".join(chunks)
+
+    def __len__(self):
+        return self.offsets.size
+
+    def _build_index(self, path: str):
+        """Compute header byte offsets and sequence lengths with shell tools.
+
+        Returns:
+            A pair ``(offsets, sizes)`` of ``int64`` arrays parsed from the
+            output of the two shell pipelines.
+        """
+        import shlex
+
+        # Use grep and awk to get 100M/s on local SSD.
+        # Should process your enormous 100G fasta in ~10 min single core...
+        # The path is shell-quoted so that spaces or shell metacharacters in
+        # it cannot break (or inject into) the command lines below.
+        quoted = shlex.quote(fasta_file_path(path))
+        bytes_offsets = subprocess.check_output(
+            f"cat {quoted} | tqdm --bytes --total $(wc -c < {quoted})"
+            "| grep --byte-offset '^>' -o | cut -d: -f1",
+            shell=True,
+        )
+        fasta_lengths = subprocess.check_output(
+            f"cat {quoted} | tqdm --bytes --total $(wc -c < {quoted})"
+            "| awk '/^>/ {print \"\";next;} { printf(\"%s\",$0);}' | tail -n+2 | awk '{print length($1)}'",
+            shell=True,
+        )
+        bytes_np = np.fromstring(bytes_offsets, dtype=np.int64, sep=" ")
+        sizes_np = np.fromstring(fasta_lengths, dtype=np.int64, sep=" ")
+        return bytes_np, sizes_np
+
+    def __setstate__(self, state):
+        self.__dict__ = state
+        # The thread-local storage is not part of the state; recreate it.
+        self.threadlocal = threading.local()
+
+    def __getstate__(self):
+        # Everything except the thread-local storage goes into the state.
+        return {k: v for k, v in self.__dict__.items() if k != "threadlocal"}
+
+    def __del__(self):
+        # ``threadlocal`` may be missing if the object was never fully
+        # initialised, so look it up defensively.
+        local = getattr(self, "threadlocal", None)
+        if local is not None and hasattr(local, "f"):
+            local.f.close()
+            del local.f
+
+    @staticmethod
+    def exists(path):
+        """Return whether ``<path>.fasta`` exists."""
+        return os.path.exists(fasta_file_path(path))
+
+
+class EncodedFastaDataset(FastaDataset):
+    """
+    Variant of FastaDataset that returns sequences encoded with a
+    dictionary instead of raw strings.
+    """
+
+    def __init__(self, path, dictionary):
+        """Open *path* with index caching enabled and keep *dictionary*."""
+        super().__init__(path, cache_indices=True)
+        self.dictionary = dictionary
+
+    def __getitem__(self, idx):
+        """Return record *idx* encoded one symbol per character."""
+        # The description line is not used for encoding.
+        _, sequence = super().__getitem__(idx)
+        encoded = self.dictionary.encode_line(sequence, line_tokenizer=list)
+        return encoded.long()
--- a/fairseq/data/id_dataset.py
+++ b/fairseq/data/id_dataset.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from . import FairseqDataset
+
+
+class IdDataset(FairseqDataset):
+    """Dataset whose item at each index is the index itself."""
+
+    def __getitem__(self, index):
+        return index
+
+    def __len__(self):
+        # NOTE(review): the reported length is always 0 — presumably the
+        # real length comes from a sibling dataset; confirm against callers.
+        return 0
+
+    def collater(self, samples):
+        """Pack the sampled ids into a tensor."""
+        batch = torch.tensor(samples)
+        return batch
--- a/fairseq/data/indexed_dataset.py
+++ b/fairseq/data/indexed_dataset.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import shutil
+import struct
+from functools import lru_cache
+
+import numpy as np
+import torch
+from fairseq.data.fasta_dataset import FastaDataset
+from fairseq.file_io import PathManager
+
+from . import FairseqDataset
+
+
+def __best_fitting_dtype(vocab_size=None):
+    """Pick ``np.uint16`` for vocabularies below 65500, else ``np.int32``."""
+    if vocab_size is None or not vocab_size < 65500:
+        return np.int32
+    return np.uint16
+
+
+def get_available_dataset_impl():
+    """Return the names of the dataset implementations, as a new list."""
+    names = ("raw", "lazy", "cached", "mmap", "fasta")
+    return list(names)
+
+
+def infer_dataset_impl(path):
+    """Guess the dataset implementation stored at *path*.
+
+    Returns:
+        ``"raw"``, ``"cached"``, ``"mmap"`` or ``"fasta"``, or ``None`` when
+        nothing matches (including an index file with an unknown header).
+    """
+    if IndexedRawTextDataset.exists(path):
+        return "raw"
+
+    if IndexedDataset.exists(path):
+        # Tell the two binary formats apart by the first 8 header bytes.
+        with open(index_file_path(path), "rb") as f:
+            magic = f.read(8)
+        if magic == IndexedDataset._HDR_MAGIC:
+            return "cached"
+        if magic == MMapIndexedDataset.Index._HDR_MAGIC[:8]:
+            return "mmap"
+        return None
+
+    if FastaDataset.exists(path):
+        return "fasta"
+    return None
+
+
+def make_builder(out_file, impl, vocab_size=None):
+    """Create the dataset builder for *impl* that writes to *out_file*.
+
+    Raises:
+        NotImplementedError: for the ``"fasta"`` implementation.
+    """
+    if impl == "fasta":
+        raise NotImplementedError
+    if impl == "mmap":
+        # Only the mmap builder picks its dtype from the vocabulary size.
+        dtype = __best_fitting_dtype(vocab_size)
+        return MMapIndexedDatasetBuilder(out_file, dtype=dtype)
+    return IndexedDatasetBuilder(out_file)
+
+
+def make_dataset(path, impl, fix_lua_indexing=False, dictionary=None):
+    """Open the dataset at *path* using implementation *impl*.
+
+    Returns:
+        The dataset object, or ``None`` when *impl* is unknown or the
+        matching files do not exist at *path*.
+    """
+    if impl == "raw" and IndexedRawTextDataset.exists(path):
+        assert dictionary is not None
+        return IndexedRawTextDataset(path, dictionary)
+
+    if impl == "lazy" and IndexedDataset.exists(path):
+        return IndexedDataset(path, fix_lua_indexing=fix_lua_indexing)
+
+    if impl == "cached" and IndexedDataset.exists(path):
+        return IndexedCachedDataset(path, fix_lua_indexing=fix_lua_indexing)
+
+    if impl == "mmap" and MMapIndexedDataset.exists(path):
+        return MMapIndexedDataset(path)
+
+    if impl == "fasta" and FastaDataset.exists(path):
+        from fairseq.data.fasta_dataset import EncodedFastaDataset
+
+        return EncodedFastaDataset(path, dictionary)
+
+    return None
+
+
+def dataset_exists(path, impl):
+    """Return whether a dataset of kind *impl* exists at *path*.
+
+    Any *impl* other than ``"raw"`` and ``"mmap"`` is checked with
+    ``IndexedDataset.exists``.
+    """
+    if impl == "raw":
+        checker = IndexedRawTextDataset
+    elif impl == "mmap":
+        checker = MMapIndexedDataset
+    else:
+        checker = IndexedDataset
+    return checker.exists(path)
+
+
+def read_longs(f, n):
+    """Read *n* ``int64`` values from the binary file *f* into a new array."""
+    longs = np.empty(n, dtype=np.int64)
+    # ``readinto`` fills the array's buffer in place.
+    f.readinto(longs)
+    return longs
+
+
+def write_longs(f, a):
+    """Write the values of *a* to the binary file *f* as ``int64``."""
+    longs = np.array(a, dtype=np.int64)
+    f.write(longs)
+
+
+# Maps the dtype code stored in an index file to the element type.
+# Code 6 uses the builtin ``float``: the ``np.float`` alias it replaces was
+# that very builtin, and the alias was removed in NumPy 1.24.
+dtypes = {
+    1: np.uint8,
+    2: np.int8,
+    3: np.int16,
+    4: np.int32,
+    5: np.int64,
+    6: float,
+    7: np.double,
+    8: np.uint16,
+}
+
+
+def code(dtype):
+    """Return the key in ``dtypes`` whose value equals *dtype*.
+
+    Raises:
+        ValueError: if no entry of ``dtypes`` equals *dtype*.
+    """
+    for key, candidate in dtypes.items():
+        if candidate == dtype:
+            return key
+    raise ValueError(dtype)
+
+
+def index_file_path(prefix_path):
+    """Return *prefix_path* with the ``.idx`` suffix appended."""
+    suffix = ".idx"
+    return prefix_path + suffix
+
+
+def data_file_path(prefix_path):
+    """Return *prefix_path* with the ``.bin`` suffix appended."""
+    suffix = ".bin"
+    return prefix_path + suffix
+
+
+class IndexedDataset(FairseqDataset):
+    """Loader for TorchNet IndexedDataset"""
+
+    # First 8 bytes expected at the start of the index file.
+    _HDR_MAGIC = b"TNTIDX\x00\x00"
+
+    def __init__(self, path, fix_lua_indexing=False):
+        """Read the index for *path*; the data file is opened on first access."""
+        super().__init__()
+        self.path = path
+        self.fix_lua_indexing = fix_lua_indexing
+        self.data_file = None
+        self.read_index(path)
+
+    def read_index(self, path):
+        """Parse the ``.idx`` file: header, dtype, counts, offsets and sizes."""
+        with open(index_file_path(path), "rb") as f:
+            magic = f.read(8)
+            assert magic == self._HDR_MAGIC, (
+                "Index file doesn't match expected format. "
+                "Make sure that --dataset-impl is configured properly."
+            )
+            version = f.read(8)
+            assert struct.unpack("<Q", version) == (1,)
+            dtype_code, self.element_size = struct.unpack("<QQ", f.read(16))
+            self.dtype = dtypes[dtype_code]
+            self._len, self.s = struct.unpack("<QQ", f.read(16))
+            # Both offset tables hold one more entry than there are items.
+            n_offsets = self._len + 1
+            self.dim_offsets = read_longs(f, n_offsets)
+            self.data_offsets = read_longs(f, n_offsets)
+            self.sizes = read_longs(f, self.s)
+
+    def read_data(self, path):
+        """Open the ``.bin`` data file unbuffered for reading."""
+        self.data_file = open(data_file_path(path), "rb", buffering=0)
+
+    def check_index(self, i):
+        """Raise ``IndexError`` unless ``0 <= i < len(self)``."""
+        if i < 0 or i >= self._len:
+            raise IndexError("index out of range")
+
+    def __del__(self):
+        if self.data_file:
+            self.data_file.close()
+
+    @lru_cache(maxsize=8)
+    def __getitem__(self, i):
+        """Read item *i* from the data file and return it as a long tensor."""
+        if not self.data_file:
+            self.read_data(self.path)
+        self.check_index(i)
+        dims_begin = self.dim_offsets[i]
+        dims_end = self.dim_offsets[i + 1]
+        shape = self.sizes[dims_begin:dims_end]
+        buf = np.empty(shape, dtype=self.dtype)
+        # Data offsets are counted in elements, so scale to a byte position.
+        self.data_file.seek(self.data_offsets[i] * self.element_size)
+        self.data_file.readinto(buf)
+        item = torch.from_numpy(buf).long()
+        if self.fix_lua_indexing:
+            item -= 1  # subtract 1 for 0-based indexing
+        return item
+
+    def __len__(self):
+        return self._len
+
+    def num_tokens(self, index):
+        return self.sizes[index]
+
+    def size(self, index):
+        return self.sizes[index]
+
+    @staticmethod
+    def exists(path):
+        """Return whether both the ``.idx`` and ``.bin`` files exist."""
+        idx_path = index_file_path(path)
+        return PathManager.exists(idx_path) and PathManager.exists(
+            data_file_path(path)
+        )
+
+    @property
+    def supports_prefetch(self):
+        return False  # avoid prefetching to save memory
+
+
+class IndexedCachedDataset(IndexedDataset):
+    """IndexedDataset variant that serves items from a prefetched cache."""
+
+    def __init__(self, path, fix_lua_indexing=False):
+        super().__init__(path, fix_lua_indexing=fix_lua_indexing)
+        self.cache = None
+        # Maps an item index to its start position inside ``self.cache``.
+        self.cache_index = {}
+
+    @property
+    def supports_prefetch(self):
+        return True
+
+    def prefetch(self, indices):
+        """Load the items in *indices* from the data file into the cache.
+
+        Does nothing when every requested index is already cached;
+        otherwise the cache is rebuilt for exactly the requested indices.
+        """
+        if all(i in self.cache_index for i in indices):
+            return
+        if not self.data_file:
+            self.read_data(self.path)
+        wanted = sorted(set(indices))
+        total_size = sum(
+            self.data_offsets[i + 1] - self.data_offsets[i] for i in wanted
+        )
+        self.cache = np.empty(total_size, dtype=self.dtype)
+        self.cache_index.clear()
+        cursor = 0
+        for i in wanted:
+            self.cache_index[i] = cursor
+            length = self.data_offsets[i + 1] - self.data_offsets[i]
+            # Read straight into the slice of the cache reserved for item i.
+            target = self.cache[cursor : cursor + length]
+            self.data_file.seek(self.data_offsets[i] * self.element_size)
+            self.data_file.readinto(target)
+            cursor += length
+        if self.data_file:
+            # close and delete data file after prefetch so we can pickle
+            self.data_file.close()
+            self.data_file = None
+
+    @lru_cache(maxsize=8)
+    def __getitem__(self, i):
+        """Return item *i* copied out of the cache as a long tensor."""
+        self.check_index(i)
+        dims_begin = self.dim_offsets[i]
+        dims_end = self.dim_offsets[i + 1]
+        shape = self.sizes[dims_begin:dims_end]
+        buf = np.empty(shape, dtype=self.dtype)
+        start = self.cache_index[i]
+        np.copyto(buf, self.cache[start : start + buf.size])
+        item = torch.from_numpy(buf).long()
+        if self.fix_lua_indexing:
+            item -= 1  # subtract 1 for 0-based indexing
+        return item
+
+
+class IndexedRawTextDataset(FairseqDataset):
+    """Reads a text file and binarizes it in memory at instantiation.
+    The original lines are kept in memory as well."""
+
+    def __init__(self, path, dictionary, append_eos=True, reverse_order=False):
+        self.tokens_list = []
+        self.lines = []
+        self.sizes = []
+        self.append_eos = append_eos
+        self.reverse_order = reverse_order
+        self.read_data(path, dictionary)
+        # NOTE(review): this instance attribute has the same name as the
+        # ``size`` method defined below, so ``obj.size`` resolves to this
+        # integer on instances — confirm that callers expect that.
+        self.size = len(self.tokens_list)
+
+    def read_data(self, path, dictionary):
+        """Encode every line of *path* with *dictionary* and record sizes."""
+        with open(path, "r", encoding="utf-8") as f:
+            for raw_line in f:
+                self.lines.append(raw_line.strip("\n"))
+                encoded = dictionary.encode_line(
+                    raw_line,
+                    add_if_not_exist=False,
+                    append_eos=self.append_eos,
+                    reverse_order=self.reverse_order,
+                )
+                tokens = encoded.long()
+                self.tokens_list.append(tokens)
+                self.sizes.append(len(tokens))
+        self.sizes = np.array(self.sizes)
+
+    def check_index(self, i):
+        """Raise ``IndexError`` unless ``0 <= i < self.size``."""
+        if i < 0 or i >= self.size:
+            raise IndexError("index out of range")
+
+    @lru_cache(maxsize=8)
+    def __getitem__(self, i):
+        self.check_index(i)
+        return self.tokens_list[i]
+
+    def get_original_text(self, i):
+        """Return line *i* of the source file without its trailing newline."""
+        self.check_index(i)
+        return self.lines[i]
+
+    def __del__(self):
+        # Nothing to release.
+        return None
+
+    def __len__(self):
+        return self.size
+
+    def num_tokens(self, index):
+        return self.sizes[index]
+
+    def size(self, index):
+        return self.sizes[index]
+
+    @staticmethod
+    def exists(path):
+        """Return whether *path* exists according to ``PathManager``."""
+        return PathManager.exists(path)
+
+
+class IndexedDatasetBuilder(object):
+    """Writes tensors to a data file and, on ``finalize``, an index file.
+
+    Items are appended with ``add_item``; ``merge_file_`` appends the
+    contents of another dataset; ``finalize`` closes the data file and
+    writes the index.
+    """
+
+    # Bytes per element assumed for each supported dtype. The builtin
+    # ``float`` replaces the ``np.float`` alias (the same object), which was
+    # removed in NumPy 1.24.
+    # NOTE(review): ``float`` arrays are 64-bit in NumPy but are listed as
+    # 4 bytes here; the value is kept unchanged — confirm before altering.
+    element_sizes = {
+        np.uint8: 1,
+        np.int8: 1,
+        np.int16: 2,
+        np.int32: 4,
+        np.int64: 8,
+        float: 4,
+        np.double: 8,
+    }
+
+    def __init__(self, out_file, dtype=np.int32):
+        """Open *out_file* for binary writing and start empty offset tables.
+
+        Raises:
+            KeyError: if *dtype* is not a key of ``element_sizes``.
+        """
+        self.out_file = open(out_file, "wb")
+        self.dtype = dtype
+        self.data_offsets = [0]
+        self.dim_offsets = [0]
+        self.sizes = []
+        self.element_size = self.element_sizes[self.dtype]
+
+    def add_item(self, tensor):
+        """Append *tensor* to the data file and record its offsets and shape."""
+        # +1 for Lua compatibility
+        written = self.out_file.write(np.array(tensor.numpy() + 1, dtype=self.dtype))
+        # Integer division keeps the offsets as ints instead of floats.
+        self.data_offsets.append(self.data_offsets[-1] + written // self.element_size)
+        self.sizes.extend(tensor.size())
+        self.dim_offsets.append(self.dim_offsets[-1] + len(tensor.size()))
+
+    def merge_file_(self, another_file):
+        """Append the dataset stored under the prefix *another_file*."""
+        index = IndexedDataset(another_file)
+        assert index.dtype == self.dtype
+
+        # Shift the other dataset's offsets past what was already written.
+        begin = self.data_offsets[-1]
+        self.data_offsets.extend(begin + offset for offset in index.data_offsets[1:])
+        self.sizes.extend(index.sizes)
+        begin = self.dim_offsets[-1]
+        self.dim_offsets.extend(begin + dim for dim in index.dim_offsets[1:])
+
+        with open(data_file_path(another_file), "rb") as f:
+            shutil.copyfileobj(f, self.out_file)
+
+    def finalize(self, index_file):
+        """Close the data file and write the index to *index_file*."""
+        self.out_file.close()
+        # ``with`` closes the index file even if a write fails.
+        with open(index_file, "wb") as index:
+            index.write(b"TNTIDX\x00\x00")
+            index.write(struct.pack("<Q", 1))
+            index.write(struct.pack("<QQ", code(self.dtype), self.element_size))
+            index.write(
+                struct.pack("<QQ", len(self.data_offsets) - 1, len(self.sizes))
+            )
+            write_longs(index, self.dim_offsets)
+            write_longs(index, self.data_offsets)
+            write_longs(index, self.sizes)
+
+
+def _warmup_mmap_file(path):
+    """Read the file at *path* from start to end, discarding the contents.
+
+    The data is consumed in 100 MiB chunks; nothing is returned.
+    """
+    chunk_bytes = 100 * 1024 * 1024
+    with open(path, "rb") as stream:
+        # iter() with a sentinel stops at the first empty read (end of file)
+        for _ in iter(lambda: stream.read(chunk_bytes), b""):
+            pass
+
+
+class MMapIndexedDataset(torch.utils.data.Dataset):
+    """Dataset whose items are read from a memory-mapped data file.
+
+    The location and length of every item come from a companion index file
+    (see :class:`MMapIndexedDataset.Index`).
+
+    Args:
+        path (str): common prefix passed to ``index_file_path`` and
+            ``data_file_path`` to locate the two files
+    """
+
+    class Index(object):
+        """Reader (and, through :meth:`writer`, writer) of the index file.
+
+        Layout as written by :meth:`writer`: 9-byte magic, ``<Q`` version
+        (1), ``<B`` dtype code, ``<Q`` item count, the item sizes as int32,
+        then the item byte offsets as int64.
+        """
+
+        _HDR_MAGIC = b"MMIDIDX\x00\x00"
+
+        @classmethod
+        def writer(cls, path, dtype):
+            """Return a context manager that writes an index file to *path*.
+
+            Args:
+                path (str): index file to create
+                dtype: numpy scalar type of the stored items
+            """
+
+            class _Writer(object):
+                def __enter__(self):
+                    self._file = open(path, "wb")
+
+                    self._file.write(cls._HDR_MAGIC)
+                    self._file.write(struct.pack("<Q", 1))
+                    self._file.write(struct.pack("<B", code(dtype)))
+
+                    return self
+
+                @staticmethod
+                def _get_pointers(sizes):
+                    """Return the byte offset at which each item starts."""
+                    dtype_size = dtype().itemsize
+                    address = 0
+                    pointers = []
+
+                    for size in sizes:
+                        pointers.append(address)
+                        # int() keeps the running offset an arbitrary
+                        # precision Python int; sizes read back from an
+                        # existing index are np.int32 scalars and could
+                        # otherwise wrap once the data exceeds 2 GiB.
+                        address += int(size) * dtype_size
+
+                    return pointers
+
+                def write(self, sizes):
+                    """Write the item count, sizes and byte offsets."""
+                    pointers = self._get_pointers(sizes)
+
+                    self._file.write(struct.pack("<Q", len(sizes)))
+
+                    sizes = np.array(sizes, dtype=np.int32)
+                    self._file.write(sizes.tobytes(order="C"))
+                    del sizes
+
+                    pointers = np.array(pointers, dtype=np.int64)
+                    self._file.write(pointers.tobytes(order="C"))
+                    del pointers
+
+                def __exit__(self, exc_type, exc_val, exc_tb):
+                    self._file.close()
+
+            return _Writer()
+
+        def __init__(self, path):
+            with open(path, "rb") as stream:
+                magic_test = stream.read(9)
+                assert self._HDR_MAGIC == magic_test, (
+                    "Index file doesn't match expected format. "
+                    "Make sure that --dataset-impl is configured properly."
+                )
+                version = struct.unpack("<Q", stream.read(8))
+                assert (1,) == version
+
+                (dtype_code,) = struct.unpack("<B", stream.read(1))
+                self._dtype = dtypes[dtype_code]
+                self._dtype_size = self._dtype().itemsize
+
+                self._len = struct.unpack("<Q", stream.read(8))[0]
+                # end of the fixed header; the sizes array starts here
+                offset = stream.tell()
+
+            _warmup_mmap_file(path)
+
+            self._bin_buffer_mmap = np.memmap(path, mode="r", order="C")
+            self._bin_buffer = memoryview(self._bin_buffer_mmap)
+            self._sizes = np.frombuffer(
+                self._bin_buffer, dtype=np.int32, count=self._len, offset=offset
+            )
+            # the pointers array follows the sizes array directly
+            self._pointers = np.frombuffer(
+                self._bin_buffer,
+                dtype=np.int64,
+                count=self._len,
+                offset=offset + self._sizes.nbytes,
+            )
+
+        def __del__(self):
+            # __init__ may have raised before the mmap was created (missing
+            # file, bad magic); in that case there is nothing to release.
+            mmap_array = getattr(self, "_bin_buffer_mmap", None)
+            if mmap_array is None:
+                return
+            mmap_array._mmap.close()
+            del self._bin_buffer_mmap
+
+        @property
+        def dtype(self):
+            return self._dtype
+
+        @property
+        def sizes(self):
+            return self._sizes
+
+        @lru_cache(maxsize=8)
+        def __getitem__(self, i):
+            """Return ``(byte_offset, size)`` of item *i*."""
+            return self._pointers[i], self._sizes[i]
+
+        def __len__(self):
+            return self._len
+
+    def __init__(self, path):
+        super().__init__()
+
+        self._path = None
+        self._index = None
+        self._bin_buffer = None
+
+        self._do_init(path)
+
+    def __getstate__(self):
+        # only the path is pickled; the mmaps are rebuilt in __setstate__
+        return self._path
+
+    def __setstate__(self, state):
+        self._do_init(state)
+
+    def _do_init(self, path):
+        """Open the index and memory-map the data file for *path*."""
+        self._path = path
+        self._index = self.Index(index_file_path(self._path))
+
+        _warmup_mmap_file(data_file_path(self._path))
+        self._bin_buffer_mmap = np.memmap(
+            data_file_path(self._path), mode="r", order="C"
+        )
+        self._bin_buffer = memoryview(self._bin_buffer_mmap)
+
+    def __del__(self):
+        # _do_init may have failed part-way (or never run, e.g. an object
+        # created via __new__), so only release what actually exists.
+        mmap_array = getattr(self, "_bin_buffer_mmap", None)
+        if mmap_array is not None:
+            mmap_array._mmap.close()
+            del self._bin_buffer_mmap
+        self.__dict__.pop("_index", None)
+
+    def __len__(self):
+        return len(self._index)
+
+    @lru_cache(maxsize=8)
+    def __getitem__(self, i):
+        """Return item *i* as an int64 :class:`torch.Tensor`."""
+        ptr, size = self._index[i]
+        np_array = np.frombuffer(
+            self._bin_buffer, dtype=self._index.dtype, count=size, offset=ptr
+        )
+        if self._index.dtype != np.int64:
+            np_array = np_array.astype(np.int64)
+
+        return torch.from_numpy(np_array)
+
+    @property
+    def sizes(self):
+        return self._index.sizes
+
+    @property
+    def supports_prefetch(self):
+        return False
+
+    @staticmethod
+    def exists(path):
+        """Return True when both the index and the data file exist."""
+        return PathManager.exists(index_file_path(path)) and PathManager.exists(
+            data_file_path(path)
+        )
+
+
+def get_indexed_dataset_to_local(path):
+    """Resolve the index and data files of *path* to local paths.
+
+    Both files are passed through ``PathManager.get_local_path``; the common
+    local prefix (without the ``.idx`` / ``.bin`` suffix) is returned.
+    """
+    idx_local = PathManager.get_local_path(index_file_path(path))
+    bin_local = PathManager.get_local_path(data_file_path(path))
+
+    assert idx_local.endswith(".idx") and bin_local.endswith(".bin"), (
+        "PathManager.get_local_path does not return files with expected patterns: "
+        f"{idx_local} and {bin_local}"
+    )
+
+    # drop the 4-character ".bin" / ".idx" suffixes; both prefixes must agree
+    prefix_from_bin = bin_local[:-4]
+    prefix_from_idx = idx_local[:-4]
+    assert prefix_from_bin == prefix_from_idx
+    return prefix_from_bin
+
+
+class MMapIndexedDatasetBuilder(object):
+    """Write items to a flat data file and, on finalize, its mmap index.
+
+    Args:
+        out_file (str): path of the binary data file to create
+        dtype: numpy scalar type used to store items (default: ``np.int64``)
+    """
+
+    def __init__(self, out_file, dtype=np.int64):
+        self._data_file = open(out_file, "wb")
+        self._dtype = dtype
+        # number of elements of every item written so far
+        self._sizes = []
+
+    def add_item(self, tensor):
+        """Append *tensor* (anything exposing ``numpy()``) to the data file."""
+        np_array = np.array(tensor.numpy(), dtype=self._dtype)
+        self._data_file.write(np_array.tobytes(order="C"))
+        self._sizes.append(np_array.size)
+
+    def merge_file_(self, another_file):
+        """Append the dataset stored at *another_file* to this builder.
+
+        Raises:
+            AssertionError: if the other dataset's dtype differs.
+        """
+        # Concatenate index
+        index = MMapIndexedDataset.Index(index_file_path(another_file))
+        assert index.dtype == self._dtype
+
+        # index.sizes holds np.int32 scalars; store plain Python ints so the
+        # byte offsets derived from them later cannot wrap around at 32 bits
+        self._sizes.extend(int(size) for size in index.sizes)
+
+        # Concatenate data
+        with open(data_file_path(another_file), "rb") as f:
+            shutil.copyfileobj(f, self._data_file)
+
+    def finalize(self, index_file):
+        """Close the data file and write the index to *index_file*."""
+        self._data_file.close()
+
+        with MMapIndexedDataset.Index.writer(index_file, self._dtype) as index:
+            index.write(self._sizes)
--- a/fairseq/data/iterators.py
+++ b/fairseq/data/iterators.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import itertools
+import logging
+import math
+import operator
+import os
+import queue
+import time
+from threading import Thread
+
+import numpy as np
+import torch
+from fairseq.data import data_utils
+
+
+logger = logging.getLogger(__name__)
+
+# Unique marker object that BackgroundConsumer puts on its queue to signal
+# to the consuming side (BufferedIterator) that the source is exhausted.
+_sentinel = object()
+
+
+class CountingIterator(object):
+    """Iterator wrapper that keeps track of how many items were consumed.
+
+    Args:
+        iterable (iterable): the iterable being wrapped
+        start (int): initial value of the counter; the wrapped iterable is
+            *not* advanced to match. When omitted, ``iterable.n`` is used if
+            present, else 0.
+        total (int): value reported by ``__len__``; when omitted it is
+            ``start + len(iterable)``. A smaller value truncates iteration.
+
+    Attributes:
+        n (int): number of elements consumed from this iterator
+    """
+
+    def __init__(self, iterable, start=None, total=None):
+        self.iterable = iterable
+        # generator over self; its body only runs on the first next() call
+        self.itr = iter(self)
+
+        self.n = getattr(iterable, "n", 0) if start is None else start
+        self.total = self.n + len(iterable) if total is None else total
+
+    def __len__(self):
+        return self.total
+
+    def __iter__(self):
+        for item in self.iterable:
+            # more items than announced means the length bookkeeping is off
+            if self.n >= self.total:
+                raise RuntimeError(
+                    "Mismatch between actual and expected iterable length. "
+                    "This may be caused by resuming training from a checkpoint using "
+                    "a different number of GPUs, in which case you can try the "
+                    "--reset-dataloader option. Alternatively you may have a train or "
+                    "validation set that is smaller than the number of GPUs. If none "
+                    "of these apply, please report this to the fairseq developers."
+                )
+            self.n += 1
+            yield item
+
+    def __next__(self):
+        return next(self.itr)
+
+    def has_next(self):
+        """Return True while fewer than ``len(self)`` items were consumed."""
+        return self.n < len(self)
+
+    def skip(self, num_to_skip):
+        """Consume and discard up to *num_to_skip* elements; return self."""
+        for _ in itertools.islice(self.itr, num_to_skip):
+            pass
+        return self
+
+    def take(self, n):
+        """
+        Limit the iterator to at most n elements in total.
+        """
+        self.total = min(self.total, n)
+
+        # The wrapped iterable only has to supply what is still missing:
+        # self.n elements count as already consumed (e.g. when resuming
+        # from a checkpoint in the middle of an epoch), so they are
+        # subtracted here.
+        #
+        # This keeps self.total == self.n + len(iterable) true before
+        # __next__ or __iter__ is called.
+        remaining = max(n - self.n, 0)
+        wrapped = self.iterable
+        if not hasattr(wrapped, "take"):
+            self.iterable = itertools.islice(wrapped, remaining)
+        else:
+            wrapped.take(remaining)
+
+
+class EpochBatchIterating(object):
+    """Interface of multi-epoch batch iterators.
+
+    Every member raises ``NotImplementedError``; subclasses provide the
+    actual behaviour.
+    """
+
+    def __len__(self) -> int:
+        raise NotImplementedError
+
+    @property
+    def next_epoch_idx(self):
+        """Epoch index that the next call to *next_epoch_itr* will use."""
+        raise NotImplementedError
+
+    def next_epoch_itr(self, shuffle=True, fix_batches_to_gpus=False):
+        """Create and return the iterator for the next epoch.
+
+        Args:
+            shuffle (bool, optional): whether batches are shuffled before
+                the iterator is returned (default: True).
+            fix_batches_to_gpus: keep the assignment of batches to shards
+                stable across epochs. Requires that :attr:`dataset`
+                supports prefetching (default: False).
+        """
+        raise NotImplementedError
+
+    def end_of_epoch(self) -> bool:
+        """Tell whether the latest epoch iterator has no batches left."""
+        raise NotImplementedError
+
+    @property
+    def iterations_in_epoch(self) -> int:
+        """How many batches of the current epoch were already consumed."""
+        raise NotImplementedError
+
+    def state_dict(self):
+        """Return a dictionary describing the full state of the iterator."""
+        raise NotImplementedError
+
+    def load_state_dict(self, state_dict):
+        """Restore the iterator state from the given *state_dict*."""
+        raise NotImplementedError
+
+
+class StreamingEpochBatchIterator(EpochBatchIterating):
+    """Epoch iterator over a :class:`torch.utils.data.IterableDataset`.
+
+    Args:
+        dataset (~torch.utils.data.IterableDataset): dataset to iterate
+        epoch (int, optional): first epoch; values below 1 become 1
+            (default: 1).
+        num_shards (int, optional): number of shards the stream is split
+            into (default: 1).
+        shard_id (int, optional): shard served by this instance
+            (default: 0).
+    """
+
+    def __init__(self, dataset, epoch=1, num_shards=1, shard_id=0):
+        assert isinstance(dataset, torch.utils.data.IterableDataset)
+        self.dataset = dataset
+        # epochs are 1-based
+        self.epoch = max(epoch, 1)
+        self._current_epoch_iterator = None
+        self.num_shards = num_shards
+        self.shard_id = shard_id
+
+    @property
+    def next_epoch_idx(self):
+        """Return the epoch index after *next_epoch_itr* is called."""
+        epoch_started = self._current_epoch_iterator is not None
+        if epoch_started and self.end_of_epoch():
+            return self.epoch + 1
+        return self.epoch
+
+    def next_epoch_itr(self, shuffle=True, fix_batches_to_gpus=False):
+        """Build, remember and return the iterator of the next epoch.
+
+        *shuffle* and *fix_batches_to_gpus* are accepted for interface
+        compatibility and are not used here.
+        """
+        self.epoch = self.next_epoch_idx
+        if hasattr(self.dataset, "set_epoch"):
+            self.dataset.set_epoch(self.epoch)
+        shard = ShardedIterator(
+            iterable=self.dataset,
+            num_shards=self.num_shards,
+            shard_id=self.shard_id,
+        )
+        self._current_epoch_iterator = CountingIterator(iterable=shard)
+        return self._current_epoch_iterator
+
+    def end_of_epoch(self) -> bool:
+        """Tell whether the current epoch iterator has no items left."""
+        return not self._current_epoch_iterator.has_next()
+
+    @property
+    def iterations_in_epoch(self) -> int:
+        """Items consumed in the current epoch (0 before the first epoch)."""
+        current = self._current_epoch_iterator
+        return 0 if current is None else current.n
+
+    def state_dict(self):
+        """Return the state to checkpoint: only the epoch number."""
+        state = {"epoch": self.epoch}
+        return state
+
+    def load_state_dict(self, state_dict):
+        """Restore the epoch number from *state_dict*."""
+        self.epoch = state_dict["epoch"]
+
+
+class EpochBatchIterator(EpochBatchIterating):
+    """A multi-epoch iterator over a :class:`torch.utils.data.Dataset`.
+
+    Compared to :class:`torch.utils.data.DataLoader`, this iterator:
+
+    - can be reused across multiple epochs with the :func:`next_epoch_itr`
+      method (optionally shuffled between epochs)
+    - can be serialized/deserialized with the :func:`state_dict` and
+      :func:`load_state_dict` methods
+    - supports sharding with the *num_shards* and *shard_id* arguments
+
+    Args:
+        dataset (~torch.utils.data.Dataset): dataset from which to load the data
+        collate_fn (callable): merges a list of samples to form a mini-batch
+        batch_sampler (~torch.utils.data.Sampler or a callable): an iterator over batches of
+            indices, or a callable to create such an iterator (~torch.utils.data.Sampler).
+            A callable batch_sampler will be called for each epoch to enable per epoch dynamic
+            batch iterators defined by this callable batch_sampler.
+        seed (int, optional): seed for random number generator for
+            reproducibility (default: 1).
+        num_shards (int, optional): shard the data iterator into N
+            shards (default: 1).
+        shard_id (int, optional): which shard of the data iterator to
+            return (default: 0).
+        num_workers (int, optional): how many subprocesses to use for data
+            loading. 0 means the data will be loaded in the main process
+            (default: 0).
+        epoch (int, optional): the epoch to start the iterator from
+            (default: 1).
+        buffer_size (int, optional): the number of batches to keep ready in the
+            queue. Helps speeding up dataloading. When buffer_size is zero, the
+            default torch.utils.data.DataLoader preloading is used.
+        timeout (int, optional): if positive, the timeout value for collecting a batch
+            from workers. Should always be non-negative (default: ``0``).
+        disable_shuffling (bool, optional): force disable shuffling
+            (default: ``False``).
+    """
+
+    def __init__(
+        self,
+        dataset,
+        collate_fn,
+        batch_sampler,
+        seed=1,
+        num_shards=1,
+        shard_id=0,
+        num_workers=0,
+        epoch=1,
+        buffer_size=0,
+        timeout=0,
+        disable_shuffling=False,
+    ):
+        assert isinstance(dataset, torch.utils.data.Dataset)
+        self.dataset = dataset
+        self.collate_fn = collate_fn
+        self.batch_sampler = batch_sampler
+        # A non-callable sampler is materialized once here; a callable one is
+        # evaluated lazily (per epoch) by the ``frozen_batches`` property.
+        self._frozen_batches = (
+            tuple(batch_sampler) if not callable(batch_sampler) else None
+        )
+        self.seed = seed
+        self.num_shards = num_shards
+        self.shard_id = shard_id
+        self.num_workers = num_workers
+        # This upper limit here is to prevent people from abusing this feature
+        # in a shared computing environment.
+        self.buffer_size = min(buffer_size, 20)
+        self.timeout = timeout
+        self.disable_shuffling = disable_shuffling
+
+        self.epoch = max(epoch, 1)  # we use 1-based indexing for epochs
+        self.shuffle = not disable_shuffling
+        # iterator of the epoch in progress (set by next_epoch_itr)
+        self._cur_epoch_itr = None
+        # iterator prepared by load_state_dict for a mid-epoch resume
+        self._next_epoch_itr = None
+        self._supports_prefetch = getattr(dataset, "supports_prefetch", False)
+
+    @property
+    def frozen_batches(self):
+        """Tuple of all batches, built from a callable sampler on first use."""
+        if self._frozen_batches is None:
+            self._frozen_batches = tuple(self.batch_sampler(self.dataset, self.epoch))
+        return self._frozen_batches
+
+    @property
+    def first_batch(self):
+        """Collated first batch, or ``"DUMMY"`` if fetching is not supported.
+
+        Raises:
+            Exception: if there are no batches at all.
+        """
+        if len(self.frozen_batches) == 0:
+            raise Exception(
+                "The dataset is empty. This could indicate "
+                "that all elements in the dataset have been skipped. "
+                "Try increasing the max number of allowed tokens or using "
+                "a larger dataset."
+            )
+
+        if getattr(self.dataset, "supports_fetch_outside_dataloader", True):
+            return self.collate_fn([self.dataset[i] for i in self.frozen_batches[0]])
+        else:
+            return "DUMMY"
+
+    def __len__(self):
+        """Number of batches per shard, rounded up."""
+        return int(math.ceil(len(self.frozen_batches) / float(self.num_shards)))
+
+    @property
+    def n(self):
+        """Alias of :attr:`iterations_in_epoch`."""
+        return self.iterations_in_epoch
+
+    @property
+    def next_epoch_idx(self):
+        """Return the epoch index after *next_epoch_itr* is called."""
+        if self._next_epoch_itr is not None:
+            # a resumed (partially consumed) epoch is pending
+            return self.epoch
+        elif self._cur_epoch_itr is not None and self.end_of_epoch():
+            return self.epoch + 1
+        else:
+            return self.epoch
+
+    def next_epoch_itr(self, shuffle=True, fix_batches_to_gpus=False):
+        """Return a new iterator over the dataset.
+
+        Args:
+            shuffle (bool, optional): shuffle batches before returning the
+                iterator (default: True).
+            fix_batches_to_gpus: ensure that batches are always
+                allocated to the same shards across epochs. Requires
+                that :attr:`dataset` supports prefetching (default: False).
+        """
+        if self.disable_shuffling:
+            shuffle = False
+        self.epoch = self.next_epoch_idx
+        if hasattr(self.dataset, "set_epoch"):
+            self.dataset.set_epoch(self.epoch)
+        if self._next_epoch_itr is not None:
+            # use the iterator that load_state_dict prepared, exactly once
+            self._cur_epoch_itr = self._next_epoch_itr
+            self._next_epoch_itr = None
+        else:
+            if callable(self.batch_sampler):
+                # reset _frozen_batches to refresh the next epoch
+                self._frozen_batches = None
+            self._cur_epoch_itr = self._get_iterator_for_epoch(
+                self.epoch,
+                shuffle,
+                fix_batches_to_gpus=fix_batches_to_gpus,
+            )
+        self.shuffle = shuffle
+        return self._cur_epoch_itr
+
+    def end_of_epoch(self) -> bool:
+        """Returns whether the most recent epoch iterator has been exhausted"""
+        # NOTE(review): dereferences _cur_epoch_itr without a None check, so
+        # this fails if called before next_epoch_itr(); confirm callers.
+        return not self._cur_epoch_itr.has_next()
+
+    @property
+    def iterations_in_epoch(self):
+        """The number of consumed batches in the current epoch."""
+        if self._cur_epoch_itr is not None:
+            return self._cur_epoch_itr.n
+        elif self._next_epoch_itr is not None:
+            return self._next_epoch_itr.n
+        return 0
+
+    def state_dict(self):
+        """Returns a dictionary containing a whole state of the iterator."""
+        if self.end_of_epoch():
+            # a finished epoch is saved as the start of the following one
+            epoch = self.epoch + 1
+            iter_in_epoch = 0
+        else:
+            epoch = self.epoch
+            iter_in_epoch = self.iterations_in_epoch
+        return {
+            "version": 2,
+            "epoch": epoch,
+            "iterations_in_epoch": iter_in_epoch,
+            "shuffle": self.shuffle,
+        }
+
+    def load_state_dict(self, state_dict):
+        """Copies the state of the iterator from the given *state_dict*."""
+        self.epoch = state_dict["epoch"]
+        itr_pos = state_dict.get("iterations_in_epoch", 0)
+        # state dicts without a "version" key are treated as version 1
+        version = state_dict.get("version", 1)
+        if itr_pos > 0:
+            # fast-forward epoch iterator
+            self._next_epoch_itr = self._get_iterator_for_epoch(
+                self.epoch,
+                shuffle=state_dict.get("shuffle", True),
+                offset=itr_pos,
+            )
+            # None means the saved position is at or past the end of the epoch
+            if self._next_epoch_itr is None:
+                if version == 1:
+                    # legacy behavior: we finished the epoch, increment epoch counter
+                    self.epoch += 1
+                else:
+                    raise RuntimeError(
+                        "Cannot resume training due to dataloader mismatch, please "
+                        "report this to the fairseq developers. You can relaunch "
+                        "training with `--reset-dataloader` and it should work."
+                    )
+        else:
+            self._next_epoch_itr = None
+
+    def _get_iterator_for_epoch(
+        self, epoch, shuffle, fix_batches_to_gpus=False, offset=0
+    ):
+        """Build the iterator over this shard's batches for *epoch*.
+
+        Args:
+            epoch (int): epoch number; added to ``self.seed`` for shuffling
+            shuffle (bool): whether to shuffle the batches
+            fix_batches_to_gpus (bool): only consulted when the dataset
+                supports prefetching; batches are then sharded first and
+                shuffled within the shard afterwards
+            offset (int): number of leading batches of the shard to skip
+
+        Returns:
+            A :class:`CountingIterator` starting at *offset*, or ``None``
+            when a positive *offset* is at or beyond the number of batches
+            in the shard.
+        """
+
+        def shuffle_batches(batches, seed):
+            # shuffles in place under a temporary numpy seed
+            with data_utils.numpy_seed(seed):
+                np.random.shuffle(batches)
+            return batches
+
+        if self._supports_prefetch:
+            batches = self.frozen_batches
+
+            if shuffle and not fix_batches_to_gpus:
+                batches = shuffle_batches(list(batches), self.seed + epoch)
+
+            batches = list(
+                ShardedIterator(batches, self.num_shards, self.shard_id, fill_value=[])
+            )
+            # prefetch every index that appears in this shard's batches
+            self.dataset.prefetch([i for s in batches for i in s])
+
+            if shuffle and fix_batches_to_gpus:
+                # shard-specific seed: the shard keeps its batches, only
+                # their order changes
+                batches = shuffle_batches(batches, self.seed + epoch + self.shard_id)
+        else:
+            if shuffle:
+                batches = shuffle_batches(list(self.frozen_batches), self.seed + epoch)
+            else:
+                batches = self.frozen_batches
+            batches = list(
+                ShardedIterator(batches, self.num_shards, self.shard_id, fill_value=[])
+            )
+
+        if offset > 0 and offset >= len(batches):
+            return None
+
+        if self.num_workers > 0:
+            os.environ["PYTHONWARNINGS"] = "ignore:semaphore_tracker:UserWarning"
+
+        # Create data loader
+        itr = torch.utils.data.DataLoader(
+            self.dataset,
+            collate_fn=self.collate_fn,
+            batch_sampler=batches[offset:],
+            num_workers=self.num_workers,
+            timeout=self.timeout,
+        )
+
+        # Wrap with a BufferedIterator if needed
+        if self.buffer_size > 0:
+            itr = BufferedIterator(self.buffer_size, itr)
+
+        # Wrap with CountingIterator
+        itr = CountingIterator(itr, start=offset)
+        return itr
+
+
+class GroupedIterator(CountingIterator):
+    """Iterator wrapper that yields lists of up to *chunk_size* items.
+
+    Args:
+        iterable (iterable): the iterable being wrapped
+        chunk_size (int): number of items per chunk
+
+    Attributes:
+        n (int): number of elements consumed from this iterator
+    """
+
+    def __init__(self, iterable, chunk_size):
+        chunks = _chunk_iterator(iterable, chunk_size)
+        # both the starting count and the length are expressed in chunks,
+        # rounded up
+        consumed_chunks = int(math.ceil(getattr(iterable, "n", 0) / float(chunk_size)))
+        total_chunks = int(math.ceil(len(iterable) / float(chunk_size)))
+        super().__init__(chunks, start=consumed_chunks, total=total_chunks)
+        self.chunk_size = chunk_size
+
+
+def _chunk_iterator(itr, chunk_size):
+    """Yield consecutive lists of *chunk_size* items taken from *itr*.
+
+    The final list may be shorter; nothing is yielded for an empty *itr*.
+    """
+    pending = []
+    for item in itr:
+        pending.append(item)
+        if len(pending) != chunk_size:
+            continue
+        yield pending
+        pending = []
+    # leftover items that did not fill a whole chunk
+    if pending:
+        yield pending
+
+
+class ShardedIterator(CountingIterator):
+    """Iterate over one shard of an iterable, padding the shard to length.
+
+    Args:
+        iterable (iterable): the iterable being wrapped
+        num_shards (int): how many shards the iterable is split into
+        shard_id (int): the shard to iterate over
+        fill_value (Any, optional): value appended when the iterable does
+            not divide evenly into *num_shards* (default: None).
+
+    Attributes:
+        n (int): number of elements consumed from this iterator
+    """
+
+    def __init__(self, iterable, num_shards, shard_id, fill_value=None):
+        if shard_id >= num_shards or shard_id < 0:
+            raise ValueError("shard_id must be between 0 and num_shards")
+        sharded_len = int(math.ceil(len(iterable) / float(num_shards)))
+        # every num_shards-th element, starting at this shard's position
+        own_items = itertools.islice(iterable, shard_id, len(iterable), num_shards)
+        # pairing with range(sharded_len) pads short shards with fill_value
+        padded_pairs = itertools.zip_longest(
+            range(sharded_len), own_items, fillvalue=fill_value
+        )
+        itr = (pair[1] for pair in padded_pairs)
+        already_consumed = int(math.ceil(getattr(iterable, "n", 0) / float(num_shards)))
+        super().__init__(itr, start=already_consumed, total=sharded_len)
+
+
+class BackgroundConsumer(Thread):
+    """Thread that copies items from *source* onto *queue*.
+
+    After the source is exhausted (or *max_len* items were copied) the
+    module-level ``_sentinel`` is put on the queue. An exception raised
+    while doing so is put on the queue instead of propagating.
+    """
+
+    def __init__(self, queue, source, max_len):
+        super().__init__()
+
+        self._queue = queue
+        self._source = source
+        self._max_len = max_len
+        # number of items put on the queue so far
+        self.count = 0
+
+    def run(self):
+        try:
+            for element in self._source:
+                self._queue.put(element)
+                self.count += 1
+
+                # Enough items were produced: stop early
+                limit_reached = (
+                    self._max_len is not None and self.count >= self._max_len
+                )
+                if limit_reached:
+                    break
+
+            # Tell the consuming side that nothing more will arrive.
+            self._queue.put(_sentinel)
+        except Exception as err:
+            self._queue.put(err)
+
+
+class BufferedIterator(object):
+    """Iterator that is fed from a bounded queue filled by a background thread.
+
+    Args:
+        size (int): maximum number of items held in the queue
+        iterable (iterable): source of the items; must support ``len()``
+    """
+
+    def __init__(self, size, iterable):
+        self._queue = queue.Queue(size)
+        self._iterable = iterable
+        # the background thread is created on the first __next__ call
+        self._consumer = None
+
+        self.start_time = time.time()
+        # time of the last "buffer nearly empty" message, if any
+        self.warning_time = None
+
+        self.total = len(iterable)
+
+    def _create_consumer(self):
+        """Start the daemon thread that fills the queue."""
+        consumer = BackgroundConsumer(self._queue, self._iterable, self.total)
+        self._consumer = consumer
+        consumer.daemon = True
+        consumer.start()
+
+    def __iter__(self):
+        return self
+
+    def __len__(self):
+        return self.total
+
+    def take(self, n):
+        """Limit the reported length to at most *n* items."""
+        self.total = min(self.total, n)
+
+        # Forward the limit to the wrapped iterable when it supports it
+        source = self._iterable
+        if hasattr(source, "take"):
+            source.take(n)
+
+    def __next__(self):
+        # The background thread is started lazily
+        if self._consumer is None:
+            self._create_consumer()
+
+        # Report a possible data loading bottleneck: the queue is (nearly)
+        # empty, at least 5 minutes have passed since construction and the
+        # last report is more than 15 minutes old.
+        if (
+            self._queue.qsize() < min(2, max(1, self._queue.maxsize // 2))
+            and time.time() - self.start_time > 5 * 60
+            and (
+                self.warning_time is None
+                or time.time() - self.warning_time > 15 * 60
+            )
+        ):
+            logger.debug(
+                "Data loading buffer is empty or nearly empty. This may "
+                "indicate a data loading bottleneck, and increasing the "
+                "number of workers (--num-workers) may help."
+            )
+            self.warning_time = time.time()
+
+        # Block until the next element is available
+        element = self._queue.get(True)
+        if isinstance(element, Exception):
+            raise element
+        if element is _sentinel:
+            raise StopIteration()
+        return element
--- a/fairseq/data/language_pair_dataset.py
+++ b/fairseq/data/language_pair_dataset.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import numpy as np
+import torch
+from fairseq.data import FairseqDataset, data_utils
+
+
+# Module-level logger; collate() uses it to warn about alignment size mismatches.
+logger = logging.getLogger(__name__)
+
+
+def collate(
+    samples,
+    pad_idx,
+    eos_idx,
+    left_pad_source=True,
+    left_pad_target=False,
+    input_feeding=True,
+    pad_to_length=None,
+    pad_to_multiple=1,
+):
+    if len(samples) == 0:
+        return {}
+
+    def merge(key, left_pad, move_eos_to_beginning=False, pad_to_length=None):
+        return data_utils.collate_tokens(
+            [s[key] for s in samples],
+            pad_idx,
+            eos_idx,
+            left_pad,
+            move_eos_to_beginning,
+            pad_to_length=pad_to_length,
+            pad_to_multiple=pad_to_multiple,
+        )
+
+    def check_alignment(alignment, src_len, tgt_len):
+        if alignment is None or len(alignment) == 0:
+            return False
+        if (
+            alignment[:, 0].max().item() >= src_len - 1
+            or alignment[:, 1].max().item() >= tgt_len - 1
+        ):
+            logger.warning("alignment size mismatch found, skipping alignment!")
+            return False
+        return True
+
+    def compute_alignment_weights(alignments):
+        """
+        Given a tensor of shape [:, 2] containing the source-target indices
+        corresponding to the alignments, a weight vector containing the
+        inverse frequency of each target index is computed.
+        For e.g. if alignments = [[5, 7], [2, 3], [1, 3], [4, 2]], then
+        a tensor containing [1., 0.5, 0.5, 1] should be returned (since target
+        index 3 is repeated twice)
+        """
+        align_tgt = alignments[:, 1]
+        _, align_tgt_i, align_tgt_c = torch.unique(
+            align_tgt, return_inverse=True, return_counts=True
+        )
+        align_weights = align_tgt_c[align_tgt_i[np.arange(len(align_tgt))]]
+        return 1.0 / align_weights.float()
+
+    id = torch.LongTensor([s["id"] for s in samples])
+    src_tokens = merge(
+        "source",
+        left_pad=left_pad_source,
+        pad_to_length=pad_to_length["source"] if pad_to_length is not None else None,
+    )
+    # sort by descending source length
+    src_lengths = torch.LongTensor(
+        [s["source"].ne(pad_idx).long().sum() for s in samples]
+    )
+    src_lengths, sort_order = src_lengths.sort(descending=True)
+    id = id.index_select(0, sort_order)
+    src_tokens = src_tokens.index_select(0, sort_order)
+
+    prev_output_tokens = None
+    target = None
+    if samples[0].get("target", None) is not None:
+        target = merge(
+            "target",
+            left_pad=left_pad_target,
+            pad_to_length=pad_to_length["target"]
+            if pad_to_length is not None
+            else None,
+        )
+        target = target.index_select(0, sort_order)
+        tgt_lengths = torch.LongTensor(
+            [s["target"].ne(pad_idx).long().sum() for s in samples]
+        ).index_select(0, sort_order)
+        ntokens = tgt_lengths.sum().item()
+
+        if samples[0].get("prev_output_tokens", None) is not None:
+            prev_output_tokens = merge("prev_output_tokens", left_pad=left_pad_target)
+        elif input_feeding:
+            # we create a shifted version of targets for feeding the
+            # previous output token(s) into the next decoder step
+            prev_output_tokens = merge(
+                "target",
+                left_pad=left_pad_target,
+                move_eos_to_beginning=True,
+                pad_to_length=pad_to_length["target"]
+                if pad_to_length is not None
+                else None,
+            )
+    else:
+        ntokens = src_lengths.sum().item()
+
+    batch = {
+        "id": id,
+        "nsentences": len(samples),
+        "ntokens": ntokens,
+        "net_input": {
+            "src_tokens": src_tokens,
+            "src_lengths": src_lengths,
+        },
+        "target": target,
+    }
+    if prev_output_tokens is not None:
+        batch["net_input"]["prev_output_tokens"] = prev_output_tokens.index_select(
+            0, sort_order
+        )
+
+    if samples[0].get("alignment", None) is not None:
+        bsz, tgt_sz = batch["target"].shape
+        src_sz = batch["net_input"]["src_tokens"].shape[1]
+
+        offsets = torch.zeros((len(sort_order), 2), dtype=torch.long)
+        offsets[:, 1] += torch.arange(len(sort_order), dtype=torch.long) * tgt_sz
+        if left_pad_source:
+            offsets[:, 0] += src_sz - src_lengths
+        if left_pad_target:
+            offsets[:, 1] += tgt_sz - tgt_lengths
+
+        alignments = [
+            alignment + offset
+            for align_idx, offset, src_len, tgt_len in zip(
+                sort_order, offsets, src_lengths, tgt_lengths
+            )
+            for alignment in [samples[align_idx]["alignment"].view(-1, 2)]
+            if check_alignment(alignment, src_len, tgt_len)
+        ]
+
+        if len(alignments) > 0:
+            alignments = torch.cat(alignments, dim=0)
+            align_weights = compute_alignment_weights(alignments)
+
+            batch["alignments"] = alignments
+            batch["align_weights"] = align_weights
+
+    if samples[0].get("constraints", None) is not None:
+        # Collate the packed constraints across the samples, padding to
+        # the length of the longest sample.
+        lens = [sample.get("constraints").size(0) for sample in samples]
+        max_len = max(lens)
+        constraints = torch.zeros((len(samples), max(lens))).long()
+        for i, sample in enumerate(samples):
+            constraints[i, 0 : lens[i]] = samples[i].get("constraints")
+        batch["constraints"] = constraints
+
+    return batch
+
+
+class LanguagePairDataset(FairseqDataset):
+    """
+    A pair of torch.utils.data.Datasets.
+
+    Args:
+        src (torch.utils.data.Dataset): source dataset to wrap
+        src_sizes (List[int]): source sentence lengths
+        src_dict (~fairseq.data.Dictionary): source vocabulary
+        tgt (torch.utils.data.Dataset, optional): target dataset to wrap
+        tgt_sizes (List[int], optional): target sentence lengths
+        tgt_dict (~fairseq.data.Dictionary, optional): target vocabulary
+        left_pad_source (bool, optional): pad source tensors on the left side
+            (default: True).
+        left_pad_target (bool, optional): pad target tensors on the left side
+            (default: False).
+        shuffle (bool, optional): shuffle dataset elements before batching
+            (default: True).
+        input_feeding (bool, optional): create a shifted version of the targets
+            to be passed into the model for teacher forcing (default: True).
+        remove_eos_from_source (bool, optional): if set, removes eos from end
+            of source if it's present (default: False).
+        append_eos_to_target (bool, optional): if set, appends eos to end of
+            target if it's absent (default: False).
+        align_dataset (torch.utils.data.Dataset, optional): dataset
+            containing alignments.
+        constraints (Tensor, optional): 2d tensor with a concatenated, zero-
+            delimited list of constraints for each sentence.
+        append_bos (bool, optional): if set, appends bos to the beginning of
+            source/target sentence.
+        num_buckets (int, optional): if set to a value greater than 0, then
+            batches will be bucketed into the given number of batch shapes.
+        src_lang_id (int, optional): source language ID, if set, the collated batch
+            will contain a field 'src_lang_id' in 'net_input' which indicates the
+            source language of the samples.
+        tgt_lang_id (int, optional): target language ID, if set, the collated batch
+            will contain a field 'tgt_lang_id' which indicates the target language
+             of the samples.
+    """
+
+    def __init__(
+        self,
+        src,
+        src_sizes,
+        src_dict,
+        tgt=None,
+        tgt_sizes=None,
+        tgt_dict=None,
+        left_pad_source=True,
+        left_pad_target=False,
+        shuffle=True,
+        input_feeding=True,
+        remove_eos_from_source=False,
+        append_eos_to_target=False,
+        align_dataset=None,
+        constraints=None,
+        append_bos=False,
+        eos=None,
+        num_buckets=0,
+        src_lang_id=None,
+        tgt_lang_id=None,
+        pad_to_multiple=1,
+    ):
+        if tgt_dict is not None:
+            assert src_dict.pad() == tgt_dict.pad()
+            assert src_dict.eos() == tgt_dict.eos()
+            assert src_dict.unk() == tgt_dict.unk()
+        if tgt is not None:
+            assert len(src) == len(
+                tgt
+            ), "Source and target must contain the same number of examples"
+        self.src = src
+        self.tgt = tgt
+        self.src_sizes = np.array(src_sizes)
+        self.tgt_sizes = np.array(tgt_sizes) if tgt_sizes is not None else None
+        self.sizes = (
+            np.vstack((self.src_sizes, self.tgt_sizes)).T
+            if self.tgt_sizes is not None
+            else self.src_sizes
+        )
+        self.src_dict = src_dict
+        self.tgt_dict = tgt_dict
+        self.left_pad_source = left_pad_source
+        self.left_pad_target = left_pad_target
+        self.shuffle = shuffle
+        self.input_feeding = input_feeding
+        self.remove_eos_from_source = remove_eos_from_source
+        self.append_eos_to_target = append_eos_to_target
+        self.align_dataset = align_dataset
+        if self.align_dataset is not None:
+            assert (
+                self.tgt_sizes is not None
+            ), "Both source and target needed when alignments are provided"
+        self.constraints = constraints
+        self.append_bos = append_bos
+        self.eos = eos if eos is not None else src_dict.eos()
+        self.src_lang_id = src_lang_id
+        self.tgt_lang_id = tgt_lang_id
+        if num_buckets > 0:
+            from fairseq.data import BucketPadLengthDataset
+
+            self.src = BucketPadLengthDataset(
+                self.src,
+                sizes=self.src_sizes,
+                num_buckets=num_buckets,
+                pad_idx=self.src_dict.pad(),
+                left_pad=self.left_pad_source,
+            )
+            self.src_sizes = self.src.sizes
+            logger.info("bucketing source lengths: {}".format(list(self.src.buckets)))
+            if self.tgt is not None:
+                self.tgt = BucketPadLengthDataset(
+                    self.tgt,
+                    sizes=self.tgt_sizes,
+                    num_buckets=num_buckets,
+                    pad_idx=self.tgt_dict.pad(),
+                    left_pad=self.left_pad_target,
+                )
+                self.tgt_sizes = self.tgt.sizes
+                logger.info(
+                    "bucketing target lengths: {}".format(list(self.tgt.buckets))
+                )
+
+            # determine bucket sizes using self.num_tokens, which will return
+            # the padded lengths (thanks to BucketPadLengthDataset)
+            num_tokens = np.vectorize(self.num_tokens, otypes=[np.long])
+            self.bucketed_num_tokens = num_tokens(np.arange(len(self.src)))
+            self.buckets = [
+                (None, num_tokens) for num_tokens in np.unique(self.bucketed_num_tokens)
+            ]
+        else:
+            self.buckets = None
+        self.pad_to_multiple = pad_to_multiple
+
+    def get_batch_shapes(self):
+        return self.buckets
+
+    def __getitem__(self, index):
+        tgt_item = self.tgt[index] if self.tgt is not None else None
+        src_item = self.src[index]
+        # Append EOS to end of tgt sentence if it does not have an EOS and remove
+        # EOS from end of src sentence if it exists. This is useful when we use
+        # use existing datasets for opposite directions i.e., when we want to
+        # use tgt_dataset as src_dataset and vice versa
+        if self.append_eos_to_target:
+            eos = self.tgt_dict.eos() if self.tgt_dict else self.src_dict.eos()
+            if self.tgt and self.tgt[index][-1] != eos:
+                tgt_item = torch.cat([self.tgt[index], torch.LongTensor([eos])])
+
+        if self.append_bos:
+            bos = self.tgt_dict.bos() if self.tgt_dict else self.src_dict.bos()
+            if self.tgt and self.tgt[index][0] != bos:
+                tgt_item = torch.cat([torch.LongTensor([bos]), self.tgt[index]])
+
+            bos = self.src_dict.bos()
+            if self.src[index][0] != bos:
+                src_item = torch.cat([torch.LongTensor([bos]), self.src[index]])
+
+        if self.remove_eos_from_source:
+            eos = self.src_dict.eos()
+            if self.src[index][-1] == eos:
+                src_item = self.src[index][:-1]
+
+        example = {
+            "id": index,
+            "source": src_item,
+            "target": tgt_item,
+        }
+        if self.align_dataset is not None:
+            example["alignment"] = self.align_dataset[index]
+        if self.constraints is not None:
+            example["constraints"] = self.constraints[index]
+        return example
+
+    def __len__(self):
+        return len(self.src)
+
+    def collater(self, samples, pad_to_length=None):
+        """Merge a list of samples to form a mini-batch.
+
+        Args:
+            samples (List[dict]): samples to collate
+            pad_to_length (dict, optional): a dictionary of
+                {'source': source_pad_to_length, 'target': target_pad_to_length}
+                to indicate the max length to pad to in source and target respectively.
+
+        Returns:
+            dict: a mini-batch with the following keys:
+
+                - `id` (LongTensor): example IDs in the original input order
+                - `ntokens` (int): total number of tokens in the batch
+                - `net_input` (dict): the input to the Model, containing keys:
+
+                  - `src_tokens` (LongTensor): a padded 2D Tensor of tokens in
+                    the source sentence of shape `(bsz, src_len)`. Padding will
+                    appear on the left if *left_pad_source* is ``True``.
+                  - `src_lengths` (LongTensor): 1D Tensor of the unpadded
+                    lengths of each source sentence of shape `(bsz)`
+                  - `prev_output_tokens` (LongTensor): a padded 2D Tensor of
+                    tokens in the target sentence, shifted right by one
+                    position for teacher forcing, of shape `(bsz, tgt_len)`.
+                    This key will not be present if *input_feeding* is
+                    ``False``.  Padding will appear on the left if
+                    *left_pad_target* is ``True``.
+                  - `src_lang_id` (LongTensor): a long Tensor which contains source
+                    language IDs of each sample in the batch
+
+                - `target` (LongTensor): a padded 2D Tensor of tokens in the
+                  target sentence of shape `(bsz, tgt_len)`. Padding will appear
+                  on the left if *left_pad_target* is ``True``.
+                - `tgt_lang_id` (LongTensor): a long Tensor which contains target language
+                   IDs of each sample in the batch
+        """
+        res = collate(
+            samples,
+            pad_idx=self.src_dict.pad(),
+            eos_idx=self.eos,
+            left_pad_source=self.left_pad_source,
+            left_pad_target=self.left_pad_target,
+            input_feeding=self.input_feeding,
+            pad_to_length=pad_to_length,
+            pad_to_multiple=self.pad_to_multiple,
+        )
+        if self.src_lang_id is not None or self.tgt_lang_id is not None:
+            src_tokens = res["net_input"]["src_tokens"]
+            bsz = src_tokens.size(0)
+            if self.src_lang_id is not None:
+                res["net_input"]["src_lang_id"] = (
+                    torch.LongTensor([[self.src_lang_id]]).expand(bsz, 1).to(src_tokens)
+                )
+            if self.tgt_lang_id is not None:
+                res["tgt_lang_id"] = (
+                    torch.LongTensor([[self.tgt_lang_id]]).expand(bsz, 1).to(src_tokens)
+                )
+        return res
+
+    def num_tokens(self, index):
+        """Return the number of tokens in a sample. This value is used to
+        enforce ``--max-tokens`` during batching."""
+        return max(
+            self.src_sizes[index],
+            self.tgt_sizes[index] if self.tgt_sizes is not None else 0,
+        )
+
+    def size(self, index):
+        """Return an example's size as a float or tuple. This value is used when
+        filtering a dataset with ``--max-positions``."""
+        return (
+            self.src_sizes[index],
+            self.tgt_sizes[index] if self.tgt_sizes is not None else 0,
+        )
+
+    def ordered_indices(self):
+        """Return an ordered list of indices. Batches will be constructed based
+        on this order."""
+        if self.shuffle:
+            indices = np.random.permutation(len(self)).astype(np.int64)
+        else:
+            indices = np.arange(len(self), dtype=np.int64)
+        if self.buckets is None:
+            # sort by target length, then source length
+            if self.tgt_sizes is not None:
+                indices = indices[np.argsort(self.tgt_sizes[indices], kind="mergesort")]
+            return indices[np.argsort(self.src_sizes[indices], kind="mergesort")]
+        else:
+            # sort by bucketed_num_tokens, which is:
+            #   max(padded_src_len, padded_tgt_len)
+            return indices[
+                np.argsort(self.bucketed_num_tokens[indices], kind="mergesort")
+            ]
+
+    @property
+    def supports_prefetch(self):
+        return getattr(self.src, "supports_prefetch", False) and (
+            getattr(self.tgt, "supports_prefetch", False) or self.tgt is None
+        )
+
+    def prefetch(self, indices):
+        self.src.prefetch(indices)
+        if self.tgt is not None:
+            self.tgt.prefetch(indices)
+        if self.align_dataset is not None:
+            self.align_dataset.prefetch(indices)
+
+    def filter_indices_by_size(self, indices, max_sizes):
+        """Filter a list of sample indices. Remove those that are longer
+            than specified in max_sizes.
+
+        Args:
+            indices (np.array): original array of sample indices
+            max_sizes (int or list[int] or tuple[int]): max sample size,
+                can be defined separately for src and tgt (then list or tuple)
+
+        Returns:
+            np.array: filtered sample array
+            list: list of removed indices
+        """
+        return data_utils.filter_paired_dataset_indices_by_size(
+            self.src_sizes,
+            self.tgt_sizes,
+            indices,
+            max_sizes,
+        )
--- a/fairseq/data/legacy/__init__.py
+++ b/fairseq/data/legacy/__init__.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .block_pair_dataset import BlockPairDataset
+from .masked_lm_dataset import MaskedLMDataset
+from .masked_lm_dictionary import BertDictionary, MaskedLMDictionary
+
+
+__all__ = [
+    "BertDictionary",
+    "BlockPairDataset",
+    "MaskedLMDataset",
+    "MaskedLMDictionary",
+]
--- a/fairseq/data/legacy/__pycache__/__init__.cpython-38.pyc
+++ b/fairseq/data/legacy/__pycache__/__init__.cpython-38.pyc
--- a/fairseq/data/legacy/__pycache__/block_pair_dataset.cpython-38.pyc
+++ b/fairseq/data/legacy/__pycache__/block_pair_dataset.cpython-38.pyc
--- a/fairseq/data/legacy/__pycache__/masked_lm_dataset.cpython-38.pyc
+++ b/fairseq/data/legacy/__pycache__/masked_lm_dataset.cpython-38.pyc
--- a/fairseq/data/legacy/__pycache__/masked_lm_dictionary.cpython-38.pyc
+++ b/fairseq/data/legacy/__pycache__/masked_lm_dictionary.cpython-38.pyc
--- a/fairseq/data/legacy/block_pair_dataset.py
+++ b/fairseq/data/legacy/block_pair_dataset.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import numpy as np
+import torch
+from fairseq.data import FairseqDataset
+
+
+class BlockPairDataset(FairseqDataset):
+    """Break a Dataset of tokens into sentence pair blocks for next sentence
+       prediction as well as masked language model.
+
+       High-level logics are:
+       1. break input tensor to tensor blocks
+       2. pair the blocks with 50% next sentence and 50% random sentence
+       3. return paired blocks as well as related segment labels
+
+    Args:
+        dataset (~torch.utils.data.Dataset): dataset to break into blocks
+        sizes: array of sentence lengths
+        dictionary: dictionary for the task
+        block_size: maximum block size
+        break_mode: mode for breaking copurs into block pairs. currently we support
+            2 modes
+            doc: respect document boundaries and each part of the pair should belong to on document
+            none: don't respect any boundary and cut tokens evenly
+        short_seq_prob: probability for generating shorter block pairs
+        doc_break_size: Size for empty line separating documents. Typically 1 if
+                        the sentences have eos, 0 otherwise.
+    """
+
+    def __init__(
+        self,
+        dataset,
+        dictionary,
+        sizes,
+        block_size,
+        break_mode="doc",
+        short_seq_prob=0.1,
+        doc_break_size=1,
+    ):
+        super().__init__()
+        self.dataset = dataset
+        self.pad = dictionary.pad()
+        self.eos = dictionary.eos()
+        self.cls = dictionary.cls()
+        self.mask = dictionary.mask()
+        self.sep = dictionary.sep()
+        self.break_mode = break_mode
+        self.dictionary = dictionary
+        self.short_seq_prob = short_seq_prob
+        self.block_indices = []
+
+        assert len(dataset) == len(sizes)
+
+        if break_mode == "doc":
+            cur_doc = []
+            for sent_id, sz in enumerate(sizes):
+                assert doc_break_size == 0 or sz != 0, (
+                    "when doc_break_size is non-zero, we expect documents to be"
+                    "separated by a blank line with a single eos."
+                )
+                # empty line as document separator
+                if sz == doc_break_size:
+                    if len(cur_doc) == 0:
+                        continue
+                    self.block_indices.append(cur_doc)
+                    cur_doc = []
+                else:
+                    cur_doc.append(sent_id)
+            max_num_tokens = block_size - 3  # Account for [CLS], [SEP], [SEP]
+            self.sent_pairs = []
+            self.sizes = []
+            for doc_id, doc in enumerate(self.block_indices):
+                self._generate_sentence_pair(doc, doc_id, max_num_tokens, sizes)
+        elif break_mode is None or break_mode == "none":
+            # each block should have half of the block size since we are constructing block pair
+            sent_length = (block_size - 3) // 2
+            total_len = sum(dataset.sizes)
+            length = math.ceil(total_len / sent_length)
+
+            def block_at(i):
+                start = i * sent_length
+                end = min(start + sent_length, total_len)
+                return (start, end)
+
+            sent_indices = np.array([block_at(i) for i in range(length)])
+            sent_sizes = np.array([e - s for s, e in sent_indices])
+            dataset_index = self._sent_to_dataset_index(sent_sizes)
+
+            # pair sentences
+            self._pair_sentences(dataset_index)
+        else:
+            raise ValueError("Invalid break_mode: " + break_mode)
+
+    def _pair_sentences(self, dataset_index):
+        """
+        Give a list of evenly cut blocks/sentences, pair these sentences with 50%
+        consecutive sentences and 50% random sentences.
+        This is used for none break mode
+        """
+        # pair sentences
+        for sent_id, sent in enumerate(dataset_index):
+            next_sent_label = (
+                1 if np.random.rand() > 0.5 and sent_id != len(dataset_index) - 1 else 0
+            )
+            if next_sent_label:
+                next_sent = dataset_index[sent_id + 1]
+            else:
+                next_sent = dataset_index[
+                    self._skip_sampling(len(dataset_index), [sent_id, sent_id + 1])
+                ]
+            self.sent_pairs.append((sent, next_sent, next_sent_label))
+
+            # The current blocks don't include the special tokens but the
+            # sizes already account for this
+            self.sizes.append(3 + sent[3] + next_sent[3])
+
+    def _sent_to_dataset_index(self, sent_sizes):
+        """
+        Build index mapping block indices to the underlying dataset indices
+        """
+        dataset_index = []
+        ds_idx, ds_remaining = -1, 0
+        for to_consume in sent_sizes:
+            sent_size = to_consume
+            if ds_remaining == 0:
+                ds_idx += 1
+                ds_remaining = sent_sizes[ds_idx]
+            start_ds_idx = ds_idx
+            start_offset = sent_sizes[ds_idx] - ds_remaining
+            while to_consume > ds_remaining:
+                to_consume -= ds_remaining
+                ds_idx += 1
+                ds_remaining = sent_sizes[ds_idx]
+            ds_remaining -= to_consume
+            dataset_index.append(
+                (
+                    start_ds_idx,  # starting index in dataset
+                    start_offset,  # starting offset within starting index
+                    ds_idx,  # ending index in dataset
+                    sent_size,  # sentence length
+                )
+            )
+        assert ds_remaining == 0
+        assert ds_idx == len(self.dataset) - 1
+        return dataset_index
+
+    def _generate_sentence_pair(self, doc, doc_id, max_num_tokens, sizes):
+        """
+        Go through a single document and genrate sentence paris from it
+        """
+        current_chunk = []
+        current_length = 0
+        curr = 0
+        # To provide more randomness, we decrease target seq length for parts of
+        # samples (10% by default). Note that max_num_tokens is the hard threshold
+        # for batching and will never be changed.
+        target_seq_length = max_num_tokens
+        if np.random.random() < self.short_seq_prob:
+            target_seq_length = np.random.randint(2, max_num_tokens)
+        # loop through all sentences in document
+        while curr < len(doc):
+            sent_id = doc[curr]
+            current_chunk.append(sent_id)
+            current_length = sum(sizes[current_chunk])
+            # split chunk and generate pair when exceed target_seq_length or
+            # finish the loop
+            if curr == len(doc) - 1 or current_length >= target_seq_length:
+                # split the chunk into 2 parts
+                a_end = 1
+                if len(current_chunk) > 2:
+                    a_end = np.random.randint(1, len(current_chunk) - 1)
+                sent_a = current_chunk[:a_end]
+                len_a = sum(sizes[sent_a])
+                # generate next sentence label, note that if there is only 1 sentence
+                # in current chunk, label is always 0
+                next_sent_label = (
+                    1 if np.random.rand() > 0.5 and len(current_chunk) != 1 else 0
+                )
+                if not next_sent_label:
+                    # if next sentence label is 0, sample sent_b from a random doc
+                    target_b_length = target_seq_length - len_a
+                    rand_doc_id = self._skip_sampling(len(self.block_indices), [doc_id])
+                    random_doc = self.block_indices[rand_doc_id]
+                    random_start = np.random.randint(0, len(random_doc))
+                    sent_b = []
+                    len_b = 0
+                    for j in range(random_start, len(random_doc)):
+                        sent_b.append(random_doc[j])
+                        len_b = sum(sizes[sent_b])
+                        if len_b >= target_b_length:
+                            break
+                    # return the second part of the chunk since it's not used
+                    num_unused_segments = len(current_chunk) - a_end
+                    curr -= num_unused_segments
+                else:
+                    # if next sentence label is 1, use the second part of chunk as sent_B
+                    sent_b = current_chunk[a_end:]
+                    len_b = sum(sizes[sent_b])
+                # currently sent_a and sent_B may be longer than max_num_tokens,
+                # truncate them and return block idx and offsets for them
+                sent_a, sent_b = self._truncate_sentences(
+                    sent_a, sent_b, max_num_tokens
+                )
+                self.sent_pairs.append((sent_a, sent_b, next_sent_label))
+                self.sizes.append(3 + sent_a[3] + sent_b[3])
+                current_chunk = []
+            curr += 1
+
+    def _skip_sampling(self, total, skip_ids):
+        """
+        Generate a random integer which is not in skip_ids. Sample range is [0, total)
+        TODO: ids in skip_ids should be consecutive, we can extend it to more generic version later
+        """
+        rand_id = np.random.randint(total - len(skip_ids))
+        return rand_id if rand_id < min(skip_ids) else rand_id + len(skip_ids)
+
+    def _truncate_sentences(self, sent_a, sent_b, max_num_tokens):
+        """
+        Trancate a pair of sentence to limit total length under max_num_tokens
+        Logics:
+            1. Truncate longer sentence
+            2. Tokens to be truncated could be at the beginning or the end of the sentnce
+        Returns:
+            Truncated sentences represented by dataset idx
+        """
+        len_a, len_b = sum(self.dataset.sizes[sent_a]), sum(self.dataset.sizes[sent_b])
+        front_cut_a = front_cut_b = end_cut_a = end_cut_b = 0
+
+        while True:
+            total_length = (
+                len_a + len_b - front_cut_a - front_cut_b - end_cut_a - end_cut_b
+            )
+            if total_length <= max_num_tokens:
+                break
+
+            if len_a - front_cut_a - end_cut_a > len_b - front_cut_b - end_cut_b:
+                if np.random.rand() < 0.5:
+                    front_cut_a += 1
+                else:
+                    end_cut_a += 1
+            else:
+                if np.random.rand() < 0.5:
+                    front_cut_b += 1
+                else:
+                    end_cut_b += 1
+
+        # calculate ds indices as well as offsets and return
+        truncated_sent_a = self._cut_sentence(sent_a, front_cut_a, end_cut_a)
+        truncated_sent_b = self._cut_sentence(sent_b, front_cut_b, end_cut_b)
+        return truncated_sent_a, truncated_sent_b
+
+    def _cut_sentence(self, sent, front_cut, end_cut):
+        """
+        Cut a sentence based on the numbers of tokens to be cut from beginning and end
+        Represent the sentence as dataset idx and return
+        """
+        start_ds_idx, end_ds_idx, offset = sent[0], sent[-1], 0
+        target_len = sum(self.dataset.sizes[sent]) - front_cut - end_cut
+        while front_cut > 0:
+            if self.dataset.sizes[start_ds_idx] > front_cut:
+                offset += front_cut
+                break
+            else:
+                front_cut -= self.dataset.sizes[start_ds_idx]
+                start_ds_idx += 1
+        while end_cut > 0:
+            if self.dataset.sizes[end_ds_idx] > end_cut:
+                break
+            else:
+                end_cut -= self.dataset.sizes[end_ds_idx]
+                end_ds_idx -= 1
+        return start_ds_idx, offset, end_ds_idx, target_len
+
+    def _fetch_block(self, start_ds_idx, offset, end_ds_idx, length):
+        """
+        Fetch a block of tokens based on its dataset idx
+        """
+        buffer = torch.cat(
+            [self.dataset[idx] for idx in range(start_ds_idx, end_ds_idx + 1)]
+        )
+        s, e = offset, offset + length
+        return buffer[s:e]
+
+    def __getitem__(self, index):
+        block1, block2, next_sent_label = self.sent_pairs[index]
+        block1 = self._fetch_block(*block1)
+        block2 = self._fetch_block(*block2)
+        return block1, block2, next_sent_label
+
+    def __len__(self):
+        return len(self.sizes)
+
+    @property
+    def supports_prefetch(self):
+        return getattr(self.dataset, "supports_prefetch", False)
+
+    def prefetch(self, indices):
+        prefetch_idx = set()
+        for index in indices:
+            for block1, block2, _ in [self.sent_pairs[index]]:
+                for ds_idx in range(block1[0], block1[2] + 1):
+                    prefetch_idx.add(ds_idx)
+                for ds_idx in range(block2[0], block2[2] + 1):
+                    prefetch_idx.add(ds_idx)
+        self.dataset.prefetch(prefetch_idx)
--- a/fairseq/data/legacy/masked_lm_dataset.py
+++ b/fairseq/data/legacy/masked_lm_dataset.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from typing import Dict, List, Tuple
+
+import numpy as np
+import torch
+from fairseq.data import Dictionary, FairseqDataset, data_utils
+from fairseq.data.concat_dataset import ConcatDataset
+from fairseq.data.legacy.block_pair_dataset import BlockPairDataset
+from fairseq.data.token_block_dataset import TokenBlockDataset
+
+
+class MaskedLMDataset(FairseqDataset):
+    """
+    A wrapper Dataset for masked language modelling. The dataset
+    wraps around TokenBlockDataset or BlockedPairDataset and creates a batch
+    where the input blocks are masked according to the specified masking
+    probability. Additionally the batch can also contain sentence level targets
+    if this is specified.
+
+    Args:
+        dataset: Dataset which generates blocks of data. Only BlockPairDataset
+            and TokenBlockDataset are supported.
+        sizes: Sentence lengths
+        vocab: Dictionary with the vocabulary and special tokens.
+        pad_idx: Id of padding token in dictionary
+        mask_idx: Id of mask token in dictionary
+        classif_token_idx: Id of classification token in dictionary. This is the
+            token associated with the sentence embedding (Eg: CLS for BERT)
+        sep_token_idx: Id of separator token in dictionary
+            (Eg: SEP in BERT)
+        seed: Seed for random number generator for reproducibility.
+        shuffle: Shuffle the elements before batching.
+        has_pairs: Specifies whether the underlying dataset
+            generates a pair of blocks along with a sentence_target or not.
+            Setting it to True assumes that the underlying dataset generates a
+            label for the pair of sentences which is surfaced as
+            sentence_target. The default value assumes a single block with no
+            sentence target.
+        segment_id: An optional segment id for filling in the segment labels
+            when we are in the single block setting (Eg: XLM). Default is 0.
+        masking_ratio: specifies what percentage of the blocks should be masked.
+        masking_prob: specifies the probability of a given token being
+            replaced with the "MASK" token.
+        random_token_prob: specifies the probability of a given token being
+            replaced by a random token from the vocabulary.
+    """
+
+    def __init__(
+        self,
+        dataset: FairseqDataset,
+        sizes: np.ndarray,
+        vocab: Dictionary,
+        pad_idx: int,
+        mask_idx: int,
+        classif_token_idx: int,
+        sep_token_idx: int,
+        seed: int = 1,
+        shuffle: bool = True,
+        has_pairs: bool = True,
+        segment_id: int = 0,
+        masking_ratio: float = 0.15,
+        masking_prob: float = 0.8,
+        random_token_prob: float = 0.1,
+    ):
+        # Make sure the input datasets are the ones supported
+        assert (
+            isinstance(dataset, TokenBlockDataset)
+            or isinstance(dataset, BlockPairDataset)
+            or isinstance(dataset, ConcatDataset)
+        ), (
+            "MaskedLMDataset only wraps TokenBlockDataset or BlockPairDataset or "
+            "ConcatDataset"
+        )
+
+        self.dataset = dataset
+        self.sizes = np.array(sizes)
+        self.vocab = vocab
+        self.pad_idx = pad_idx
+        self.mask_idx = mask_idx
+        self.classif_token_idx = classif_token_idx
+        self.sep_token_idx = sep_token_idx
+        self.shuffle = shuffle
+        self.seed = seed
+        self.has_pairs = has_pairs
+        self.segment_id = segment_id
+        self.masking_ratio = masking_ratio
+        self.masking_prob = masking_prob
+        self.random_token_prob = random_token_prob
+
+        # If we have only one block then sizes needs to be updated to include
+        # the classification token
+        if not has_pairs:
+            self.sizes = self.sizes + 1
+
+    def __getitem__(self, index: int):
+        # if has_pairs, then expect 2 blocks and a sentence target
+        if self.has_pairs:
+            (block_one, block_two, sentence_target) = self.dataset[index]
+        else:
+            block_one = self.dataset[index]
+
+        return {
+            "id": index,
+            "block_one": block_one,
+            "block_two": block_two if self.has_pairs else None,
+            "sentence_target": sentence_target if self.has_pairs else None,
+        }
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def _mask_block(
+        self,
+        sentence: np.ndarray,
+        mask_idx: int,
+        pad_idx: int,
+        dictionary_token_range: Tuple,
+    ):
+        """
+        Mask tokens for Masked Language Model training
+        Samples mask_ratio tokens that will be predicted by LM.
+
+        Note:This function may not be efficient enough since we had multiple
+        conversions between np and torch, we can replace them with torch
+        operators later.
+
+        Args:
+            sentence: 1d tensor to be masked
+            mask_idx: index to use for masking the sentence
+            pad_idx: index to use for masking the target for tokens we aren't
+                predicting
+            dictionary_token_range: range of indices in dictionary which can
+                be used for random word replacement
+                (e.g. without special characters)
+        Return:
+            masked_sent: masked sentence
+            target: target with words which we are not predicting replaced
+                by pad_idx
+        """
+        masked_sent = np.copy(sentence)
+        sent_length = len(sentence)
+        mask_num = math.ceil(sent_length * self.masking_ratio)
+        mask = np.random.choice(sent_length, mask_num, replace=False)
+        target = np.copy(sentence)
+
+        for i in range(sent_length):
+            if i in mask:
+                rand = np.random.random()
+
+                # replace with mask if probability is less than masking_prob
+                # (Eg: 0.8)
+                if rand < self.masking_prob:
+                    masked_sent[i] = mask_idx
+
+                # replace with random token if probability is less than
+                # masking_prob + random_token_prob (Eg: 0.9)
+                elif rand < (self.masking_prob + self.random_token_prob):
+                    # sample random token from dictionary
+                    masked_sent[i] = np.random.randint(
+                        dictionary_token_range[0], dictionary_token_range[1]
+                    )
+            else:
+                target[i] = pad_idx
+
+        return masked_sent, target
+
+    def _collate(self, samples: List[Dict], pad_idx: int, eos_idx: int):
+        """
+        Does the heavy lifting for creating a batch from the input list of
+        examples. The logic is as follows:
+            1. Mask the input blocks. In case has_pair is True then we have 2
+               blocks to mask.
+            2. Prepend the first masked block tensor with the special token
+               used as sentence embedding. Eg: CLS in BERT. This happens
+               irrespective of the value of has_pair.
+            3. If has_pair is True, then append the first masked block with the
+               special separator token (eg: SEP for BERT) and compute segment
+               label accordingly. In this case, also append the second masked
+               block with this special separator token and compute its segment
+               label.
+            4. For the targets tensor, prepend and append with padding index
+               accordingly.
+            5. Concatenate all tensors.
+        """
+        if len(samples) == 0:
+            return {}
+        # To ensure determinism, we reset the state of the PRNG after every
+        # batch based on the seed and the first id of the batch. This ensures
+        # that across epochs we get the same mask for the same example. This
+        # is needed for reproducibility and is how BERT does masking
+        # TODO: Can we add deteminism without this constraint?
+        with data_utils.numpy_seed(self.seed + samples[0]["id"]):
+            for s in samples:
+
+                # token range is needed for replacing with random token during
+                # masking
+                token_range = (self.vocab.nspecial, len(self.vocab))
+
+                # mask according to specified probabilities.
+                masked_blk_one, masked_tgt_one = self._mask_block(
+                    s["block_one"],
+                    self.mask_idx,
+                    self.pad_idx,
+                    token_range,
+                )
+
+                tokens = np.concatenate([[self.classif_token_idx], masked_blk_one])
+                targets = np.concatenate([[self.pad_idx], masked_tgt_one])
+                segments = np.ones(len(tokens)) * self.segment_id
+
+                # if has_pairs is True then we need to add the SEP token to both
+                # the blocks after masking and re-compute segments based on the new
+                # lengths.
+                if self.has_pairs:
+                    tokens_one = np.concatenate([tokens, [self.sep_token_idx]])
+                    targets_one = np.concatenate([targets, [self.pad_idx]])
+
+                    masked_blk_two, masked_tgt_two = self._mask_block(
+                        s["block_two"], self.mask_idx, self.pad_idx, token_range
+                    )
+                    tokens_two = np.concatenate([masked_blk_two, [self.sep_token_idx]])
+                    targets_two = np.concatenate([masked_tgt_two, [self.pad_idx]])
+
+                    # block + 1 sep + 1 special (CLS)
+                    segments_one = np.zeros(len(tokens_one))
+                    # block + 1 sep
+                    segments_two = np.ones(len(tokens_two))
+
+                    tokens = np.concatenate([tokens_one, tokens_two])
+                    targets = np.concatenate([targets_one, targets_two])
+                    segments = np.concatenate([segments_one, segments_two])
+
+                s["source"] = torch.LongTensor(tokens)
+                s["segment_labels"] = torch.LongTensor(segments)
+                s["lm_target"] = torch.LongTensor(targets)
+
+        def merge(key):
+            return data_utils.collate_tokens(
+                [s[key] for s in samples], pad_idx, eos_idx, left_pad=False
+            )
+
+        return {
+            "id": torch.LongTensor([s["id"] for s in samples]),
+            "ntokens": sum(len(s["source"]) for s in samples),
+            "net_input": {
+                "src_tokens": merge("source"),
+                "segment_labels": merge("segment_labels"),
+            },
+            "lm_target": merge("lm_target"),
+            "sentence_target": torch.LongTensor([s["sentence_target"] for s in samples])
+            if self.has_pairs
+            else None,
+            "nsentences": len(samples),
+        }
+
+    def collater(self, samples: List[Dict]):
+        """Merge a list of samples to form a mini-batch.
+
+        Args:
+            samples (List[dict]): samples to collate
+
+        Returns:
+            dict: a mini-batch of data
+        """
+        return self._collate(samples, self.vocab.pad(), self.vocab.eos())
+
+    def num_tokens(self, index: int):
+        """
+        Return the number of tokens in a sample. This value is used to
+        enforce max-tokens during batching.
+        """
+        return self.sizes[index]
+
+    def size(self, index: int):
+        """
+        Return an example's size as a float or tuple. This value is used when
+        filtering a dataset with max-positions.
+        """
+        return self.sizes[index]
+
+    def ordered_indices(self):
+        """
+        Return an ordered list of indices. Batches will be constructed based
+        on this order.
+        """
+        if self.shuffle:
+            return np.random.permutation(len(self))
+        else:
+            order = [np.arange(len(self))]
+            order.append(self.sizes)
+            return np.lexsort(order)
+
+    @property
+    def supports_prefetch(self):
+        return getattr(self.dataset, "supports_prefetch", False)
+
+    def prefetch(self, indices):
+        self.dataset.prefetch(indices)