Commit 27dab946 authored by huchen

Merge branch 'GNMT-v2' into 'main'

Updated GNMT v2

See merge request dcutoolkit/deeplearing/dlexamples_new!11
parents 20291e9d 07c30a15
# Copyright (c) 2017 Elad Hoffer
# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
PAD_TOKEN = '<pad>'
UNK_TOKEN = '<unk>'
BOS_TOKEN = '<s>'
@@ -6,27 +27,5 @@ EOS_TOKEN = '<\s>'
# special PAD, UNKNOWN, BEGIN-OF-STRING, END-OF-STRING tokens
PAD, UNK, BOS, EOS = [0, 1, 2, 3]
# path to the BPE vocabulary file, relative to the data directory, it should
# point to file generated by subword-nmt/get_vocab.py
VOCAB_FNAME = 'vocab.bpe.32000'
# paths to source and target training files, relative to the data directory, it
# should point to BPE-encoded files, generated by subword-nmt/apply_bpe.py
SRC_TRAIN_FNAME = 'train.tok.clean.bpe.32000.en'
TGT_TRAIN_FNAME = 'train.tok.clean.bpe.32000.de'
# paths to source and target validation files, relative to the data directory,
# it should point to BPE-encoded files, generated by subword-nmt/apply_bpe.py
SRC_VAL_FNAME = 'newstest_dev.tok.clean.bpe.32000.en'
TGT_VAL_FNAME = 'newstest_dev.tok.clean.bpe.32000.de'
# path to the test source file, relative to the data directory, it should point
# to BPE-encoded file, generated by subword-nmt/apply_bpe.py
SRC_TEST_FNAME = 'newstest2014.tok.bpe.32000.en'
# path to the test target file, relative to the data directory, it should point
# to plaintext file, tokenization is performed by the sacrebleu package
TGT_TEST_TARGET_FNAME = 'newstest2014.de'
# path to the moses detokenizer, relative to the data directory
DETOKENIZER = 'mosesdecoder/scripts/tokenizer/detokenizer.perl'
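The constants above are only file names; downstream code is expected to join them with a data directory. A minimal sketch (the 'data/wmt16_de_en' location is a hypothetical example, not defined in this diff):
import os

data_dir = 'data/wmt16_de_en'  # hypothetical dataset directory
vocab_path = os.path.join(data_dir, VOCAB_FNAME)
train_src = os.path.join(data_dir, SRC_TRAIN_FNAME)
train_tgt = os.path.join(data_dir, TGT_TRAIN_FNAME)
test_src = os.path.join(data_dir, SRC_TEST_FNAME)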
import time
import os
# Copyright (c) 2017 Elad Hoffer
# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import logging
from operator import itemgetter
import numpy as np
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
@@ -31,17 +49,17 @@ def build_collate_fn(batch_first=False, parallel=True, sort=False):
:param seq: list of sequences
"""
lengths = [len(s) for s in seq]
lengths = torch.tensor([len(s) for s in seq], dtype=torch.int64)
batch_length = max(lengths)
shape = (batch_length, len(seq))
shape = (len(seq), batch_length)
seq_tensor = torch.full(shape, config.PAD, dtype=torch.int64)
for i, s in enumerate(seq):
end_seq = lengths[i]
seq_tensor[:end_seq, i].copy_(s[:end_seq])
seq_tensor[i, :end_seq].copy_(s[:end_seq])
if batch_first:
if not batch_first:
seq_tensor = seq_tensor.t()
return (seq_tensor, lengths)
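A small self-contained sketch of the padding behaviour after this change, with illustrative sequences (config.PAD is 0): sequences are now written row-wise into a (batch, max_len) tensor and transposed only when batch_first is False.
import torch

PAD = 0  # config.PAD
seqs = [torch.tensor([2, 7, 8, 3]), torch.tensor([2, 5, 3])]
lengths = torch.tensor([len(s) for s in seqs], dtype=torch.int64)
seq_tensor = torch.full((len(seqs), int(lengths.max())), PAD, dtype=torch.int64)
for i, s in enumerate(seqs):
    seq_tensor[i, :len(s)].copy_(s)
# seq_tensor is now [[2, 7, 8, 3], [2, 5, 3, 0]]; seq_tensor.t() restores
# the old time-major (max_len, batch) layout used when batch_first=False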
@@ -62,19 +80,19 @@ def build_collate_fn(batch_first=False, parallel=True, sort=False):
return tuple([collate_seq(s) for s in [src_seqs, tgt_seqs]])
def single_collate(seqs):
def single_collate(src_seqs):
"""
Builds batches from text dataset, optionally sorts batch by src
sequence length.
:param src_seqs: source sequences
"""
src_seqs, indices = zip(*seqs)
if sort:
idx, src_seqs = zip(*sorted(enumerate(src_seqs),
key=lambda item: len(item[1]),
reverse=True))
indices = [indices[i] for i in idx]
indices, src_seqs = zip(*sorted(enumerate(src_seqs),
key=lambda item: len(item[1]),
reverse=True))
else:
indices = range(len(src_seqs))
return collate_seq(src_seqs), tuple(indices)
@@ -84,6 +102,102 @@ def build_collate_fn(batch_first=False, parallel=True, sort=False):
return single_collate
class SyntheticDataset(Dataset):
def __init__(self, vocab_size, seq_len, nsamples):
self.vocab_size = vocab_size
self.nsamples = nsamples
self.seq_len = seq_len
def __getitem__(self, idx):
rand = torch.randint(0, self.vocab_size, size=(self.seq_len,))
return rand
def unsort(self, array):
return array
def get_loader(self, batch_size=1, num_workers=0, batch_first=False,
pad=False, repeat=1):
collate_fn = build_collate_fn(batch_first, parallel=False,
sort=True)
sampler = StaticDistributedSampler(self, batch_size, pad, repeat)
return DataLoader(self,
batch_size=batch_size,
collate_fn=collate_fn,
sampler=sampler,
num_workers=num_workers,
pin_memory=True,
drop_last=False)
def __len__(self):
return self.nsamples
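A hedged stand-alone sketch of the synthetic loader: it assumes torch.distributed is left uninitialized so StaticDistributedSampler falls back to a single-process world, and picks nsamples as a multiple of the batch size so no padding is needed.
dataset = SyntheticDataset(vocab_size=32000, seq_len=50, nsamples=256)
loader = dataset.get_loader(batch_size=32, batch_first=True)
(src, src_lengths), indices = next(iter(loader))
# src: (32, 50) int64 tensor of random token ids, src_lengths: all 50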
class RawTextDataset(Dataset):
def __init__(self, raw_data=None, raw_datafile=None, tokenizer=None,
sort=False, max_size=None):
self.tokenizer = tokenizer
self.sorted = False
if raw_datafile:
with open(raw_datafile, 'r') as f:
self.raw_data = f.readlines()
else:
self.raw_data = raw_data
if max_size:
self.raw_data = self.raw_data[:max_size]
self.lengths = [len(s.split()) for s in self.raw_data]
if sort:
self.sort_by_length()
def __getitem__(self, idx):
raw = self.raw_data[idx]
tokenized = self.tokenizer.tokenize(raw)
return tokenized
def unsort(self, array):
"""
"Unsorts" given array (restores original order of elements before
dataset was sorted by sequence length).
:param array: array to be "unsorted"
"""
if self.sorted:
inverse = sorted(enumerate(self.indices), key=itemgetter(1))
array = [array[i[0]] for i in inverse]
return array
def sort_by_length(self):
output = sorted(
enumerate(self.raw_data),
key=lambda x: len(x[1].split()),
)
self.indices, self.raw_data = zip(*output)
self.lengths = [self.lengths[idx] for idx in self.indices]
self.sorted = True
def __len__(self):
return len(self.raw_data)
def get_loader(self, batch_size=1, num_workers=0, batch_first=False,
pad=False, repeat=1):
collate_fn = build_collate_fn(batch_first, parallel=False,
sort=True)
sampler = StaticDistributedSampler(self, batch_size, pad, repeat)
return DataLoader(self,
batch_size=batch_size,
collate_fn=collate_fn,
sampler=sampler,
num_workers=num_workers,
pin_memory=True,
drop_last=False)
class TextDataset(Dataset):
def __init__(self, src_fname, tokenizer, min_len=None, max_len=None,
sort=False, max_size=None):
@@ -181,7 +295,7 @@ class TextDataset(Dataset):
return len(self.src)
def __getitem__(self, idx):
return self.src[idx], self.indices[idx]
return self.src[idx]
def get_loader(self, batch_size=1, seeds=None, shuffle=False,
num_workers=0, batch_first=False, pad=False,
@@ -212,13 +326,6 @@ class TextDataset(Dataset):
pin_memory=True,
drop_last=False)
#return DataLoader(self,
# batch_size=batch_size,
# collate_fn=collate_fn,
# sampler=sampler,
# num_workers=num_workers,
# pin_memory=False,
# drop_last=False)
class ParallelDataset(TextDataset):
def __init__(self, src_fname, tgt_fname, tokenizer,
@@ -347,10 +454,12 @@ class LazyParallelDataset(TextDataset):
if None loads the entire dataset
"""
logging.info(f'Processing data from {fname}')
data = []
with open(fname) as dfile:
data = dfile.readlines()
if max_size:
data = data[:max_size]
for idx, line in enumerate(dfile):
if max_size and idx == max_size:
break
data.append(line)
return data
def filter_raw_data(self, min_len, max_len):
@@ -391,100 +500,3 @@ class LazyParallelDataset(TextDataset):
def __len__(self):
return len(self.raw_src)
class PreprocessedDataset(TextDataset):
def __init__(self, min_len, max_len, vocab_size):
self.min_len = min_len
self.max_len = max_len
self.vocab_size = vocab_size
self.parallel = True
def get_data_dtype(self, vocab_size):
if vocab_size <= np.iinfo(np.int16).max:
dtype = np.int16
elif vocab_size <= np.iinfo(np.int32).max:
dtype = np.int32
elif vocab_size <= np.iinfo(np.int64).max:
dtype = np.int64
else:
raise ValueError('Vocabulary size is too large')
return dtype
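As a quick check of the thresholds above: the default 32k BPE vocabulary plus the four special tokens still fits into int16, so the packed file stores two bytes per token.
import numpy as np

assert 32000 + 4 <= np.iinfo(np.int16).max  # 32767 -> dtype is np.int16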
def write_data(self, fname, src, tgt):
src, src_lengths = src
tgt, tgt_lengths = tgt
assert len(src) == len(tgt) == len(src_lengths) == len(tgt_lengths)
length = len(src)
dtype = self.get_data_dtype(self.vocab_size)
data = torch.cat((src, tgt), dim=1).numpy()
offset = 0
with open(fname, 'wb') as f:
offset += f.write((np.array(length, dtype=np.int64)))
offset += f.write((np.array(self.vocab_size, dtype=np.int64)))
offset += f.write((np.array(self.min_len, dtype=np.int64)))
offset += f.write((np.array(self.max_len, dtype=np.int64)))
offset += f.write((np.array(src_lengths, dtype=np.int64)))
offset += f.write((np.array(tgt_lengths, dtype=np.int64)))
offset += np.iinfo(np.int64).dtype.itemsize
f.write((np.array(offset, dtype=np.int64)))
f.write((np.array(data, dtype=dtype)))
def read_data(self, fname, vocab_size):
self.fname = fname
with open(fname, 'rb') as f:
logging.info(f'Reading preprocessed data file from {fname}')
length = int(np.fromfile(f, np.int64, 1))
file_vocab_size = int(np.fromfile(f, np.int64, 1))
file_min_len = int(np.fromfile(f, np.int64, 1))
file_max_len = int(np.fromfile(f, np.int64, 1))
src_lengths = np.fromfile(f, np.int64, length)
tgt_lengths = np.fromfile(f, np.int64, length)
self.offset = int(np.fromfile(f, np.int64, 1))
assert file_max_len == self.max_len
assert file_min_len == self.min_len
assert file_vocab_size == self.vocab_size
logging.info(
f'Preprocessed data: length: {length} '
f'min length: {self.min_len} '
f'max length: {self.max_len} '
)
self.length = length
self.src_lengths = torch.tensor(src_lengths)
self.tgt_lengths = torch.tensor(tgt_lengths)
self.lengths = self.src_lengths + self.tgt_lengths
self.dtype = self.get_data_dtype(vocab_size)
itemsize = np.iinfo(self.dtype).dtype.itemsize
self.item_stride = itemsize * self.max_len * 2
def prepare(self):
logging.info(f'Opening preprocessed data {self.fname} for reading')
self.file = open(self.fname, 'rb')
def finalize(self):
logging.info(f'Closing preprocessed data file')
self.file.close()
def __getitem__(self, idx):
offset = self.offset + self.item_stride * idx
self.file.seek(offset, os.SEEK_SET)
data = np.fromfile(self.file, self.dtype, self.max_len * 2)
data = data.astype(np.int64)
src_len = self.src_lengths[idx]
tgt_len = self.tgt_lengths[idx]
src = torch.tensor(data[0: src_len])
tgt = torch.tensor(data[self.max_len: self.max_len + tgt_len])
return src, tgt
def __len__(self):
return self.length
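A rough sketch of the on-disk layout implied by write_data()/read_data() above, and of the seek arithmetic used by __getitem__ (values are illustrative only):
import numpy as np

length, max_len = 1000, 75                   # illustrative header values
int64_size = np.dtype(np.int64).itemsize     # 8 bytes
# header: [length, vocab_size, min_len, max_len] + src_lengths + tgt_lengths + offset
header_bytes = (4 + 2 * length + 1) * int64_size
item_stride = np.dtype(np.int16).itemsize * max_len * 2   # one packed (src, tgt) row
row_offset = header_bytes + 42 * item_stride               # where __getitem__(42) seeks to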
# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import logging
import torch
@@ -226,7 +246,7 @@ class BucketingSampler(DistributedSampler):
class StaticDistributedSampler(Sampler):
def __init__(self, dataset, batch_size, pad, world_size=None, rank=None):
def __init__(self, dataset, batch_size, pad, repeat=1, world_size=None, rank=None):
"""
Constructor for the StaticDistributedSampler.
@@ -247,11 +267,12 @@ class StaticDistributedSampler(Sampler):
global_batch_size = batch_size * world_size
data_len = len(dataset)
num_samples = (data_len + global_batch_size - 1) \
repeated_data_len = int(len(dataset) * repeat)
num_samples = (repeated_data_len + global_batch_size - 1) \
// global_batch_size * global_batch_size
self.num_samples = num_samples
indices = list(range(data_len))
indices = list(range(repeated_data_len))
if pad:
# pad dataset to a multiple of global_batch_size samples, uses
# sample with idx 0 as pad
@@ -267,6 +288,7 @@ class StaticDistributedSampler(Sampler):
indices = indices.view(-1)
# remove temporary pad
indices = indices[indices != -1]
indices = indices % data_len
indices = indices.tolist()
self.indices = indices
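A minimal arithmetic sketch of the new repeat/modulo logic, with illustrative sizes (the per-rank slicing itself lives in the collapsed hunk above):
data_len, global_batch_size, repeat = 10, 8, 2
repeated_data_len = int(data_len * repeat)                     # 20 virtual samples
num_samples = (repeated_data_len + global_batch_size - 1) \
    // global_batch_size * global_batch_size                   # rounded up to 24
virtual = list(range(repeated_data_len))
real = [i % data_len for i in virtual]                         # maps back into [0, 10)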
# Copyright (c) 2017 Elad Hoffer
# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import logging
from collections import defaultdict
from functools import partial
import torch
import subword_nmt.apply_bpe
import sacremoses
import seq2seq.data.config as config
@@ -9,37 +33,53 @@ class Tokenizer:
"""
Tokenizer class.
"""
def __init__(self, vocab_fname=None, pad=1, separator='@@'):
def __init__(self, vocab_fname=None, bpe_fname=None, lang=None, pad=1,
separator='@@'):
"""
Constructor for the Tokenizer class.
:param vocab_fname: path to the file with vocabulary
:param bpe_fname: path to the file with bpe codes
:param pad: pads vocabulary to a multiple of 'pad' tokens
:param separator: tokenization separator
"""
self.separator = separator
self.lang = lang
if bpe_fname:
with open(bpe_fname, 'r') as bpe_codes:
self.bpe = subword_nmt.apply_bpe.BPE(bpe_codes)
if vocab_fname:
self.separator = separator
self.build_vocabulary(vocab_fname, pad)
if lang:
self.init_moses(lang)
logging.info(f'Building vocabulary from {vocab_fname}')
vocab = [config.PAD_TOKEN, config.UNK_TOKEN,
config.BOS_TOKEN, config.EOS_TOKEN]
def init_moses(self, lang):
self.moses_tokenizer = sacremoses.MosesTokenizer(lang['src'])
self.moses_detokenizer = sacremoses.MosesDetokenizer(lang['tgt'])
with open(vocab_fname) as vfile:
for line in vfile:
vocab.append(line.strip())
def build_vocabulary(self, vocab_fname, pad):
logging.info(f'Building vocabulary from {vocab_fname}')
vocab = [config.PAD_TOKEN, config.UNK_TOKEN,
config.BOS_TOKEN, config.EOS_TOKEN]
with open(vocab_fname) as vfile:
for line in vfile:
vocab.append(line.strip())
self.pad_vocabulary(vocab, pad)
self.pad_vocabulary(vocab, pad)
self.vocab_size = len(vocab)
logging.info(f'Size of vocabulary: {self.vocab_size}')
self.vocab_size = len(vocab)
logging.info(f'Size of vocabulary: {self.vocab_size}')
self.tok2idx = defaultdict(partial(int, config.UNK))
for idx, token in enumerate(vocab):
self.tok2idx[token] = idx
self.tok2idx = defaultdict(partial(int, config.UNK))
for idx, token in enumerate(vocab):
self.tok2idx[token] = idx
self.idx2tok = {}
for key, value in self.tok2idx.items():
self.idx2tok[value] = key
self.idx2tok = {}
for key, value in self.tok2idx.items():
self.idx2tok[value] = key
def pad_vocabulary(self, vocab, pad):
"""
@@ -58,8 +98,10 @@ class Tokenizer:
def get_state(self):
logging.info(f'Saving state of the tokenizer')
state = {
'lang': self.lang,
'separator': self.separator,
'vocab_size': self.vocab_size,
'bpe': self.bpe,
'tok2idx': self.tok2idx,
'idx2tok': self.idx2tok,
}
@@ -67,11 +109,15 @@ class Tokenizer:
def set_state(self, state):
logging.info(f'Restoring state of the tokenizer')
self.lang = state['lang']
self.separator = state['separator']
self.vocab_size = state['vocab_size']
self.bpe = state['bpe']
self.tok2idx = state['tok2idx']
self.idx2tok = state['idx2tok']
self.init_moses(self.lang)
def segment(self, line):
"""
Tokenizes single sentence and adds special BOS and EOS tokens.
@@ -85,7 +131,14 @@ class Tokenizer:
entry = [config.BOS] + entry + [config.EOS]
return entry
def detokenize(self, inputs, delim=' '):
def tokenize(self, line):
tokenized = self.moses_tokenizer.tokenize(line, return_str=True)
bpe = self.bpe.process_line(tokenized)
segmented = self.segment(bpe)
tensor = torch.tensor(segmented)
return tensor
def detokenize_bpe(self, inp, delim=' '):
"""
Detokenizes single sentence and removes token separator characters.
@@ -94,7 +147,7 @@ class Tokenizer:
returns: string representing detokenized sentence
"""
detok = delim.join([self.idx2tok[idx] for idx in inputs])
detok = delim.join([self.idx2tok[idx] for idx in inp])
detok = detok.replace(self.separator + ' ', '')
detok = detok.replace(self.separator, '')
@@ -103,3 +156,12 @@ class Tokenizer:
detok = detok.replace(config.PAD_TOKEN, '')
detok = detok.strip()
return detok
def detokenize_moses(self, inp):
output = self.moses_detokenizer.detokenize(inp.split())
return output
def detokenize(self, inp):
detok_bpe = self.detokenize_bpe(inp)
output = self.detokenize_moses(detok_bpe)
return output
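A tiny illustration of the BPE separator stripping performed by detokenize_bpe(): subword pieces are marked with '@@', and joining them removes the marker together with the following space.
detok = 'The sta@@ te of the ar@@ t .'
detok = detok.replace('@@ ', '').replace('@@', '')
# 'The state of the art .'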
import collections
import math
import os
import pathlib
import re
import pynvml
pynvml.nvmlInit()
def systemGetDriverVersion():
return pynvml.nvmlSystemGetDriverVersion()
def deviceGetCount():
return pynvml.nvmlDeviceGetCount()
class device:
# assume nvml returns list of 64 bit ints
_nvml_affinity_elements = math.ceil(os.cpu_count() / 64)
def __init__(self, device_idx):
super().__init__()
self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx)
def getName(self):
return pynvml.nvmlDeviceGetName(self.handle)
def getCpuAffinity(self):
affinity_string = ''
for j in pynvml.nvmlDeviceGetCpuAffinity(
self.handle, device._nvml_affinity_elements
):
# assume nvml returns list of 64 bit ints
affinity_string = '{:064b}'.format(j) + affinity_string
affinity_list = [int(x) for x in affinity_string]
affinity_list.reverse() # so core 0 is in 0th element of list
ret = [i for i, e in enumerate(affinity_list) if e != 0]
return ret
def set_socket_affinity(gpu_id):
dev = device(gpu_id)
affinity = dev.getCpuAffinity()
os.sched_setaffinity(0, affinity)
def set_single_affinity(gpu_id):
dev = device(gpu_id)
affinity = dev.getCpuAffinity()
os.sched_setaffinity(0, affinity[:1])
def set_single_unique_affinity(gpu_id, nproc_per_node):
devices = [device(i) for i in range(nproc_per_node)]
socket_affinities = [dev.getCpuAffinity() for dev in devices]
siblings_list = get_thread_siblings_list()
siblings_dict = dict(siblings_list)
# remove siblings
for idx, socket_affinity in enumerate(socket_affinities):
socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values()))
affinities = []
assigned = []
for socket_affinity in socket_affinities:
for core in socket_affinity:
if core not in assigned:
affinities.append([core])
assigned.append(core)
break
os.sched_setaffinity(0, affinities[gpu_id])
def set_socket_unique_affinity(gpu_id, nproc_per_node, mode):
device_ids = [device(i) for i in range(nproc_per_node)]
socket_affinities = [dev.getCpuAffinity() for dev in device_ids]
siblings_list = get_thread_siblings_list()
siblings_dict = dict(siblings_list)
# remove siblings
for idx, socket_affinity in enumerate(socket_affinities):
socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values()))
socket_affinities_to_device_ids = collections.defaultdict(list)
for idx, socket_affinity in enumerate(socket_affinities):
socket_affinities_to_device_ids[tuple(socket_affinity)].append(idx)
for socket_affinity, device_ids in socket_affinities_to_device_ids.items():
devices_per_group = len(device_ids)
cores_per_device = len(socket_affinity) // devices_per_group
for group_id, device_id in enumerate(device_ids):
if device_id == gpu_id:
if mode == 'interleaved':
affinity = list(socket_affinity[group_id::devices_per_group])
elif mode == 'continuous':
affinity = list(socket_affinity[group_id*cores_per_device:(group_id+1)*cores_per_device])
else:
raise RuntimeError('Unknown set_socket_unique_affinity mode')
# reintroduce siblings
affinity += [siblings_dict[aff] for aff in affinity if aff in siblings_dict]
os.sched_setaffinity(0, affinity)
def get_thread_siblings_list():
path = '/sys/devices/system/cpu/cpu*/topology/thread_siblings_list'
thread_siblings_list = []
pattern = re.compile(r'(\d+)\D(\d+)')
for fname in pathlib.Path(path[0]).glob(path[1:]):
with open(fname) as f:
content = f.read().strip()
res = pattern.findall(content)
if res:
pair = tuple(map(int, res[0]))
thread_siblings_list.append(pair)
return thread_siblings_list
def set_affinity(gpu_id, nproc_per_node, mode='socket'):
if mode == 'socket':
set_socket_affinity(gpu_id)
elif mode == 'single':
set_single_affinity(gpu_id)
elif mode == 'single_unique':
set_single_unique_affinity(gpu_id, nproc_per_node)
elif mode == 'socket_unique_interleaved':
set_socket_unique_affinity(gpu_id, nproc_per_node, 'interleaved')
elif mode == 'socket_unique_continuous':
set_socket_unique_affinity(gpu_id, nproc_per_node, 'continuous')
else:
raise RuntimeError('Unknown affinity mode')
affinity = os.sched_getaffinity(0)
return affinity
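A hedged usage sketch: each distributed worker is assumed to pin itself before touching the GPU. LOCAL_RANK and WORLD_SIZE are the environment variables commonly exported by PyTorch launchers (an assumption, not part of this diff), and the module itself requires pynvml plus the Linux /sys topology files.
import os

local_rank = int(os.environ.get('LOCAL_RANK', 0))
nproc_per_node = int(os.environ.get('WORLD_SIZE', 1))
affinity = set_affinity(local_rank, nproc_per_node, mode='socket_unique_continuous')
print(f'rank {local_rank} pinned to cores: {sorted(affinity)}')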
# Copyright (c) 2017 Elad Hoffer
# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import torch
from seq2seq.data.config import BOS
@@ -8,7 +29,7 @@ class SequenceGenerator:
"""
Generator for the autoregressive inference with beam search decoding.
"""
def __init__(self, model, beam_size=5, max_seq_len=100, cuda=False,
def __init__(self, model, beam_size=5, max_seq_len=100,
len_norm_factor=0.6, len_norm_const=5,
cov_penalty_factor=0.1):
"""
@@ -21,14 +42,12 @@ class SequenceGenerator:
:param model: model which implements generate method
:param beam_size: decoder beam size
:param max_seq_len: maximum decoder sequence length
:param cuda: whether to use cuda
:param len_norm_factor: length normalization factor
:param len_norm_const: length normalization constant
:param cov_penalty_factor: coverage penalty factor
"""
self.model = model
self.cuda = cuda
self.beam_size = beam_size
self.max_seq_len = max_seq_len
self.len_norm_factor = len_norm_factor
@@ -51,18 +70,17 @@ class SequenceGenerator:
lengths: (batch_size) - lengths of generated translations
counter: number of iterations of the decoding loop
"""
device = initial_input.device
max_seq_len = self.max_seq_len
translation = torch.zeros(batch_size, max_seq_len, dtype=torch.int64)
lengths = torch.ones(batch_size, dtype=torch.int64)
active = torch.arange(0, batch_size, dtype=torch.int64)
base_mask = torch.arange(0, batch_size, dtype=torch.int64)
if self.cuda:
translation = translation.cuda()
lengths = lengths.cuda()
active = active.cuda()
base_mask = base_mask.cuda()
translation = torch.zeros(batch_size, max_seq_len, dtype=torch.int64,
device=device)
lengths = torch.ones(batch_size, dtype=torch.int64,
device=device)
active = torch.arange(0, batch_size, dtype=torch.int64,
device=device)
base_mask = torch.arange(0, batch_size, dtype=torch.int64,
device=device)
translation[:, 0] = BOS
words, context = initial_input, initial_context
@@ -118,6 +136,7 @@ class SequenceGenerator:
lengths: (batch_size) - lengths of generated translations
counter: number of iterations of the decoding loop
"""
device = initial_input.device
beam_size = self.beam_size
norm_const = self.len_norm_const
norm_factor = self.len_norm_factor
@@ -125,25 +144,19 @@ class SequenceGenerator:
cov_penalty_factor = self.cov_penalty_factor
translation = torch.zeros(batch_size * beam_size, max_seq_len,
dtype=torch.int64)
lengths = torch.ones(batch_size * beam_size, dtype=torch.int64)
scores = torch.zeros(batch_size * beam_size, dtype=torch.float32)
active = torch.arange(0, batch_size * beam_size, dtype=torch.int64)
base_mask = torch.arange(0, batch_size * beam_size, dtype=torch.int64)
dtype=torch.int64, device=device)
lengths = torch.ones(batch_size * beam_size,
dtype=torch.int64, device=device)
scores = torch.zeros(batch_size * beam_size,
dtype=torch.float32, device=device)
active = torch.arange(0, batch_size * beam_size,
dtype=torch.int64, device=device)
base_mask = torch.arange(0, batch_size * beam_size,
dtype=torch.int64, device=device)
global_offset = torch.arange(0, batch_size * beam_size, beam_size,
dtype=torch.int64)
eos_beam_fill = torch.tensor([0] + (beam_size - 1) * [float('-inf')])
if self.cuda:
translation = translation.cuda()
lengths = lengths.cuda()
active = active.cuda()
base_mask = base_mask.cuda()
scores = scores.cuda()
global_offset = global_offset.cuda()
eos_beam_fill = eos_beam_fill.cuda()
device=device, dtype=torch.int64)
eos_beam_fill = torch.tensor([0] + (beam_size - 1) * [float('-inf')],
dtype=torch.float32, device=device)
translation[:, 0] = BOS
@@ -182,9 +195,8 @@ class SequenceGenerator:
context[1] = context[1].contiguous().view(batch_size * beam_size)
# context[1]: (batch * beam)
accu_attn_scores = torch.zeros(batch_size * beam_size, seq)
if self.cuda:
accu_attn_scores = accu_attn_scores.cuda()
accu_attn_scores = torch.zeros(batch_size * beam_size, seq,
dtype=torch.float32, device=device)
counter = 0
for idx in range(1, self.max_seq_len):
from itertools import zip_longest
import sacrebleu
import torch
def read_reference(fname, indices):
with open(fname) as f:
refs = f.readlines()
refs = [refs[i] for i in indices]
return refs
def all_reduce(val):
if torch.distributed.is_available() and torch.distributed.is_initialized():
val = torch.tensor(val)
if hasattr(torch.distributed, "get_backend"):
_backend = torch.distributed.get_backend()
if hasattr(torch.distributed, "DistBackend"):
backend_enum_holder = torch.distributed.DistBackend
else:
backend_enum_holder = torch.distributed.Backend
else:
_backend = torch.distributed._backend
backend_enum_holder = torch.distributed.dist_backend
if _backend == backend_enum_holder.NCCL:
device = torch.device('cuda')
else:
device = torch.device('cpu')
val = val.to(device)
torch.distributed.all_reduce(val)
val = val.tolist()
return val
def corpus_bleu(sys_stream, ref_streams, smooth='exp', smooth_floor=0.0,
force=False, lowercase=False,
tokenize=sacrebleu.DEFAULT_TOKENIZER,
use_effective_order=False) -> sacrebleu.BLEU:
"""Produces BLEU scores along with its sufficient statistics from a source
against one or more references.
:param sys_stream: The system stream (a sequence of segments)
:param ref_streams: A list of one or more reference streams (each a
sequence of segments)
:param smooth: The smoothing method to use
:param smooth_floor: For 'floor' smoothing, the floor to use
:param force: Ignore data that looks already tokenized
:param lowercase: Lowercase the data
:param tokenize: The tokenizer to use
:return: a BLEU object containing everything you'd want
"""
# Add some robustness to the input arguments
if isinstance(sys_stream, str):
sys_stream = [sys_stream]
if isinstance(ref_streams, str):
ref_streams = [[ref_streams]]
sys_len = 0
ref_len = 0
correct = [0 for n in range(sacrebleu.NGRAM_ORDER)]
total = [0 for n in range(sacrebleu.NGRAM_ORDER)]
fhs = [sys_stream] + ref_streams
for lines in zip_longest(*fhs):
if None in lines:
raise EOFError("Source and reference streams have different "
"lengths!")
if lowercase:
lines = [x.lower() for x in lines]
output, *refs = [sacrebleu.TOKENIZERS[tokenize](x.rstrip()) for x in
lines]
ref_ngrams, closest_diff, closest_len = sacrebleu.ref_stats(output,
refs)
sys_len += len(output.split())
ref_len += closest_len
sys_ngrams = sacrebleu.extract_ngrams(output)
for ngram in sys_ngrams.keys():
n = len(ngram.split())
correct[n-1] += min(sys_ngrams[ngram], ref_ngrams.get(ngram, 0))
total[n-1] += sys_ngrams[ngram]
correct = all_reduce(correct)
total = all_reduce(total)
sys_len = all_reduce(sys_len)
ref_len = all_reduce(ref_len)
return sacrebleu.compute_bleu(correct, total, sys_len, ref_len, smooth,
smooth_floor, use_effective_order)
def compute_bleu(output, indices, ref_fname):
refs = read_reference(ref_fname, indices)
bleu = corpus_bleu(output, [refs], lowercase=True,
tokenize='intl')
return bleu
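A single-process sanity sketch: when torch.distributed is not initialized, all_reduce() above is a no-op and corpus_bleu() reduces to plain sacrebleu scoring. This relies on an older sacrebleu release that still exposes NGRAM_ORDER, TOKENIZERS, ref_stats and compute_bleu at module level, as the code above assumes.
hyps = ['the cat sat on the mat']
refs = ['the cat sat on the mat']
print(corpus_bleu(hyps, [refs]).score)   # 100.0 for an exact match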
# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import collections
import itertools
import numpy as np
from pytablewriter import MarkdownTableWriter
def interleave(*args):
return list(itertools.chain(*zip(*args)))
class AccuracyTable:
def __init__(self, unit):
self.data = collections.defaultdict(dict)
self.unit = unit
def add(self, key, data):
self.data[key].update(data)
def write(self, title, write_math):
writer = MarkdownTableWriter()
writer.table_name = f'{title}'
main_header = ['**Batch Size**', '**Beam Size**']
data_header = []
if 'fp32' in write_math:
data_header += [f'**Accuracy - FP32 ({self.unit})**']
if 'tf32' in write_math:
data_header += [f'**Accuracy - TF32 ({self.unit})**']
if 'fp16' in write_math:
data_header += [f'**Accuracy - FP16 ({self.unit})**']
writer.headers = main_header + data_header
writer.value_matrix = []
for k, v in self.data.items():
batch_size, beam_size = k
row = [batch_size, beam_size]
if 'fp32' in write_math:
row.append(v['fp32'])
if 'tf32' in write_math:
row.append(v['tf32'])
if 'fp16' in write_math:
row.append(v['fp16'])
writer.value_matrix.append(row)
writer.write_table()
class PerformanceTable:
def __init__(self, percentiles, unit, reverse_percentiles=False):
self.percentiles = percentiles
self.data = collections.defaultdict(dict)
self.unit = unit
self.reverse_percentiles = reverse_percentiles
def add(self, key, value):
math, value = next(iter(value.items()))
value = np.array(value)
if self.reverse_percentiles:
percentiles = [100 - p for p in self.percentiles]
else:
percentiles = self.percentiles
stats = []
for p in percentiles:
val = np.percentile(value, p)
stats.append(val * self.unit_convert[self.unit])
avg = value.mean() * self.unit_convert[self.unit]
self.data[key].update({math: (avg, stats)})
def write(self, title, math, relative=None, reverse_speedup=False):
writer = MarkdownTableWriter()
writer.table_name = f'{title} - {math.upper()}'
main_header = ['**Batch Size**', '**Beam Size**']
data_header = [f'**Avg ({self.unit})**']
data_header += [f'**{p}% ({self.unit})**' for p in self.percentiles]
if relative:
speedup_header = ['**Speedup**'] * len(data_header)
data_header = interleave(data_header, speedup_header)
writer.headers = main_header + data_header
writer.value_matrix = []
for k, v in self.data.items():
batch_size, beam_size = k
avg, res_percentiles = v[math]
main = [batch_size, beam_size]
data = [avg, *res_percentiles]
if relative:
rel = self.data[k][relative]
rel_avg, rel_res_percentiles = rel
rel = [rel_avg, *rel_res_percentiles]
speedup = [d / r for (r, d) in zip(rel, data)]
if reverse_speedup:
speedup = [1 / s for s in speedup]
data = interleave(data, speedup)
writer.value_matrix.append(main + data)
writer.write_table()
class LatencyTable(PerformanceTable):
def __init__(self, percentiles, unit='ms'):
super().__init__(percentiles, unit)
self.unit_convert = {'s': 1, 'ms': 1e3, 'us': 1e6}
class ThroughputTable(PerformanceTable):
def __init__(self, percentiles, unit='tok/s', reverse_percentiles=True):
super().__init__(percentiles, unit, reverse_percentiles)
self.unit_convert = {'tok/s': 1}
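An assumed usage sketch, inferred from the class definitions above: keys are (batch_size, beam_size) pairs, values map a math mode to raw per-iteration measurements in seconds, and write() renders a Markdown table, optionally with speedups relative to another mode.
latency = LatencyTable(percentiles=(90, 95, 99), unit='ms')
latency.add((32, 5), {'fp16': [0.081, 0.084, 0.090, 0.102]})   # seconds per batch
latency.add((32, 5), {'fp32': [0.120, 0.125, 0.131, 0.140]})
latency.write('Inference latency', 'fp16', relative='fp32', reverse_speedup=True)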
import contextlib
# Copyright (c) 2017 Elad Hoffer
# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import logging
import os
import subprocess
import time
@@ -8,24 +27,35 @@ import torch
import torch.distributed as dist
import seq2seq.data.config as config
import seq2seq.utils as utils
from seq2seq.inference.beam_search import SequenceGenerator
from seq2seq.utils import AverageMeter
from seq2seq.utils import barrier
from seq2seq.utils import get_rank
from seq2seq.utils import get_world_size
import seq2seq.inference.bleu
def gather_predictions(preds):
world_size = get_world_size()
world_size = utils.get_world_size()
if world_size > 1:
all_preds = preds.new(world_size * preds.size(0), preds.size(1))
all_preds_list = all_preds.chunk(world_size, dim=0)
dist.all_gather(all_preds_list, preds)
preds = all_preds
all_preds = [preds.new(preds.size(0), preds.size(1)) for i in range(world_size)]
dist.all_gather(all_preds, preds)
preds = torch.cat(all_preds)
return preds
def run_sacrebleu(test_path, reference_path):
"""
Executes sacrebleu and returns BLEU score.
:param test_path: path to the test file
:param reference_path: path to the reference file
"""
sacrebleu_params = '--score-only -lc --tokenize intl'
logging.info(f'Running sacrebleu (parameters: {sacrebleu_params})')
sacrebleu = subprocess.run([f'sacrebleu --input {test_path} \
{reference_path} {sacrebleu_params}'],
stdout=subprocess.PIPE, shell=True)
test_bleu = round(float(sacrebleu.stdout.strip()), 2)
return test_bleu
class Translator:
"""
Translator runs validation on test dataset, executes inference, optionally
@@ -34,17 +64,15 @@ class Translator:
def __init__(self,
model,
tokenizer,
loader,
loader=None,
beam_size=5,
len_norm_factor=0.6,
len_norm_const=5.0,
cov_penalty_factor=0.1,
max_seq_len=50,
cuda=False,
print_freq=1,
dataset_dir=None,
save_path=None,
target_bleu=None):
reference=None,
):
self.model = model
self.tokenizer = tokenizer
@@ -53,24 +81,22 @@ class Translator:
self.insert_src_start = [config.BOS]
self.insert_src_end = [config.EOS]
self.batch_first = model.batch_first
self.cuda = cuda
self.beam_size = beam_size
self.print_freq = print_freq
self.dataset_dir = dataset_dir
self.target_bleu = target_bleu
self.save_path = save_path
self.reference = reference
self.distributed = (utils.get_world_size() > 1)
self.generator = SequenceGenerator(
model=self.model,
beam_size=beam_size,
max_seq_len=max_seq_len,
cuda=cuda,
len_norm_factor=len_norm_factor,
len_norm_const=len_norm_const,
cov_penalty_factor=cov_penalty_factor)
def run(self, calc_bleu=True, epoch=None, iteration=None, summary=False,
reference_path=None):
def run(self, calc_bleu=True, epoch=None, iteration=None, eval_path=None,
summary=False, warmup=0, reference_path=None):
"""
Runs translation on test dataset.
@@ -78,47 +104,49 @@ class Translator:
BLEU score
:param epoch: index of the current epoch
:param iteration: index of the current iteration
:param eval_path: path to the file for saving results
:param summary: if True prints summary
:param reference_path: path to the file with reference translation
"""
test_bleu = 0.
break_training = False
if reference_path is None:
reference_path = self.reference
device = next(self.model.parameters()).device
test_bleu = torch.tensor([0.], device=device)
rank = utils.get_rank()
logging.info(f'Running evaluation on test set')
self.model.eval()
output = self.evaluate(epoch, iteration, summary)
# detokenize (BPE)
detok_output = []
for idx, pred in output:
pred = pred.tolist()
detok = self.tokenizer.detokenize(pred)
detok_output.append((idx, detok + '\n'))
output, eval_stats = self.evaluate(self.loader, epoch, iteration,
warmup, summary)
output = output[:len(self.loader.dataset)]
output = self.loader.dataset.unsort(output)
if rank == 0 and eval_path:
with open(eval_path, 'w') as eval_file:
lines = [line + '\n' for line in output]
eval_file.writelines(lines)
if calc_bleu:
test_bleu[0] = run_sacrebleu(eval_path, reference_path)
if summary:
logging.info(f'BLEU on test dataset: {test_bleu[0]:.2f}')
utils.barrier()
logging.info(f'Finished evaluation on test set')
if calc_bleu:
if detok_output:
indices, output = zip(*detok_output)
else:
indices, output = [], []
output = self.run_detokenizer(output)
reference_path = os.path.join(self.dataset_dir,
config.TGT_TEST_TARGET_FNAME)
bleu = seq2seq.inference.bleu.compute_bleu(output, indices,
reference_path)
logging.info(bleu)
test_bleu = round(bleu.score, 2)
if summary:
logging.info(f'BLEU on test dataset: {test_bleu:.2f}')
if self.target_bleu and test_bleu >= self.target_bleu:
logging.info(f'Target accuracy reached')
break_training = True
if self.distributed:
dist.broadcast(test_bleu, 0)
logging.info(f'Finished evaluation on test set')
if calc_bleu:
eval_stats['bleu'] = test_bleu[0].item()
else:
eval_stats['bleu'] = None
return test_bleu, break_training
return output, eval_stats
def evaluate(self, epoch, iteration, summary):
def evaluate(self, loader, epoch=0, iteration=0, warmup=0, summary=False):
"""
Runs evaluation on test dataset.
@@ -126,56 +154,58 @@ class Translator:
:param iteration: index of the current iteration
:param summary: if True prints summary
"""
batch_time = AverageMeter(False)
tot_tok_per_sec = AverageMeter(False)
iterations = AverageMeter(False)
enc_seq_len = AverageMeter(False)
dec_seq_len = AverageMeter(False)
device = next(self.model.parameters()).device
batch_time = utils.AverageMeter(warmup, keep=True)
tot_tok_per_sec = utils.AverageMeter(warmup, keep=True)
iterations = utils.AverageMeter()
enc_seq_len = utils.AverageMeter()
dec_seq_len = utils.AverageMeter()
stats = {}
batch_size = loader.batch_size
global_batch_size = batch_size * utils.get_world_size()
beam_size = self.beam_size
bos = [self.insert_target_start] * (batch_size * beam_size)
bos = torch.tensor(bos, dtype=torch.int64, device=device)
if self.batch_first:
bos = bos.view(-1, 1)
else:
bos = bos.view(1, -1)
if beam_size == 1:
generator = self.generator.greedy_search
else:
generator = self.generator.beam_search
output = []
for i, (src, indices) in enumerate(self.loader):
for i, (src, indices) in enumerate(loader):
translate_timer = time.time()
src, src_length = src
if self.batch_first:
batch_size = src.shape[0]
else:
batch_size = src.shape[1]
global_batch_size = batch_size * get_world_size()
beam_size = self.beam_size
bos = [self.insert_target_start] * (batch_size * beam_size)
bos = torch.LongTensor(bos)
if self.batch_first:
bos = bos.view(-1, 1)
else:
bos = bos.view(1, -1)
src_length = torch.LongTensor(src_length)
stats['total_enc_len'] = int(src_length.sum())
if self.cuda:
src = src.cuda()
bos = bos.cuda()
src = src.to(device)
src_length = src_length.to(device)
with torch.no_grad():
context = self.model.encode(src, src_length)
if self.cuda: src_length = src_length.cuda()
context = [context, src_length, None]
if beam_size == 1:
generator = self.generator.greedy_search
else:
generator = self.generator.beam_search
preds, lengths, counter = generator(batch_size, bos, context)
stats['total_dec_len'] = lengths.sum().item()
stats['iters'] = counter
for idx, pred in zip(indices, preds):
output.append((idx, pred))
indices = torch.tensor(indices).to(preds)
preds = preds.scatter(0, indices.unsqueeze(1).expand_as(preds), preds)
preds = gather_predictions(preds).cpu()
if self.tokenizer:
for pred in preds:
pred = pred.tolist()
detok = self.tokenizer.detokenize(pred)
output.append(detok)
elapsed = time.time() - translate_timer
batch_time.update(elapsed, batch_size)
@@ -188,15 +218,15 @@ class Translator:
enc_seq_len.update(stats['total_enc_len'] / batch_size, batch_size)
dec_seq_len.update(stats['total_dec_len'] / batch_size, batch_size)
if i % self.print_freq == 0:
if i % self.print_freq == self.print_freq - 1:
log = []
log += f'TEST '
if epoch is not None:
log += f'[{epoch}]'
if iteration is not None:
log += f'[{iteration}]'
log += f'[{i}/{len(self.loader)}]\t'
log += f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
log += f'[{i}/{len(loader)}]\t'
log += f'Time {batch_time.val:.4f} ({batch_time.avg:.4f})\t'
log += f'Decoder iters {iterations.val:.1f} ({iterations.avg:.1f})\t'
log += f'Tok/s {tot_tok_per_sec.val:.0f} ({tot_tok_per_sec.avg:.0f})'
log = ''.join(log)
@@ -208,11 +238,11 @@ class Translator:
batch_time.reduce('mean')
iterations.reduce('sum')
if summary and get_rank() == 0:
if summary and utils.get_rank() == 0:
time_per_sentence = (batch_time.avg / global_batch_size)
log = []
log += f'TEST SUMMARY:\n'
log += f'Lines translated: {len(self.loader.dataset)}\t'
log += f'Lines translated: {len(loader.dataset)}\t'
log += f'Avg total tokens/s: {tot_tok_per_sec.avg:.0f}\n'
log += f'Avg time per batch: {batch_time.avg:.3f} s\t'
log += f'Avg time per sentence: {1000*time_per_sentence:.3f} ms\n'
@@ -222,21 +252,9 @@ class Translator:
log = ''.join(log)
logging.info(log)
return output
def run_detokenizer(self, data):
"""
Executes moses detokenizer.
:param data: list of sentences to detokenize
"""
eval_stats = {}
eval_stats['tokens_per_sec'] = tot_tok_per_sec.avg
eval_stats['runtimes'] = batch_time.vals
eval_stats['throughputs'] = tot_tok_per_sec.vals
data = ''.join(data)
detok_path = os.path.join(self.dataset_dir, config.DETOKENIZER)
cmd = f'perl {detok_path}'
logging.info('Running moses detokenizer')
z = subprocess.run(cmd, shell=True, input=data.encode(),
stdout=subprocess.PIPE,
stderr=subprocess.DEVNULL)
output = z.stdout.decode().splitlines()
return output
return output, eval_stats
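A heavily hedged end-to-end sketch of the new inference path, mirroring how the pieces above fit together; `model` stands for an already-built GNMT network restored from a checkpoint (not shown), and the file names 'bpe.32000', 'newstest2014.en', 'newstest2014.de' and 'hyp.detok.txt' are placeholders rather than values defined in this diff.
tokenizer = Tokenizer(vocab_fname='vocab.bpe.32000', bpe_fname='bpe.32000',
                      lang={'src': 'en', 'tgt': 'de'})
dataset = RawTextDataset(raw_datafile='newstest2014.en', tokenizer=tokenizer,
                         sort=True)
loader = dataset.get_loader(batch_size=32, batch_first=model.batch_first)
translator = Translator(model, tokenizer, loader=loader, beam_size=5,
                        max_seq_len=150, reference='newstest2014.de')
output, eval_stats = translator.run(calc_bleu=True, eval_path='hyp.detok.txt',
                                    summary=True)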