更新transformer代码

c0f05c10 · hepj · c056df78 · c056df78 · c056df78 · c056df78
Commit c0f05c10 authored Nov 29, 2022 by hepj
20 changed files
--- a/PyTorch/NLP/Transformer/fairseq/prefixes/nonbreaking_prefix.en
+++ b/PyTorch/NLP/Transformer/fairseq/prefixes/nonbreaking_prefix.en
-#
-# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
-#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
-
-#Any single upper-case letter followed by a period is not a sentence ender (except, occasionally, "I", but we leave it in).
-#Upper-case letters are usually initials in a name.
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
-
-#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
-Adj
-Adm
-Adv
-Asst
-Bart
-Bldg
-Brig
-Bros
-Capt
-Cmdr
-Col
-Comdr
-Con
-Corp
-Cpl
-DR
-Dr
-Drs
-Ens
-Gen
-Gov
-Hon
-Hr
-Hosp
-Insp
-Lt
-MM
-MR
-MRS
-MS
-Maj
-Messrs
-Mlle
-Mme
-Mr
-Mrs
-Ms
-Msgr
-Op
-Ord
-Pfc
-Ph
-Prof
-Pvt
-Rep
-Reps
-Res
-Rev
-Rt
-Sen
-Sens
-Sfc
-Sgt
-Sr
-St
-Supt
-Surg
-
-#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
-v
-vs
-i.e
-rev
-e.g
-
-#Numeric-only prefixes. These are treated as non-breaking only when followed by a numeric sequence.
-#Append #NUMERIC_ONLY# after the word to enable this behaviour.
-#This case is mostly for the English "No.", which can either be a sentence of its own or,
-#if followed by a number, a non-breaking prefix.
-No #NUMERIC_ONLY# 
-Nos
-Art #NUMERIC_ONLY#
-Nr
-pp #NUMERIC_ONLY#
-
-#month abbreviations
-Jan
-Feb
-Mar
-Apr
-#May is a full word
-Jun
-Jul
-Aug
-Sep
-Oct
-Nov
-Dec
--- a/PyTorch/NLP/Transformer/fairseq/sequence_generator.py
+++ b/PyTorch/NLP/Transformer/fairseq/sequence_generator.py
-# Copyright (c) 2017-present, Facebook, Inc.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the LICENSE file in
-# the root directory of this source tree. An additional grant of patent rights
-# can be found in the PATENTS file in the same directory.
-#
-#-------------------------------------------------------------------------
-#
-# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-
-import torch
-import torch.nn.functional as F
-from torch.cuda import amp
-
-from fairseq import utils
-from fairseq.models import FairseqIncrementalDecoder
-
-
-class SequenceGenerator(object):
-    def __init__(
-        self,
-        models,
-        vocab_meta,
-        maxlen,
-        beam_size=1,
-        minlen=1,
-        stop_early=True,
-        normalize_scores=True,
-        len_penalty=1,
-        unk_penalty=0,
-        retain_dropout=False,
-        sampling=False,
-        sampling_topk=-1,
-        sampling_temperature=1,
-        use_amp=False,
-    ):
-        """Store the models and the decoding configuration.
-
-        Args:
-            models: the models used for decoding.
-            vocab_meta: mapping that provides the 'pad', 'unk' and 'eos'
-                symbol indices and the vocabulary size under 'len'.
-            maxlen, minlen: bounds on the length of the generated output
-                (not including the end-of-sentence marker).
-            beam_size: default number of beams per sentence.
-            stop_early: stop as soon as beam_size hypotheses are finalized,
-                even though longer hypotheses might score better after
-                normalization.
-            normalize_scores: normalize scores by the output length.
-            len_penalty: exponent applied to the length when normalizing.
-            unk_penalty: value subtracted from the score of the unk symbol.
-            retain_dropout: if False, models are put in eval mode before
-                decoding.
-            sampling, sampling_topk, sampling_temperature: sampling settings.
-            use_amp: enable autocast while generating.
-        """
-        self.models = models
-
-        # vocabulary description
-        self.vocab_size = vocab_meta['len']
-        self.eos = vocab_meta['eos']
-        self.unk = vocab_meta['unk']
-        self.pad = vocab_meta['pad']
-
-        # search limits; maxlen is stored exactly as given and is not clamped
-        # to the models' maximum decoder positions
-        self.beam_size = beam_size
-        self.minlen = minlen
-        self.maxlen = maxlen
-        self.stop_early = stop_early
-
-        # scoring
-        self.normalize_scores = normalize_scores
-        self.len_penalty = len_penalty
-        self.unk_penalty = unk_penalty
-
-        # sampling
-        self.sampling = sampling
-        self.sampling_topk = sampling_topk
-        self.sampling_temperature = sampling_temperature
-
-        # runtime switches
-        self.retain_dropout = retain_dropout
-        self.use_amp = use_amp
-
-    def cuda(self):
-        """Call ``cuda()`` on every model and return this generator for chaining."""
-        all_models = self.models
-        for net in all_models:
-            net.cuda()
-        return self
-
-    def generate_batched_itr(
-        self, data_itr, beam_size=None, maxlen_a=0.0, maxlen_b=None,
-        cuda=False, timer=None, prefix_size=0,
-    ):
-        """Translate every batch of ``data_itr`` and yield one result per sentence.
-
-        Args:
-            data_itr: iterable of samples; samples without a 'net_input'
-                entry are skipped.
-            beam_size: forwarded to ``generate``.
-            maxlen_a/b: sequences are limited to ``a * x + b`` tokens, where
-                x is the source length of the batch; b defaults to
-                ``self.maxlen``.
-            cuda: move each sample to the GPU before generating.
-            timer: optional meter; started before generation and stopped
-                with the number of tokens in the first hypothesis of every
-                sentence.
-            prefix_size: number of leading target tokens to force.
-
-        Yields:
-            Tuples ``(id, src, ref, hypos)`` where src/ref have padding
-            stripped and ref is None when the sample has no target.
-        """
-        length_offset = self.maxlen if maxlen_b is None else maxlen_b
-
-        for sample in data_itr:
-            batch = utils.move_to_cuda(sample) if cuda else sample
-            if 'net_input' not in batch:
-                continue
-
-            net_input = batch['net_input']
-            srclen = net_input['src_tokens'].size(1)
-
-            if timer is not None:
-                timer.start()
-            with torch.no_grad():
-                hypos = self.generate(
-                    net_input['src_tokens'],
-                    net_input['src_lengths'],
-                    beam_size=beam_size,
-                    maxlen=int(maxlen_a * srclen + length_offset),
-                    prefix_tokens=batch['target'][:, :prefix_size] if prefix_size > 0 else None,
-                )
-            if timer is not None:
-                generated = sum(len(sent_hypos[0]['tokens']) for sent_hypos in hypos)
-                timer.stop(generated)
-
-            for row, sample_id in enumerate(batch['id'].data):
-                # drop padding from the source and, when present, the reference
-                src = utils.strip_pad(net_input['src_tokens'].data[row, :], self.pad)
-                if batch['target'] is not None:
-                    ref = utils.strip_pad(batch['target'].data[row, :], self.pad)
-                else:
-                    ref = None
-                yield sample_id, src, ref, hypos[row]
-
-    def generate(self, src_tokens, src_lengths, beam_size=None, maxlen=None, prefix_tokens=None):
-        """Translate a batch; gradients are disabled and autocast follows ``self.use_amp``."""
-        with torch.no_grad(), amp.autocast(enabled=self.use_amp):
-            hypotheses = self._generate(src_tokens, src_lengths, beam_size, maxlen, prefix_tokens)
-        return hypotheses
-
-    def _generate(self, src_tokens, src_lengths, beam_size=None, maxlen=None, prefix_tokens=None):
-        """Run beam search (or sampling) over a batch and collect finished hypotheses.
-
-        Args:
-            src_tokens: 2-D tensor of source tokens; its size is unpacked as
-                (bsz, srclen).
-            src_lengths: source lengths handed to each model's encoder.
-            beam_size: beams per sentence; defaults to self.beam_size and is
-                capped at vocab_size - 1.
-            maxlen: length limit for this call; capped at self.maxlen.
-            prefix_tokens: optional tensor of target tokens that are forced
-                during the first prefix_tokens.size(1) steps.
-
-        Returns:
-            A list with one entry per input sentence. Each entry is a list of
-            hypothesis dicts with the keys 'tokens', 'score', 'attention',
-            'alignment' and 'positional_scores', sorted by score, best first.
-        """
-        bsz, srclen = src_tokens.size()
-        maxlen = min(maxlen, self.maxlen) if maxlen is not None else self.maxlen
-
-        # the max beam size is the dictionary size - 1, since we never select pad
-        beam_size = beam_size if beam_size is not None else self.beam_size
-        beam_size = min(beam_size, self.vocab_size - 1)
-
-        encoder_outs = []
-        incremental_states = {}
-        for model in self.models:
-            if not self.retain_dropout:
-                model.eval()
-            if isinstance(model.decoder, FairseqIncrementalDecoder):
-                incremental_states[model] = {}
-            else:
-                incremental_states[model] = None
-
-            # compute the encoder output for each beam
-            encoder_out = model.encoder(
-                src_tokens.repeat(1, beam_size).view(-1, srclen),
-                src_lengths.expand(beam_size, src_lengths.numel()).t().contiguous().view(-1),
-            )
-            encoder_outs.append(encoder_out)
-
-        # initialize buffers
-        scores = src_tokens.data.new(bsz * beam_size, maxlen + 1).float().fill_(0)
-        scores_buf = scores.clone()
-        tokens = src_tokens.data.new(bsz * beam_size, maxlen + 2).fill_(self.pad)
-        tokens_buf = tokens.clone()
-        # every hypothesis starts with EOS in position 0 (skipped again in finalize_hypos)
-        tokens[:, 0] = self.eos
-        attn, attn_buf = None, None
-        nonpad_idxs = None
-
-        # list of completed sentences
-        finalized = [[] for i in range(bsz)]
-        finished = [False for i in range(bsz)]
-        worst_finalized = [{'idx': None, 'score': -math.inf} for i in range(bsz)]
-        num_remaining_sent = bsz
-
-        # number of candidate hypos per step
-        cand_size = 2 * beam_size  # 2 x beam size in case half are EOS
-
-        # offset arrays for converting between different indexing schemes
-        bbsz_offsets = (torch.arange(0, bsz) * beam_size).unsqueeze(1).type_as(tokens)
-        cand_offsets = torch.arange(0, cand_size).type_as(tokens)
-
-        # helper function for allocating buffers on the fly
-        buffers = {}
-
-        def buffer(name, type_of=tokens):  # noqa
-            """Return the cached scratch tensor for name, creating an empty one like type_of on first use."""
-            if name not in buffers:
-                buffers[name] = type_of.new()
-            return buffers[name]
-
-        def is_finished(sent, step, unfinalized_scores=None):
-            """
-            Check whether we've finished generation for a given sentence, by
-            comparing the worst score among finalized hypotheses to the best
-            possible score among unfinalized hypotheses.
-            """
-            assert len(finalized[sent]) <= beam_size
-            if len(finalized[sent]) == beam_size:
-                if self.stop_early or step == maxlen or unfinalized_scores is None:
-                    return True
-                # stop if the best unfinalized score is worse than the worst
-                # finalized one
-                best_unfinalized_score = unfinalized_scores[sent].max()
-                if self.normalize_scores:
-                    best_unfinalized_score /= maxlen ** self.len_penalty
-                if worst_finalized[sent]['score'] >= best_unfinalized_score:
-                    return True
-            return False
-
-        def finalize_hypos(step, bbsz_idx, eos_scores, unfinalized_scores=None):
-            """
-            Finalize the given hypotheses at this step, while keeping the total
-            number of finalized hypotheses per sentence <= beam_size.
-            Note: the input must be in the desired finalization order, so that
-            hypotheses that appear earlier in the input are preferred to those
-            that appear later.
-            Args:
-                step: current time step
-                bbsz_idx: A vector of indices in the range [0, bsz*beam_size),
-                    indicating which hypotheses to finalize
-                eos_scores: A vector of the same size as bbsz_idx containing
-                    scores for each hypothesis
-                unfinalized_scores: A vector containing scores for all
-                    unfinalized hypotheses
-            """
-            assert bbsz_idx.numel() == eos_scores.numel()
-
-            # clone relevant token and attention tensors
-            tokens_clone = tokens.index_select(0, bbsz_idx)
-            tokens_clone = tokens_clone[:, 1:step + 2]  # skip the first index, which is EOS
-            tokens_clone[:, step] = self.eos
-            attn_clone = attn.index_select(0, bbsz_idx)[:, :, 1:step + 2] if attn is not None else None
-
-            # compute scores per token position
-            pos_scores = scores.index_select(0, bbsz_idx)[:, :step + 1]
-            pos_scores[:, step] = eos_scores
-            # convert from cumulative to per-position scores
-            pos_scores[:, 1:] = pos_scores[:, 1:] - pos_scores[:, :-1]
-
-            # normalize sentence-level scores
-            if self.normalize_scores:
-                eos_scores /= (step + 1) ** self.len_penalty
-
-            # cum_unfin[k] is the number of already finished sentences that
-            # precede the k-th unfinished one; adding it to an index into the
-            # shrunken batch gives the index of the original sentence
-            cum_unfin = []
-            prev = 0
-            for f in finished:
-                if f:
-                    prev += 1
-                else:
-                    cum_unfin.append(prev)
-
-            sents_seen = set()
-            for i, (idx, score) in enumerate(zip(bbsz_idx.tolist(), eos_scores.tolist())):
-                unfin_idx = idx // beam_size
-                sent = unfin_idx + cum_unfin[unfin_idx]
-
-                sents_seen.add((sent, unfin_idx))
-
-                def get_hypo():
-                    """Build the result dict for the i-th hypothesis being finalized."""
-
-                    if attn_clone is not None:
-                        # remove padding tokens from attn scores
-                        hypo_attn = attn_clone[i][nonpad_idxs[sent]]
-                        _, alignment = hypo_attn.max(dim=0)
-                    else:
-                        hypo_attn = None
-                        alignment = None
-
-                    return {
-                        'tokens': tokens_clone[i],
-                        'score': score,
-                        'attention': hypo_attn,  # src_len x tgt_len
-                        'alignment': alignment,
-                        'positional_scores': pos_scores[i],
-                    }
-
-                if len(finalized[sent]) < beam_size:
-                    finalized[sent].append(get_hypo())
-                elif not self.stop_early and score > worst_finalized[sent]['score']:
-                    # replace worst hypo for this sentence with new/better one
-                    worst_idx = worst_finalized[sent]['idx']
-                    if worst_idx is not None:
-                        finalized[sent][worst_idx] = get_hypo()
-
-                    # find new worst finalized hypo for this sentence
-                    idx, s = min(enumerate(finalized[sent]), key=lambda r: r[1]['score'])
-                    worst_finalized[sent] = {
-                        'score': s['score'],
-                        'idx': idx,
-                    }
-
-            newly_finished = []
-            for sent, unfin_idx in sents_seen:
-                # check termination conditions for this sentence
-                if not finished[sent] and is_finished(sent, step, unfinalized_scores):
-                    finished[sent] = True
-                    newly_finished.append(unfin_idx)
-            return newly_finished
-
-        reorder_state = None
-        batch_idxs = None
-        for step in range(maxlen + 1):  # one extra step for EOS marker
-            # reorder decoder internal states based on the prev choice of beams
-            if reorder_state is not None:
-                if batch_idxs is not None:
-                    # update beam indices to take into account removed sentences
-                    corr = batch_idxs - torch.arange(batch_idxs.numel()).type_as(batch_idxs)
-                    reorder_state.view(-1, beam_size).add_(corr.unsqueeze(-1) * beam_size)
-                for i, model in enumerate(self.models):
-                    if isinstance(model.decoder, FairseqIncrementalDecoder):
-                        model.decoder.reorder_incremental_state(incremental_states[model], reorder_state)
-                    encoder_outs[i] = model.encoder.reorder_encoder_out(*encoder_outs[i], reorder_state)
-
-            probs, avg_attn_scores = self._decode(tokens[:, :step + 1], encoder_outs, incremental_states)
-            if step == 0:
-                # at the first step all hypotheses are equally likely, so use
-                # only the first beam
-                probs = probs.unfold(0, 1, beam_size).squeeze(2).contiguous()
-                scores = scores.type_as(probs)
-                scores_buf = scores_buf.type_as(probs)
-            elif not self.sampling:
-                # make probs contain cumulative scores for each hypothesis
-                probs.add_(scores[:, step - 1].view(-1, 1))
-
-            probs[:, self.pad] = -math.inf  # never select pad
-            probs[:, self.unk] -= self.unk_penalty  # apply unk penalty
-
-            # Record attention scores
-            if avg_attn_scores is not None:
-                if attn is None:
-                    attn = scores.new(bsz * beam_size, src_tokens.size(1), maxlen + 2)
-                    attn_buf = attn.clone()
-                    nonpad_idxs = src_tokens.ne(self.pad)
-                attn[:, :, step + 1].copy_(avg_attn_scores)
-
-            cand_scores = buffer('cand_scores', type_of=scores)
-            cand_indices = buffer('cand_indices')
-            cand_beams = buffer('cand_beams')
-            eos_bbsz_idx = buffer('eos_bbsz_idx')
-            eos_scores = buffer('eos_scores', type_of=scores)
-            if step < maxlen:
-                if prefix_tokens is not None and step < prefix_tokens.size(1):
-                    probs_slice = probs.view(bsz, -1, probs.size(-1))[:, 0, :]
-                    cand_scores = torch.gather(
-                        probs_slice, dim=1,
-                        index=prefix_tokens[:, step].view(-1, 1).data
-                    ).expand(-1, cand_size)
-                    cand_indices = prefix_tokens[:, step].view(-1, 1).expand(bsz, cand_size).data
-                    cand_beams.resize_as_(cand_indices).fill_(0)
-                elif self.sampling:
-                    assert self.pad == 1, 'sampling assumes the first two symbols can be ignored'
-
-                    if self.sampling_topk > 0:
-                        values, indices = probs[:, 2:].topk(self.sampling_topk)
-                        exp_probs = values.div_(self.sampling_temperature).exp()
-                        if step == 0:
-                            torch.multinomial(exp_probs, beam_size, replacement=True, out=cand_indices)
-                        else:
-                            torch.multinomial(exp_probs, 1, replacement=True, out=cand_indices)
-                        torch.gather(exp_probs, dim=1, index=cand_indices, out=cand_scores)
-                        torch.gather(indices, dim=1, index=cand_indices, out=cand_indices)
-                        cand_indices.add_(2)
-                    else:
-                        exp_probs = probs.div_(self.sampling_temperature).exp_().view(-1, self.vocab_size)
-
-                        if step == 0:
-                            # we exclude the first two vocab items, one of which is pad
-                            torch.multinomial(exp_probs[:, 2:], beam_size, replacement=True, out=cand_indices)
-                        else:
-                            torch.multinomial(exp_probs[:, 2:], 1, replacement=True, out=cand_indices)
-
-                        cand_indices.add_(2)
-                        torch.gather(exp_probs, dim=1, index=cand_indices, out=cand_scores)
-
-                    cand_scores.log_()
-                    cand_indices = cand_indices.view(bsz, -1).repeat(1, 2)
-                    cand_scores = cand_scores.view(bsz, -1).repeat(1, 2)
-                    if step == 0:
-                        cand_beams = torch.zeros(bsz, cand_size).type_as(cand_indices)
-                    else:
-                        cand_beams = torch.arange(0, beam_size).repeat(bsz, 2).type_as(cand_indices)
-                        # make scores cumulative
-                        cand_scores.add_(
-                            torch.gather(
-                                scores[:, step - 1].view(bsz, beam_size), dim=1,
-                                index=cand_beams,
-                            )
-                        )
-                else:
-                    # take the best 2 x beam_size predictions. We'll choose the first
-                    # beam_size of these which don't predict eos to continue with.
-                    torch.topk(
-                        probs.view(bsz, -1),
-                        k=min(cand_size, probs.view(bsz, -1).size(1) - 1),  # -1 so we never select pad
-                        out=(cand_scores, cand_indices),
-                    )
-                    # cand_indices index the flattened (beam, vocab) axis:
-                    # split them into a beam id and a token id
-                    torch.div(cand_indices, self.vocab_size, out=cand_beams, rounding_mode='trunc')
-                    cand_indices.fmod_(self.vocab_size)
-            else:
-                # finalize all active hypotheses once we hit maxlen
-                # pick the hypothesis with the highest prob of EOS right now
-                torch.sort(
-                    probs[:, self.eos],
-                    descending=True,
-                    out=(eos_scores, eos_bbsz_idx),
-                )
-                num_remaining_sent -= len(finalize_hypos(
-                    step, eos_bbsz_idx, eos_scores))
-                assert num_remaining_sent == 0
-                break
-
-            # cand_bbsz_idx contains beam indices for the top candidate
-            # hypotheses, with a range of values: [0, bsz*beam_size),
-            # and dimensions: [bsz, cand_size]
-            cand_bbsz_idx = cand_beams.add(bbsz_offsets)
-
-            # finalize hypotheses that end in eos
-            eos_mask = cand_indices.eq(self.eos)
-
-            finalized_sents = set()
-            if step >= self.minlen:
-                # only consider eos when it's among the top beam_size indices
-                torch.masked_select(
-                    cand_bbsz_idx[:, :beam_size],
-                    mask=eos_mask[:, :beam_size],
-                    out=eos_bbsz_idx,
-                )
-                if eos_bbsz_idx.numel() > 0:
-                    torch.masked_select(
-                        cand_scores[:, :beam_size],
-                        mask=eos_mask[:, :beam_size],
-                        out=eos_scores,
-                    )
-                    finalized_sents = finalize_hypos(
-                        step, eos_bbsz_idx, eos_scores, cand_scores)
-                    num_remaining_sent -= len(finalized_sents)
-
-            assert num_remaining_sent >= 0
-            if num_remaining_sent == 0:
-                break
-            assert step < maxlen
-
-            if len(finalized_sents) > 0:
-                new_bsz = bsz - len(finalized_sents)
-
-                # construct batch_idxs which holds indices of batches to keep for the next pass
-                batch_mask = torch.ones(bsz).type_as(cand_indices)
-                batch_mask[cand_indices.new(finalized_sents)] = 0
-                batch_idxs = batch_mask.nonzero().squeeze(-1)
-
-                eos_mask = eos_mask[batch_idxs]
-                cand_beams = cand_beams[batch_idxs]
-                bbsz_offsets.resize_(new_bsz, 1)
-                cand_bbsz_idx = cand_beams.add(bbsz_offsets)
-
-                cand_scores = cand_scores[batch_idxs]
-                cand_indices = cand_indices[batch_idxs]
-                if prefix_tokens is not None:
-                    prefix_tokens = prefix_tokens[batch_idxs]
-
-                scores = scores.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1)
-                scores_buf.resize_as_(scores)
-                tokens = tokens.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1)
-                tokens_buf.resize_as_(tokens)
-                if attn is not None:
-                    attn = attn.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, attn.size(1), -1)
-                    attn_buf.resize_as_(attn)
-                bsz = new_bsz
-            else:
-                batch_idxs = None
-
-            # set active_mask so that values > cand_size indicate eos hypos
-            # and values < cand_size indicate candidate active hypos.
-            # After, the min values per row are the top candidate active hypos
-            active_mask = buffer('active_mask')
-            torch.add(
-                eos_mask.type_as(cand_offsets) * cand_size,
-                cand_offsets[:eos_mask.size(1)],
-                out=active_mask,
-            )
-
-            # get the top beam_size active hypotheses, which are just the hypos
-            # with the smallest values in active_mask
-            active_hypos, _ignore = buffer('active_hypos'), buffer('_ignore')
-            torch.topk(
-                active_mask, k=beam_size, dim=1, largest=False,
-                out=(_ignore, active_hypos)
-            )
-            active_bbsz_idx = buffer('active_bbsz_idx')
-            torch.gather(
-                cand_bbsz_idx, dim=1, index=active_hypos,
-                out=active_bbsz_idx,
-            )
-            active_scores = torch.gather(
-                cand_scores, dim=1, index=active_hypos,
-                out=scores[:, step].view(bsz, beam_size),
-            )
-
-            active_bbsz_idx = active_bbsz_idx.view(-1)
-            active_scores = active_scores.view(-1)
-
-            # copy tokens and scores for active hypotheses
-            torch.index_select(
-                tokens[:, :step + 1], dim=0, index=active_bbsz_idx,
-                out=tokens_buf[:, :step + 1],
-            )
-            torch.gather(
-                cand_indices, dim=1, index=active_hypos,
-                out=tokens_buf.view(bsz, beam_size, -1)[:, :, step + 1],
-            )
-            if step > 0:
-                torch.index_select(
-                    scores[:, :step], dim=0, index=active_bbsz_idx,
-                    out=scores_buf[:, :step],
-                )
-            torch.gather(
-                cand_scores, dim=1, index=active_hypos,
-                out=scores_buf.view(bsz, beam_size, -1)[:, :, step],
-            )
-
-            # copy attention for active hypotheses
-            if attn is not None:
-                torch.index_select(
-                    attn[:, :, :step + 2], dim=0, index=active_bbsz_idx,
-                    out=attn_buf[:, :, :step + 2],
-                )
-
-            # swap buffers
-            tokens, tokens_buf = tokens_buf, tokens
-            scores, scores_buf = scores_buf, scores
-            if attn is not None:
-                attn, attn_buf = attn_buf, attn
-
-            # reorder incremental state in decoder
-            reorder_state = active_bbsz_idx
-
-        # sort by score descending
-        for sent in range(len(finalized)):
-            finalized[sent] = sorted(finalized[sent], key=lambda r: r['score'], reverse=True)
-
-        return finalized
-
-    def _decode(self, tokens, encoder_outs, incremental_states):
-        """Score the next token with every model and combine the results.
-
-        With a single model its log-probabilities are returned directly.
-        With several models the per-model probabilities are averaged and the
-        log of the mean is returned; attention is averaged over all models
-        (models that return no attention add nothing to the sum).
-
-        Returns:
-            (log_probs, attn), where attn is None if no model produced one.
-        """
-        num_models = len(self.models)
-        if num_models == 1:
-            only_model, only_enc = self.models[0], encoder_outs[0]
-            return self._decode_one(tokens, only_model, only_enc, incremental_states, log_probs=True)
-
-        prob_sum, attn_sum = None, None
-        for model, encoder_out in zip(self.models, encoder_outs):
-            probs, attn = self._decode_one(tokens, model, encoder_out, incremental_states, log_probs=False)
-
-            # accumulate in place into the first model's outputs
-            if prob_sum is not None:
-                prob_sum.add_(probs)
-            else:
-                prob_sum = probs
-
-            if attn is None:
-                continue
-            if attn_sum is not None:
-                attn_sum.add_(attn)
-            else:
-                attn_sum = attn
-
-        # mean probability, then back to log space
-        prob_sum.div_(num_models)
-        prob_sum.log_()
-        if attn_sum is not None:
-            attn_sum.div_(num_models)
-        return prob_sum, attn_sum
-
-    def _decode_one(self, tokens, model, encoder_out, incremental_states, log_probs):
-        """Run one model's decoder and return scores for the last position.
-
-        Args:
-            tokens: target tokens generated so far.
-            model: the model whose decoder is called.
-            encoder_out: encoder output; its first two items are passed on
-                to the decoder.
-            incremental_states: per-model incremental state; the decoder
-                only receives ``incremental_state`` when the entry for
-                ``model`` is not None.
-            log_probs: return log-softmax if True, softmax otherwise.
-
-        Returns:
-            (probs, attn): float32 (log-)probabilities for the last time
-            step and the attention for that step, or None when the decoder
-            returned None or an empty tensor as attention.
-        """
-        state = incremental_states[model]
-        with torch.no_grad():
-            if state is None:
-                outputs = model.decoder(tokens, encoder_out[0], encoder_out[1])
-            else:
-                outputs = model.decoder(tokens, encoder_out[0], encoder_out[1], incremental_state=state)
-            outputs = list(outputs)
-
-            # keep only the newest time step
-            last_logits = outputs[0][:, -1, :]
-            attn = outputs[1]
-            if isinstance(attn, torch.Tensor) and attn.numel() == 0:
-                attn = None
-            elif attn is not None:
-                attn = attn[:, -1, :]
-
-        normalize = F.log_softmax if log_probs else F.softmax
-        return normalize(last_logits, dim=-1, dtype=torch.float32), attn
--- a/PyTorch/NLP/Transformer/fairseq/tokenizer.py
+++ b/PyTorch/NLP/Transformer/fairseq/tokenizer.py
-# Copyright (c) 2017-present, Facebook, Inc.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the LICENSE file in
-# the root directory of this source tree. An additional grant of patent rights
-# can be found in the PATENTS file in the same directory.
-#
-#-------------------------------------------------------------------------
-#
-# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-from collections import Counter
-import re
-
-import torch
-
-
-# Matches any run of whitespace; tokenize_line collapses each run to one space.
-# Raw string: "\s" in a normal string literal is an invalid escape sequence.
-SPACE_NORMALIZER = re.compile(r"\s+")
-
-# The prefix list lives in the 'prefixes' directory next to this module.
-path = os.path.join(os.path.dirname(os.path.abspath(__file__)),'prefixes/nonbreaking_prefix.en')
-# Maps each prefix to 1 (a following period never ends the sentence) or to
-# 2 (entries tagged #NUMERIC_ONLY#: non-breaking only when a number follows).
-prefixes ={}
-
-# Explicit encoding so that loading does not depend on the locale.
-with open(path, 'r', encoding='utf-8') as f:
-    for line in f:
-        line = line.strip()
-        # skip blank lines and '#' comment lines
-        if line and not line.startswith('#'):
-            match = re.search(r'(.*)[\s]+(\#NUMERIC_ONLY\#)', line)
-            if match:
-                prefixes[match.group(1)] = 2
-            else:
-                prefixes[line] = 1
-
-def get_unicode_categories():
-    """Group every Unicode character by its general category.
-
-    Returns:
-        A defaultdict mapping each category code reported by
-        ``unicodedata.category`` to the list of characters in that category,
-        in code point order.
-    """
-    import sys
-    import unicodedata
-    from collections import defaultdict
-
-    by_category = defaultdict(list)
-    for codepoint in range(sys.maxunicode + 1):
-        char = chr(codepoint)
-        by_category[unicodedata.category(char)].append(char)
-    return by_category
-
-# Every character of Unicode category "No", joined into one string; tokenize_en
-# splices it into a regex character class.
-NUMERICS = ''.join(get_unicode_categories()['No'])
-
-def tokenize_line(line):
-    """Collapse each run of whitespace in ``line`` to one space and trim both ends."""
-    return SPACE_NORMALIZER.sub(" ", line).strip()
-
-def tokenize_en(line):
-    """Tokenize a line of English text and escape special characters.
-
-    Punctuation is split off into separate space-delimited tokens, hyphens
-    between word characters become `` @-@ ``, runs of dots stay together,
-    and a word-final period is detached unless the word is listed in
-    ``prefixes`` (or otherwise looks like an abbreviation). Finally the
-    characters ``& | < > ' " [ ]`` are replaced by their entity escapes.
-
-    Args:
-        line: the raw input string.
-
-    Returns:
-        The tokenized string terminated by a newline. Empty or
-        whitespace-only input yields a lone newline instead of raising
-        IndexError.
-    """
-    line = line.strip()
-    line = ' ' + line + ' '
-    # remove ASCII junk
-    line = re.sub(r'\s+', ' ', line)
-    line = re.sub(r'[\x00-\x1F]', '', line)
-    # fix whitespaces
-    line = re.sub(r' +', ' ', line)
-    line = re.sub(r'^ ', '', line)
-    line = re.sub(r' $', '', line)
-    # separate other special characters
-    line = re.sub(r'([^\s\.\'\`\,\-\w]|[_'+NUMERICS+'])', r' \g<1> ', line)
-    line = re.sub(r'(\w)\-(?=\w)', r'\g<1> @-@ ', line)
-
-    # multidots stay together
-    line = re.sub(r'\.([\.]+)', r' DOTMULTI\g<1>', line)
-    while re.search(r'DOTMULTI\.', line):
-        line = re.sub(r'DOTMULTI\.([^\.])', r'DOTDOTMULTI \g<1>', line)
-        line = re.sub(r'DOTMULTI\.', r'DOTDOTMULTI', line)
-
-    # separate out "," except if within numbers (5,300)
-    line = re.sub(r'([\D])[,]', r'\g<1> , ', line)
-    line = re.sub(r'[,]([\D])', r' , \g<1>', line)
-
-    # separate "," after a number if it's the end of sentence
-    line = re.sub(r'(\d)[,]$', r'\g<1> ,', line)
-
-    # split contractions right
-    # (replacement templates are double-quoted raw strings so that the
-    # apostrophe needs no backslash and "\g" is not an invalid escape)
-    line = re.sub(r'([\W\d])[\']([\W\d])', r"\g<1> ' \g<2>", line)
-    line = re.sub(r'(\W)[\']([\w\D])', r"\g<1> ' \g<2>", line)
-    line = re.sub(r'([\w\D])[\']([\W\d])', r"\g<1> ' \g<2>", line)
-    line = re.sub(r'([\w\D])[\']([\w\D])', r"\g<1> '\g<2>", line)
-    # special case for "1990's"
-    line = re.sub(r'([\W\d])[\']([s])', r"\g<1> '\g<2>", line)
-
-    # apply nonbreaking prefixes
-    words = line.split()
-    last = len(words) - 1
-    tokens = []
-    for i, word in enumerate(words):
-        match = re.search(r'^(\S+)\.$', word)
-        if match:
-            pre = match.group(1)
-            if i == last:
-                # split last words independently as they are unlikely to be non-breaking prefixes
-                word = pre + ' .'
-            elif not (
-                # contains a period and at least one letter, e.g. "U.S"
-                (re.search(r'\.', pre) and re.search(r'[^\.\W\d]', pre))
-                # listed as an unconditional non-breaking prefix
-                or prefixes.get(pre) == 1
-                # next word starts in lower case, so the sentence continues
-                or re.search(r'^[a-z]', words[i + 1])
-                # numeric-only prefix that is followed by a number
-                or (prefixes.get(pre) == 2 and re.search(r'^[0-9]+', words[i + 1]))
-            ):
-                word = pre + ' .'
-        tokens.append(word)
-
-    # a single-space join leaves no extraneous spaces to clean up
-    line = ' '.join(tokens)
-
-    # .' at end of sentence is missed
-    line = re.sub(r"\.' ?$", " . ' ", line)
-
-    # restore multi-dots
-    while re.search('DOTDOTMULTI', line):
-        line = re.sub('DOTDOTMULTI', 'DOTMULTI.', line)
-
-    line = re.sub('DOTMULTI', '.', line)
-
-    # escape special characters; '&' must go first so that the ampersands
-    # introduced by the other escapes are not escaped again
-    for char, entity in (
-        ('&', '&amp;'),
-        ('|', '&#124;'),
-        ('<', '&lt;'),
-        ('>', '&gt;'),
-        ("'", '&apos;'),
-        ('"', '&quot;'),
-        ('[', '&#91;'),
-        (']', '&#93;'),
-    ):
-        line = line.replace(char, entity)
-
-    # ensure final line breaks (endswith is safe on an empty string)
-    if not line.endswith('\n'):
-        line += '\n'
-
-    return line
-
def deescape(line):
    """Turn the XML entities in *line* back into their literal characters.

    ``&amp;`` is handled last so that an escaped ampersand followed by entity
    text (e.g. ``&amp;lt;``) is unescaped only once.
    """
    entity_table = (
        ('&#124;', '|'),
        ('&lt;', '<'),
        ('&gt;', '>'),
        ('&quot;', '"'),
        ('&apos;', "'"),
        ('&#91;', '['),
        ('&#93;', ']'),
        ('&amp;', '&'),
    )
    for entity, literal in entity_table:
        line = line.replace(entity, literal)
    return line
-
-
class Tokenizer:
    """Static helpers that convert between text lines and dictionary indices."""

    @staticmethod
    def add_file_to_dictionary(filename, dict, tokenize):
        """Add every token of every line in *filename* to *dict*.

        Each line is passed through *tokenize* and split on whitespace; after
        the words of a line, ``dict.eos_word`` is added once as well.
        """
        with open(filename, 'r') as f:
            for line in f:
                for word in tokenize(line).split():
                    dict.add_symbol(word)
                dict.add_symbol(dict.eos_word)

    @staticmethod
    def binarize(filename, dict, consumer, tokenize=tokenize_line,
                 append_eos=True, reverse_order=False):
        """Convert each line of *filename* to an id tensor and hand it to *consumer*.

        Words are looked up without being added to *dict*. A word counts as
        replaced when its index is ``dict.unk_index`` although the word itself
        is not ``dict.unk_word``.

        Returns:
            A dict with ``nseq`` (lines read), ``ntok`` (total tensor length),
            ``nunk`` (replaced word occurrences) and ``replaced`` (distinct
            replaced words).
        """
        nseq, ntok = 0, 0
        replaced = Counter()

        def replaced_consumer(word, idx):
            if idx == dict.unk_index and word != dict.unk_word:
                replaced.update([word])

        with open(filename, 'r') as f:
            for line in f:
                ids = Tokenizer.tokenize(
                    line=line,
                    dictionary=dict,
                    tokenize=tokenize,
                    add_if_not_exist=False,
                    consumer=replaced_consumer,
                    append_eos=append_eos,
                    reverse_order=reverse_order,
                )
                nseq += 1

                consumer(ids)
                ntok += len(ids)
        return {'nseq': nseq, 'nunk': sum(replaced.values()), 'ntok': ntok, 'replaced': len(replaced)}

    @staticmethod
    def tokenize(line, dictionary, tokenize=tokenize_line, add_if_not_exist=True,
                 consumer=None, append_eos=True, reverse_order=False, bpe=None):
        """Map one text line to a ``torch.IntTensor`` of dictionary indices.

        Args:
            line: input text.
            dictionary: object providing ``add_symbol``, ``index`` and ``eos_index``.
            tokenize: callable applied to *line* before whitespace splitting.
            add_if_not_exist: add unseen words to *dictionary* instead of only
                looking them up.
            consumer: optional callback invoked as ``consumer(word, idx)``.
            append_eos: append ``dictionary.eos_index`` as the last element.
            reverse_order: reverse the word order before indexing.
            bpe: optional object whose ``process_line`` is applied after *tokenize*.

        Returns:
            The tensor of indices, one per word (plus one for EOS if requested).
        """
        line = tokenize(line)
        if bpe:
            line = bpe.process_line(line)
        words = line.split()
        if reverse_order:
            words = list(reversed(words))
        nwords = len(words)
        ids = torch.IntTensor(nwords + 1 if append_eos else nwords)

        for i, word in enumerate(words):
            if add_if_not_exist:
                idx = dictionary.add_symbol(word)
            else:
                idx = dictionary.index(word)
            if consumer is not None:
                consumer(word, idx)
            ids[i] = idx
        if append_eos:
            ids[nwords] = dictionary.eos_index
        return ids

    @staticmethod
    def detokenize(line, lang):
        """Join a tokenized line back into naturally spaced text.

        Args:
            line: whitespace-separated tokens, possibly containing `` @-@ ``
                hyphen markers and XML entities.
            lang: language code; ``'en'``, ``'fr'``, ``'it'``, ``'cs'`` and
                ``'fi'`` select extra rules below.

        Returns:
            The detokenized text ending in a newline. Lines that look like a
            single XML/HTML tag, and blank lines, are returned unchanged.
        """
        # don't try to detokenize XML/HTML tag lines
        if re.search(r'^<.+>$', line) or re.search(r'^\s*$', line):
            return line

        line = line.strip()
        line = ' '+line+' '
        line = re.sub(r' @-@ ', '-', line)
        line = deescape(line)
        words = line.split()
        line = ''
        # Number of quotes of each kind seen so far; parity decides whether
        # the next quote opens (attaches right) or closes (attaches left).
        quote_count = {'\'':0, '\"':0}
        prepend_space = ' '
        for i in range(len(words)):
            # perform right shift of currency and some punctuation
            if re.search(r'^[\u20ac\x24\(\[\{]+$', words[i]):
                line += prepend_space + words[i]
                prepend_space = ''
            elif re.search(r'^[\,\.\?\!\:\;\\\%\}\]\)]+$', words[i]):
                if lang=='fr' and re.search(r'^[\?\!\:\;\\\%]$', words[i]):
                    line += ' '
                line += words[i]
                prepend_space = ' '
            elif lang=='en' and i>0 and re.search(r'^[\'][\w\D]', words[i]) and re.search(r'\w$', words[i-1]):
                line += words[i]
                prepend_space = ' '
            elif lang=='cs' and i>1 and re.search(r'^\d+$', words[i-2]) and re.search(r'^[.,]$', words[i-1]) and re.search(r'^\w+$', words[i]):
                line += words[i]
                prepend_space = ' '
            elif (lang=='fr' or lang=='it') and i<len(words)-1 and re.search(r'[\w\D][\']$', words[i]) and re.search(r'^[\w\D]', words[i+1]):
                line += prepend_space + words[i]
                prepend_space = ''
            elif lang=='cs' and i<len(words)-3 and \
                    re.search(r'[\w\D]$', words[i]) and \
                    re.search(r'^-$', words[i+1]) and \
                    re.search(r'^li$|^mail.*', words[i+2], re.I):
                #line += ' '+words[i]+words[i+1]
                pass #TODO: skip one word
            elif re.search(r'^[\'\"\x60\u201c\u201d]+$', words[i]):
                normalized_quo = '\"' if re.search(r'^[\u201c\u201d]+$', words[i]) else words[i]
                quote_count[normalized_quo] = 0 if normalized_quo not in quote_count else quote_count[normalized_quo]
                if lang=='cs' and words[i] == '\u201c':
                    quote_count[normalized_quo] = 0
                if lang=='cs' and words[i] == '\u201d':
                    quote_count[normalized_quo] = 1
                if quote_count[normalized_quo] % 2 == 0:
                    if lang=='en' and words[i]=='\'' and i > 0 and re.search(r'[s]$', words[i-1]):
                        # single quote for possessives ending in s... "The Jones' house"
                        # left shift
                        line += words[i]
                        prepend_space = ' '
                    else:
                        # right shift
                        line += prepend_space + words[i]
                        prepend_space = ''
                        quote_count[normalized_quo] += 1
                else:
                    # left shift
                    line += words[i]
                    prepend_space = ' '
                    quote_count[normalized_quo] += 1
            # i>0 keeps words[i-1] from wrapping around to the last word when
            # the first token is examined.
            elif lang=='fi' and i>0 and re.search(r':$', words[i-1]) and re.search(r'^(N|n|A|a|Ä|ä|ssa|Ssa|ssä|Ssä|sta|stä|Sta|Stä|hun|Hun|hyn|Hyn|han|Han|hän|Hän|hön|Hön|un|Un|yn|Yn|an|An|än|Än|ön|Ön|seen|Seen|lla|Lla|llä|Llä|lta|Lta|ltä|Ltä|lle|Lle|ksi|Ksi|kse|Kse|tta|Tta|ine|Ine)(ni|si|mme|nne|nsa)?(ko|kö|han|hän|pa|pä|kaan|kään|kin)?$', words[i]):
                line += words[i].lower()
                prepend_space = ' '
            else:
                line += prepend_space + words[i]
                prepend_space = ' '

        # clean up spaces at head and tail of each line as well as any double-spacing
        line = re.sub(r' +', ' ', line)
        line = re.sub(r'\n ', '\n', line)
        line = re.sub(r' \n', '\n', line)
        line = re.sub(r'^ ', '', line)
        line = re.sub(r' $', '', line)

        # add trailing break
        if not line.endswith('\n'):
            line += '\n'

        return line
--- a/PyTorch/NLP/Transformer/fairseq/utils.py
+++ b/PyTorch/NLP/Transformer/fairseq/utils.py
-# Copyright (c) 2017-present, Facebook, Inc.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the LICENSE file in
-# the root directory of this source tree. An additional grant of patent rights
-# can be found in the PATENTS file in the same directory.
-#
-#--------------------------------------------------------------------
-#
-# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from collections import defaultdict, OrderedDict
-import logging
-import os
-import re
-import torch
-import traceback
-
-from torch.serialization import default_restore_location
-
-
def torch_persistent_save(*args, **kwargs):
    """Call ``torch.save`` with up to three attempts.

    Arguments are forwarded unchanged. The result of the first successful
    ``torch.save`` call is returned. If all three attempts raise, the
    traceback of the last failure is logged and ``None`` is returned.
    """
    attempts = 3
    for attempt in range(1, attempts + 1):
        try:
            return torch.save(*args, **kwargs)
        except Exception:
            # Only the final failure is reported; earlier ones are retried.
            if attempt == attempts:
                logging.error(traceback.format_exc())
-
-
def convert_state_dict_type(state_dict, ttype=torch.FloatTensor):
    """Recursively convert every tensor in *state_dict* to *ttype*.

    Args:
        state_dict: a tensor, or an arbitrarily nested structure of dicts and
            lists containing tensors.
        ttype: tensor type passed to ``Tensor.type``; defaults to
            ``torch.FloatTensor``.

    Returns:
        A structure of the same shape: dicts become ``OrderedDict``, lists
        become new lists, tensors are converted, and any other value is
        returned unchanged.
    """
    if isinstance(state_dict, dict):
        cpu_dict = OrderedDict()
        for k, v in state_dict.items():
            # Forward ttype so nested values honour a non-default type too.
            cpu_dict[k] = convert_state_dict_type(v, ttype)
        return cpu_dict
    elif isinstance(state_dict, list):
        return [convert_state_dict_type(v, ttype) for v in state_dict]
    elif torch.is_tensor(state_dict):
        return state_dict.type(ttype)
    else:
        return state_dict
-
-
def save_state(filename, args, model, criterion, optimizer, lr_scheduler,
               num_updates, optim_history=None, extra_state=None):
    """Assemble a checkpoint dictionary and write it to *filename*.

    The checkpoint holds ``args``, the converted model state, the optimizer
    history extended by one entry for the current run (criterion and optimizer
    class names, LR scheduler state, update count), the converted optimizer
    state and ``extra_state``. ``optim_history`` and ``extra_state`` default
    to an empty list and an empty dict.
    """
    history = [] if optim_history is None else optim_history
    extras = {} if extra_state is None else extra_state

    model_state = convert_state_dict_type(model.state_dict())
    current_entry = {
        'criterion_name': criterion.__class__.__name__,
        'optimizer_name': optimizer.__class__.__name__,
        'lr_scheduler_state': lr_scheduler.state_dict(),
        'num_updates': num_updates,
    }
    optimizer_state = convert_state_dict_type(optimizer.state_dict())

    checkpoint = {
        'args': args,
        'model': model_state,
        'optimizer_history': history + [current_entry],
        'last_optimizer_state': optimizer_state,
        'extra_state': extras,
    }
    torch_persistent_save(checkpoint, filename)
-
-
def load_model_state(filename, model):
    """Load a checkpoint from *filename* into *model*.

    Returns:
        ``(extra_state, optimizer_history, last_optimizer_state)`` taken from
        the checkpoint, or ``(None, [], None)`` when *filename* does not exist.

    Raises:
        Exception: if the stored parameters cannot be loaded into *model*.
    """
    if not os.path.exists(filename):
        return None, [], None

    def _restore_on_cpu(storage, location):
        # Always materialize storages on the CPU, whatever device saved them.
        return default_restore_location(storage, 'cpu')

    state = torch.load(filename, map_location=_restore_on_cpu)

    # load model parameters
    try:
        model.load_state_dict(state['model'], strict=True)
    except Exception:
        raise Exception('Cannot load model parameters from checkpoint, '
                        'please ensure that the architectures match')

    extra_state = state['extra_state']
    optimizer_history = state['optimizer_history']
    last_optimizer_state = state['last_optimizer_state']
    return extra_state, optimizer_history, last_optimizer_state
-
-
def move_to_cuda(sample):
    """Return a copy of *sample* with every tensor moved via ``.cuda()``.

    Dicts and lists are traversed recursively; any other value is passed
    through unchanged. An empty *sample* always yields an empty dict.
    """
    def _transfer(obj):
        if torch.is_tensor(obj):
            return obj.cuda()
        if isinstance(obj, dict):
            return {name: _transfer(member) for name, member in obj.items()}
        if isinstance(obj, list):
            return [_transfer(member) for member in obj]
        return obj

    if len(sample) == 0:
        return {}
    return _transfer(sample)
-
-
# Per-class counter used to hand out instance ids; starts at 0 for every class name.
INCREMENTAL_STATE_INSTANCE_ID = defaultdict(lambda: 0)


def _get_full_incremental_state_key(module_instance, key):
    """Build the ``'<ClassName>.<instance id>.<key>'`` string for *module_instance*."""
    module_name = module_instance.__class__.__name__

    # assign a unique ID to each module instance, so that incremental state is
    # not shared across module instances
    if not hasattr(module_instance, '_fairseq_instance_id'):
        next_id = INCREMENTAL_STATE_INSTANCE_ID[module_name] + 1
        INCREMENTAL_STATE_INSTANCE_ID[module_name] = next_id
        module_instance._fairseq_instance_id = next_id

    return '{}.{}.{}'.format(module_name, module_instance._fairseq_instance_id, key)


def get_incremental_state(module, incremental_state, key):
    """Helper for getting incremental state for an nn.Module.

    Returns ``None`` when *incremental_state* is ``None`` or has no entry for
    this module/key combination.
    """
    full_key = _get_full_incremental_state_key(module, key)
    if incremental_state is not None and full_key in incremental_state:
        return incremental_state[full_key]
    return None


def set_incremental_state(module, incremental_state, key, value):
    """Helper for setting incremental state for an nn.Module.

    Does nothing when *incremental_state* is ``None``.
    """
    if incremental_state is None:
        return
    incremental_state[_get_full_incremental_state_key(module, key)] = value
-
-
def load_align_dict(replace_unk):
    """Build the alignment dictionary used for unknown word replacement.

    Args:
        replace_unk: ``None`` to disable replacement, a path (str) to an
            alignment file, or any other value to request replacement without
            a file.

    Returns:
        ``None`` if *replace_unk* is ``None``; a dict mapping the first
        whitespace-separated column of each file line to the second if a path
        was given; otherwise an empty dict (unknown words are then replaced by
        copying the original source word).
    """
    if replace_unk is None:
        return None
    if not isinstance(replace_unk, str):
        # No alignment dictionary provided but we still want to perform
        # unknown word replacement by copying the original source word.
        return {}

    # Load alignment dictionary for unknown word replacement if it was passed as an argument.
    align_dict = {}
    with open(replace_unk, 'r') as f:
        for line in f:
            fields = line.split()
            align_dict[fields[0]] = fields[1]
    return align_dict
-
-
def print_embed_overlap(embed_dict, vocab_dict):
    """Print how many entries of ``vocab_dict.symbols`` are also keys of *embed_dict*."""
    shared = set(embed_dict.keys()).intersection(set(vocab_dict.symbols))
    message = "| Found {}/{} types in embedding file.".format(len(shared), len(vocab_dict))
    print(message)
-
-
def parse_embedding(embed_path):
    """Parse embedding text file into a dictionary of word and embedding tensors.

    The first line is always skipped (it can hold vocabulary size and
    dimension). The following lines should contain a word followed by its
    embedding values, separated by whitespace; runs of several spaces, as in
    the example below, are accepted. Blank lines are ignored.

    Example:
        2 5
        the -0.0230 -0.0264  0.0287  0.0171  0.1403
        at -0.0395 -0.1286  0.0275  0.0254 -0.0932

    Args:
        embed_path: path of the text file to read.

    Returns:
        A dict mapping each word to a ``torch.Tensor`` of its weights. An
        empty file yields an empty dict.
    """
    embed_dict = {}
    with open(embed_path) as f_embed:
        next(f_embed, None)  # skip header; the default avoids StopIteration on an empty file
        for line in f_embed:
            # split() without arguments collapses repeated whitespace, so
            # aligned columns do not produce empty pieces that break float().
            pieces = line.split()
            if not pieces:
                continue
            embed_dict[pieces[0]] = torch.Tensor([float(weight) for weight in pieces[1:]])
    return embed_dict
-
-
def load_embedding(embed_dict, vocab, embedding):
    """Copy vectors from *embed_dict* into the rows of ``embedding.weight``.

    For every index of *vocab* whose token is a key of *embed_dict*, the
    matching row of ``embedding.weight.data`` is overwritten. The same
    *embedding* object is returned.
    """
    weights = embedding.weight.data
    # Index explicitly: vocab is only required to support len() and [].
    for row in range(len(vocab)):
        word = vocab[row]
        if word not in embed_dict:
            continue
        weights[row] = embed_dict[word]
    return embedding
-
-
def replace_unk(hypo_str, src_str, alignment, align_dict, unk):
    """Replace unknown tokens in a hypothesis with aligned source words.

    Args:
        hypo_str: hypothesis text.
        src_str: source text.
        alignment: for each hypothesis token position, the index of the
            aligned source token.
        align_dict: mapping from source word to replacement; a source word
            missing from it is copied as is.
        unk: the string that marks an unknown token in *hypo_str*.

    Returns:
        The hypothesis tokens joined by single spaces, with every *unk* token
        replaced.
    """
    from fairseq import tokenizer

    def _as_tokens(tokenized):
        # tokenize_line returns a normalized string; split it into a mutable
        # list of tokens (a list result is copied unchanged).
        return tokenized.split() if isinstance(tokenized, str) else list(tokenized)

    # Tokens are strings here
    hypo_tokens = _as_tokens(tokenizer.tokenize_line(hypo_str))
    # TODO: Very rare cases where the replacement is '<eos>' should be handled gracefully
    src_tokens = _as_tokens(tokenizer.tokenize_line(src_str)) + ['<eos>']
    for i, ht in enumerate(hypo_tokens):
        if ht == unk:
            src_token = src_tokens[alignment[i]]
            # Either take the corresponding value in the aligned dictionary or just copy the original value.
            hypo_tokens[i] = align_dict.get(src_token, src_token)
    return ' '.join(hypo_tokens)
-
-
def post_process_prediction(hypo_tokens, src_str, alignment, align_dict, tgt_dict, remove_bpe):
    """Turn hypothesis token ids into a string and apply optional post-processing.

    The string comes from ``tgt_dict.string(hypo_tokens, remove_bpe)``. When
    *align_dict* is given, unknown tokens are replaced via ``replace_unk``.
    When *align_dict* or *remove_bpe* is given, the string is tokenized again
    with *tgt_dict* (which may add new symbols to it).

    Returns:
        ``(hypo_tokens, hypo_str, alignment)``.
    """
    from fairseq import tokenizer

    use_alignment = align_dict is not None
    hypo_str = tgt_dict.string(hypo_tokens, remove_bpe)
    if use_alignment:
        unk_string = tgt_dict.unk_string()
        hypo_str = replace_unk(hypo_str, src_str, alignment, align_dict, unk_string)
    if use_alignment or remove_bpe is not None:
        # Convert back to tokens for evaluating with unk replacement or without BPE
        # Note that the dictionary can be modified inside the method.
        hypo_tokens = tokenizer.Tokenizer.tokenize(hypo_str, tgt_dict, add_if_not_exist=True)
    return hypo_tokens, hypo_str, alignment
-
-
def make_positions(tensor, padding_idx, left_pad):
    """Replace non-padding symbols with their position numbers.

    Position numbers begin at padding_idx+1.

    Padding symbols are ignored, but it is necessary to specify whether padding
    is added on the left side (left_pad=True) or right side (left_pad=False).

    A range buffer is cached on the function object and reused across calls.
    """
    seq_len = tensor.size(1)
    first_pos = padding_idx + 1
    max_pos = first_pos + seq_len

    if not hasattr(make_positions, 'range_buf'):
        make_positions.range_buf = torch.arange(first_pos, 768,
                                                dtype=tensor.dtype, device=tensor.device)
    make_positions.range_buf = make_positions.range_buf.type_as(tensor)
    if make_positions.range_buf.numel() < max_pos:
        # Refill the cached buffer in place so it covers this sequence length.
        torch.arange(first_pos, max_pos, out=make_positions.range_buf)

    non_pad = tensor.ne(padding_idx)
    positions = make_positions.range_buf[:seq_len].expand_as(tensor)
    if left_pad:
        # Shift each row so that its first non-padding symbol gets first_pos.
        tokens_per_row = non_pad.long().sum(dim=1).unsqueeze(1)
        positions = positions - non_pad.size(1) + tokens_per_row
    result = tensor.clone()
    return result.masked_scatter_(non_pad, positions[non_pad])
-
-
def strip_pad(tensor, pad):
    """Return the elements of *tensor* that differ from *pad*."""
    keep = tensor.ne(pad)
    return tensor[keep]
-
-
def buffered_arange(max):
    """Return ``[0, 1, ..., max-1]`` as a slice of a cached ``LongTensor``.

    The buffer lives on the function object and is only refilled when a
    larger range than the current one is requested.
    """
    cached = getattr(buffered_arange, 'buf', None)
    if cached is None:
        cached = buffered_arange.buf = torch.LongTensor()
    if cached.numel() < max:
        torch.arange(max, out=cached)
    return cached[:max]
-
-
def convert_padding_direction(src_tokens, padding_idx, right_to_left=False, left_to_right=False):
    """Rotate each row of *src_tokens* so its padding ends up on the other side.

    Exactly one of *right_to_left* and *left_to_right* must be true. The input
    tensor itself is returned when it holds no padding, or when the first
    column (for *left_to_right*) or last column (for *right_to_left*) holds no
    padding.
    """
    assert right_to_left ^ left_to_right
    pad_mask = src_tokens.eq(padding_idx)
    if not pad_mask.any():
        # no padding, return early
        return src_tokens
    if left_to_right and not pad_mask[:, 0].any():
        # already right padded
        return src_tokens
    if right_to_left and not pad_mask[:, -1].any():
        # already left padded
        return src_tokens

    max_len = src_tokens.size(1)
    columns = buffered_arange(max_len).type_as(src_tokens).expand_as(src_tokens)
    num_pads = pad_mask.long().sum(dim=1, keepdim=True)
    # Each row is rotated by its own padding count, in the requested direction.
    shifted = columns - num_pads if right_to_left else columns + num_pads
    index = torch.remainder(shifted, max_len)
    return src_tokens.gather(1, index)
-
-
def item(tensor):
    """Extract a single value from *tensor*.

    Uses ``tensor.item()`` when available, otherwise ``tensor[0]`` when the
    object is indexable, otherwise returns the object itself.
    """
    if hasattr(tensor, 'item'):
        return tensor.item()
    return tensor[0] if hasattr(tensor, '__getitem__') else tensor
-
-
def clip_grad_norm_(tensor, max_norm):
    """Scale *tensor* in place so its norm does not exceed *max_norm*.

    Clipping only happens when *max_norm* is positive and the current norm is
    larger than it.

    Returns:
        The norm of *tensor* measured before any scaling.
    """
    grad_norm = item(torch.norm(tensor))
    needs_clipping = max_norm > 0 and grad_norm > max_norm
    if needs_clipping:
        # The small epsilon keeps the scaled norm just below max_norm.
        tensor.mul_(max_norm / (grad_norm + 1e-6))
    return grad_norm
-
-
def fill_with_neg_inf(t):
    """FP16-compatible function that fills a tensor with -inf.

    The fill happens on the ``float()`` version of *t*, and the result is
    converted back to the type of *t*.
    """
    as_float = t.float()
    as_float.fill_(float('-inf'))
    return as_float.type_as(t)
-
-
def checkpoint_paths(path, pattern=r'checkpoint(\d+)\.pt'):
    """Retrieves all checkpoints found in `path` directory.

    Checkpoints are identified by matching filename to the specified pattern. If
    the pattern contains groups, the result will be sorted by the first group in
    descending order. Without groups, the position of the file in the directory
    listing is used as the sort key instead.

    Returns:
        A list of the matching file names, each joined with `path`.
    """
    matcher = re.compile(pattern)

    found = []
    for position, fname in enumerate(os.listdir(path)):
        hit = matcher.fullmatch(fname)
        if hit is None:
            continue
        if len(hit.groups()) > 0:
            sort_key = int(hit.group(1))
        else:
            sort_key = position
        found.append((sort_key, hit.group(0)))

    found.sort(reverse=True)
    return [os.path.join(path, fname) for _, fname in found]
-
--- a/PyTorch/NLP/Transformer/inference.py
+++ b/PyTorch/NLP/Transformer/inference.py
-#!/usr/bin/env python3 -u
-# Copyright (c) 2017-present, Facebook, Inc.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the LICENSE file in
-# the root directory of this source tree. An additional grant of patent rights
-# can be found in the PATENTS file in the same directory.
-#
-#-------------------------------------------------------------------------
-#
-# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import os
-import time
-from collections import namedtuple
-
-import numpy as np
-import torch
-from torch.serialization import default_restore_location
-
-from fairseq import data, options, tokenizer, utils, log_helper
-from fairseq.sequence_generator import SequenceGenerator
-from fairseq.meters import StopwatchMeter
-from fairseq.models.transformer import TransformerModel
-import dllogger
-
-from apply_bpe import BPE
-
-
-Batch = namedtuple('Batch', 'srcs tokens lengths')
-Translation = namedtuple('Translation', 'src_str hypos pos_scores alignments')
-
-
def load_ensemble_for_inference(filenames):
    """Load an ensemble of models for inference.

    Every checkpoint is loaded onto the CPU and a ``TransformerModel`` is
    built from the ``args`` stored in it.

    Returns:
        ``(ensemble, args, src_dict, tgt_dict)`` where *args* belongs to the
        last checkpoint and both dictionaries come from the ``extra_state`` of
        the first one.

    Raises:
        IOError: if one of *filenames* does not exist.
    """
    def _restore_on_cpu(storage, location):
        return default_restore_location(storage, 'cpu')

    # load model architectures and weights
    states = []
    for filename in filenames:
        if not os.path.exists(filename):
            raise IOError('Model file not found: {}'.format(filename))
        states.append(torch.load(filename, map_location=_restore_on_cpu))

    ensemble = []
    for state in states:
        args = state['args']

        # build model for ensemble
        model = TransformerModel.build_model(args)
        model.load_state_dict(state['model'], strict=True)
        ensemble.append(model)

    first_extra = states[0]['extra_state']
    return ensemble, args, first_extra['src_dict'], first_extra['tgt_dict']
-
-
def buffered_read(buffer_size, data_descriptor):
    """Yield the stripped lines of *data_descriptor* in lists of *buffer_size*.

    The last list may be shorter; nothing is yielded for empty input.
    """
    pending = []
    for raw_line in data_descriptor:
        pending.append(raw_line.strip())
        if len(pending) < buffer_size:
            continue
        yield pending
        pending = []

    if pending:
        yield pending
-
-
def make_batches(lines, args, src_dict, max_positions, bpe=None):
    """Tokenize *lines* and yield them as batches.

    Each line is tokenized with ``tokenizer.tokenize_en`` (and *bpe*, if
    given) without adding new words to *src_dict*. Batching is delegated to
    ``data.EpochBatchIterator`` with shuffling disabled.

    Yields:
        ``(Batch, ids)`` pairs, where ``Batch.srcs`` holds the original lines
        selected by ``batch['id']`` and *ids* is that same ``batch['id']``.
    """
    def _encode(src_str):
        return tokenizer.Tokenizer.tokenize(
            src_str,
            src_dict,
            tokenize=tokenizer.tokenize_en,
            add_if_not_exist=False,
            bpe=bpe
            ).long()

    tokens = [_encode(src_str) for src_str in lines]
    lengths = np.array([t.numel() for t in tokens])
    dataset = data.LanguagePairDataset(tokens, lengths, src_dict)
    itr = data.EpochBatchIterator(
        dataset=dataset,
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
    ).next_epoch_itr(shuffle=False)
    for batch in itr:
        net_input = batch['net_input']
        yield Batch(
            srcs=[lines[i] for i in batch['id']],
            tokens=net_input['src_tokens'],
            lengths=net_input['src_lengths'],
        ), batch['id']
-
-
def setup_logger(args):
    """Initialize ``dllogger`` according to *args*.

    With ``args.no_dllogger`` set, dllogger is initialized without backends.
    Otherwise a JSON stream backend writing to ``args.stat_file`` is used,
    every argument and the framework environment info are logged as
    ``PARAMETER`` steps, and metadata for the ``throughput`` metric is
    registered.
    """
    if args.no_dllogger:
        dllogger.init(backends=[])
        return

    dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=1, filename=args.stat_file)])
    for k, v in vars(args).items():
        dllogger.log(step='PARAMETER', data={k: v}, verbosity=0)
    container_setup_info = log_helper.get_framework_env_vars()
    dllogger.log(step='PARAMETER', data=container_setup_info, verbosity=0)
    # ':.3f' is a valid format spec (three decimals); the previous ':/3f' was not.
    dllogger.metadata('throughput',
                      {'unit': 'tokens/s', 'format': ':.3f', 'GOAL': 'MAXIMIZE', 'STAGE': 'INFER'})
-
-
def main(args):
    """Translate lines from ``args.file`` or stdin and report timing statistics.

    Loads the model ensemble named by ``args.path`` (paths separated by
    ``':'``), builds a ``SequenceGenerator``, reads the input in buffers of
    ``args.buffer_size`` lines, prints each hypothesis to stdout and the
    source, scores and alignments to stderr, then logs throughput and latency
    figures.
    """
    setup_logger(args)

    args.interactive = sys.stdin.isatty() and not args.file # Just make the code more understandable

    if args.file:
        data_descriptor = open(args.file, 'r')
    else:
        data_descriptor = sys.stdin

    if args.interactive:
        args.buffer_size = 1
    if args.max_tokens is None and args.max_sentences is None:
        args.max_sentences = 1
    if args.buffer_size > 50000:
        print("WARNING: To prevent memory exhaustion buffer size is set to 50000", file=sys.stderr)
        args.buffer_size = 50000

    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert not args.max_sentences or args.max_sentences <= args.buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    print(args, file=sys.stderr)

    use_cuda = torch.cuda.is_available() and not args.cpu

    processing_start = time.time()

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path), file=sys.stderr)
    model_paths = args.path.split(':')
    models, model_args, src_dict, tgt_dict = load_ensemble_for_inference(model_paths)
    if args.fp16:
        for model in models:
            model.half()

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(need_attn=args.print_alignment)

    # Initialize generator
    translator = SequenceGenerator(
        models,
        tgt_dict.get_metadata(),
        maxlen=args.max_target_positions,
        beam_size=args.beam,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen,
        unk_penalty=args.unkpen,
        sampling=args.sampling,
        sampling_topk=args.sampling_topk,
        minlen=args.min_len,
        sampling_temperature=args.sampling_temperature
    )

    if use_cuda:
        translator.cuda()

    # Load BPE codes file. bpe stays None when no codes file is given, so the
    # make_batches() call below always receives a defined value.
    bpe = None
    if args.bpe_codes:
        # NOTE(review): the codes file is never closed; whether BPE reads it
        # eagerly in its constructor is not visible here, so it is left open.
        codes = open(args.bpe_codes, 'r')
        bpe = BPE(codes)
    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    def make_result(src_str, hypos):
        result = Translation(
            src_str=src_str,
            hypos=[],
            pos_scores=[],
            alignments=[],
        )

        # Process top predictions
        for hypo in hypos[:min(len(hypos), args.nbest)]:
            hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                hypo_tokens=hypo['tokens'].int().cpu(),
                src_str=src_str,
                alignment=hypo['alignment'].int().cpu() if hypo['alignment'] is not None else None,
                align_dict=align_dict,
                tgt_dict=tgt_dict,
                remove_bpe=args.remove_bpe,
            )
            hypo_str = tokenizer.Tokenizer.detokenize(hypo_str, 'de').strip()
            result.hypos.append((hypo['score'], hypo_str))
            result.pos_scores.append('P\t' + ' '.join(f'{x:.4f}' for x in hypo['positional_scores'].tolist()))
            result.alignments.append('A\t' + ' '.join(str(utils.item(x)) for x in alignment)
                                     if args.print_alignment else None
                                    )

        return result

    gen_timer = StopwatchMeter()

    def process_batch(batch):
        tokens = batch.tokens
        lengths = batch.lengths

        if use_cuda:
            tokens = tokens.cuda()
            lengths = lengths.cuda()

        translation_start = time.time()
        gen_timer.start()
        translations = translator.generate(
            tokens,
            lengths,
            maxlen=int(args.max_len_a * tokens.size(1) + args.max_len_b),
        )
        gen_timer.stop(sum(len(h[0]['tokens']) for h in translations))
        dllogger.log(step='infer', data={'latency': time.time() - translation_start})

        return [make_result(batch.srcs[i], t) for i, t in enumerate(translations)]

    if args.interactive:
        print('| Type the input sentence and press return:')
    for inputs in buffered_read(args.buffer_size, data_descriptor):
        indices = []
        results = []
        for batch, batch_indices in make_batches(inputs, args, src_dict, args.max_positions, bpe):
            indices.extend(batch_indices)
            results += process_batch(batch)

        # Batches may come back in a different order; argsort restores the
        # original input order before printing.
        for i in np.argsort(indices):
            result = results[i]
            print(result.src_str, file=sys.stderr)
            for hypo, pos_scores, align in zip(result.hypos, result.pos_scores, result.alignments):
                print(f'Score {hypo[0]}', file=sys.stderr)
                print(hypo[1])
                print(pos_scores, file=sys.stderr)
                if align is not None:
                    print(align, file=sys.stderr)

    if args.file:
        data_descriptor.close()

    log_dict = {
                'throughput': 1./gen_timer.avg,
                'latency_avg': sum(gen_timer.intervals)/len(gen_timer.intervals),
                'latency_p90': gen_timer.p(90),
                'latency_p95': gen_timer.p(95),
                'latency_p99': gen_timer.p(99),
                'total_infernece_time': gen_timer.sum,
                'total_run_time': time.time() - processing_start,
                }
    print('Translation time: {} s'.format(log_dict['total_infernece_time']),
          file=sys.stderr)
    print('Model throughput (beam {}): {} tokens/s'.format(args.beam, log_dict['throughput']),
          file=sys.stderr)
    print('Latency:\n\tAverage {:.3f}s\n\tp90 {:.3f}s\n\tp95 {:.3f}s\n\tp99 {:.3f}s'.format(
          log_dict['latency_avg'], log_dict['latency_p90'], log_dict['latency_p95'], log_dict['latency_p99']),
          file=sys.stderr)
    print('End to end time: {} s'.format(log_dict['total_run_time']), file=sys.stderr)
    dllogger.log(step=(), data=log_dict)
-
-
# Script entry point: build the inference parser, register the extra
# --no-dllogger switch, then run main() with the parsed arguments.
if __name__ == '__main__':
    parser = options.get_inference_parser()
    parser.add_argument(
        '--no-dllogger',
        action='store_true',
    )
    ARGS = options.parse_args_and_arch(parser)
    main(ARGS)
--- a/PyTorch/NLP/Transformer/preprocess.py
+++ b/PyTorch/NLP/Transformer/preprocess.py
-#!/usr/bin/env python3
-# Copyright (c) 2017-present, Facebook, Inc.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the LICENSE file in
-# the root directory of this source tree. An additional grant of patent rights
-# can be found in the PATENTS file in the same directory.
-#
-
-import argparse
-from itertools import zip_longest
-import os
-import shutil
-
-from fairseq.data import indexed_dataset, dictionary
-from fairseq.tokenizer import Tokenizer, tokenize_line
-
-
-def get_parser():
-    """Create the argument parser of the pre-processing script.
-
-    Returns:
-        argparse.ArgumentParser: parser for the language pair, the input file
-        prefixes, the dictionary options and the output format.
-    """
-    cli = argparse.ArgumentParser(
-        description='Data pre-processing: Create dictionary and store data in binary format')
-    add = cli.add_argument
-
-    # language pair
-    add('-s', '--source-lang', metavar='SRC', default=None, help='source language')
-    add('-t', '--target-lang', metavar='TARGET', default=None, help='target language')
-
-    # input prefixes and output location
-    add('--trainpref', default=None, metavar='FP', help='train file prefix')
-    add('--validpref', default=None, metavar='FP', help='comma separated, valid file prefixes')
-    add('--testpref', default=None, metavar='FP', help='comma separated, test file prefixes')
-    add('--destdir', default='data-bin', metavar='DIR', help='destination dir')
-
-    # dictionary construction
-    add('--thresholdtgt', type=int, default=0, metavar='N',
-        help='map words appearing less than threshold times to unknown')
-    add('--thresholdsrc', type=int, default=0, metavar='N',
-        help='map words appearing less than threshold times to unknown')
-    add('--tgtdict', metavar='FP', help='reuse given target dictionary')
-    add('--srcdict', metavar='FP', help='reuse given source dictionary')
-    add('--nwordstgt', type=int, default=-1, metavar='N',
-        help='number of target words to retain')
-    add('--nwordssrc', type=int, default=-1, metavar='N',
-        help='number of source words to retain')
-
-    # optional alignment input and output behaviour
-    add('--alignfile', default=None, metavar='ALIGN', help='an alignment file (optional)')
-    add('--output-format', default='binary', metavar='FORMAT', choices=['binary', 'raw'],
-        help='output format (optional)')
-    add('--joined-dictionary', action='store_true', help='Generate joined dictionary')
-    add('--only-source', action='store_true', help='Only process the source language')
-    add('--padding-factor', type=int, default=8, metavar='N',
-        help='Pad dictionary size to be multiple of N')
-    return cli
-
-
-def main(args):
-    """Build dictionaries and write the pre-processed datasets into ``args.destdir``.
-
-    Steps, in order: create or load the source (and, unless ``--only-source``
-    is given, target) dictionary, finalize and save them, convert the
-    train/valid/test files to the requested output format, and, when
-    ``--alignfile`` is given, write a source-to-target alignment dictionary.
-
-    Args:
-        args: namespace produced by ``get_parser().parse_args()``.
-
-    Raises:
-        ValueError: if the alignment, source and target training files do not
-            have the same number of lines.
-    """
-    print(args)
-    os.makedirs(args.destdir, exist_ok=True)
-    target = not args.only_source
-
-    def build_dictionary(filenames):
-        # Accumulate the tokens of every listed file into one new dictionary.
-        d = dictionary.Dictionary()
-        for filename in filenames:
-            Tokenizer.add_file_to_dictionary(filename, d, tokenize_line)
-        return d
-
-    def train_path(lang):
-        return '{}{}'.format(args.trainpref, ('.' + lang) if lang else '')
-
-    def file_name(prefix, lang):
-        fname = prefix
-        if lang is not None:
-            fname += f'.{lang}'
-        return fname
-
-    def dest_path(prefix, lang):
-        return os.path.join(args.destdir, file_name(prefix, lang))
-
-    def dict_path(lang):
-        return dest_path('dict', lang) + '.txt'
-
-    def dataset_dest_path(output_prefix, lang, extension):
-        base = f'{args.destdir}/{output_prefix}'
-        lang_part = f'.{args.source_lang}-{args.target_lang}.{lang}' if lang is not None else ''
-        return f'{base}{lang_part}.{extension}'
-
-    if args.joined_dictionary:
-        assert not args.srcdict, 'cannot combine --srcdict and --joined-dictionary'
-        assert not args.tgtdict, 'cannot combine --tgtdict and --joined-dictionary'
-        # Same precondition as the per-language branches below: without it the
-        # training paths would be built from the string 'None'.
-        assert args.trainpref, "--trainpref must be set if --joined-dictionary is specified"
-        src_dict = build_dictionary({
-            train_path(lang)
-            for lang in [args.source_lang, args.target_lang]
-            })
-        tgt_dict = src_dict
-    else:
-        if args.srcdict:
-            src_dict = dictionary.Dictionary.load(args.srcdict)
-        else:
-            assert args.trainpref, "--trainpref must be set if --srcdict is not specified"
-            src_dict = build_dictionary([train_path(args.source_lang)])
-        if target:
-            if args.tgtdict:
-                tgt_dict = dictionary.Dictionary.load(args.tgtdict)
-            else:
-                assert args.trainpref, "--trainpref must be set if --tgtdict is not specified"
-                tgt_dict = build_dictionary([train_path(args.target_lang)])
-
-    src_dict.finalize(
-        threshold=args.thresholdsrc,
-        nwords=args.nwordssrc,
-        padding_factor=args.padding_factor,
-    )
-    src_dict.save(dict_path(args.source_lang))
-    if target:
-        # A joined dictionary is the same object as src_dict and was finalized above.
-        if not args.joined_dictionary:
-            tgt_dict.finalize(
-                threshold=args.thresholdtgt,
-                nwords=args.nwordstgt,
-                padding_factor=args.padding_factor,
-            )
-        tgt_dict.save(dict_path(args.target_lang))
-
-    def make_binary_dataset(input_prefix, output_prefix, lang):
-        _dict = dictionary.Dictionary.load(dict_path(lang))
-        print('| [{}] Dictionary: {} types'.format(lang, len(_dict) - 1))
-
-        ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_path(output_prefix, lang, 'bin'))
-
-        def consumer(tensor):
-            ds.add_item(tensor)
-
-        input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
-        res = Tokenizer.binarize(input_file, _dict, consumer)
-        # An input without tokens would otherwise divide by zero here.
-        unk_percent = 100 * res['nunk'] / res['ntok'] if res['ntok'] else 0.0
-        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
-            lang, input_file, res['nseq'], res['ntok'],
-            unk_percent, _dict.unk_word))
-        ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))
-
-    def make_dataset(input_prefix, output_prefix, lang):
-        if args.output_format == 'binary':
-            make_binary_dataset(input_prefix, output_prefix, lang)
-        elif args.output_format == 'raw':
-            # Copy original text file to destination folder
-            output_text_file = dest_path(
-                output_prefix + '.{}-{}'.format(args.source_lang, args.target_lang),
-                lang,
-            )
-            shutil.copyfile(file_name(input_prefix, lang), output_text_file)
-
-    def make_all(lang):
-        if args.trainpref:
-            make_dataset(args.trainpref, 'train', lang)
-        if args.validpref:
-            # The first prefix is written as 'valid', later ones as 'valid1', 'valid2', ...
-            for k, validpref in enumerate(args.validpref.split(',')):
-                outprefix = 'valid{}'.format(k) if k > 0 else 'valid'
-                make_dataset(validpref, outprefix, lang)
-        if args.testpref:
-            for k, testpref in enumerate(args.testpref.split(',')):
-                outprefix = 'test{}'.format(k) if k > 0 else 'test'
-                make_dataset(testpref, outprefix, lang)
-
-    make_all(args.source_lang)
-    if target:
-        make_all(args.target_lang)
-
-    print('| Wrote preprocessed data to {}'.format(args.destdir))
-
-    if args.alignfile:
-        assert args.trainpref, "--trainpref must be set if --alignfile is specified"
-        src_file_name = train_path(args.source_lang)
-        tgt_file_name = train_path(args.target_lang)
-        src_dict = dictionary.Dictionary.load(dict_path(args.source_lang))
-        tgt_dict = dictionary.Dictionary.load(dict_path(args.target_lang))
-        # freq_map[source index][target index] -> number of aligned occurrences
-        freq_map = {}
-        with open(args.alignfile, 'r') as align_file, \
-                open(src_file_name, 'r') as src_file, \
-                open(tgt_file_name, 'r') as tgt_file:
-            for a, s, t in zip_longest(align_file, src_file, tgt_file):
-                # zip_longest pads the shorter inputs with None; report the
-                # mismatch instead of failing later on a None line.
-                if a is None or s is None or t is None:
-                    raise ValueError(
-                        'alignment, source and target files must have the same number of lines')
-                si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False)
-                ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False)
-                ai = [tuple(pair.split('-')) for pair in a.split()]
-                for sai, tai in ai:
-                    srcidx = si[int(sai)]
-                    tgtidx = ti[int(tai)]
-                    if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
-                        assert srcidx != src_dict.pad()
-                        assert srcidx != src_dict.eos()
-                        assert tgtidx != tgt_dict.pad()
-                        assert tgtidx != tgt_dict.eos()
-
-                        counts = freq_map.setdefault(srcidx, {})
-                        counts[tgtidx] = counts.get(tgtidx, 0) + 1
-
-        # Keep, for every source index, the target index it was aligned to most often.
-        align_dict = {}
-        for srcidx in freq_map:
-            align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get)
-
-        with open(os.path.join(args.destdir, 'alignment.{}-{}.txt'.format(
-                args.source_lang, args.target_lang)), 'w') as f:
-            for k, v in align_dict.items():
-                print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
-
-
-# Script entry point: parse the command line with get_parser() and run main().
-if __name__ == '__main__':
-    parser = get_parser()
-    ARGS = parser.parse_args()
-    main(ARGS)
--- a/PyTorch/NLP/Transformer/requirements.txt
+++ b/PyTorch/NLP/Transformer/requirements.txt
-cffi
-numpy
-torch
-tqdm
-tensorboardX
--- a/PyTorch/NLP/Transformer/scripts/deployer.py
+++ b/PyTorch/NLP/Transformer/scripts/deployer.py
-#!/usr/bin/python
-
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, 
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and
-# limitations under the License. 
-
-
-import sys
-import torch
-import argparse
-import deployer_lib
-# 
-import torch
-from fairseq import data
-from fairseq.data import load_dataset_splits, data_utils
-from fairseq.models.transformer import TransformerModel
-from copy import deepcopy
-
-def get_model_and_args(model_args):
-    ''' parse the model-specific command line, load the checkpoint and build the model
-        :: model_args :: list of argument strings forwarded to this script
-        returns the built model and the argument namespace stored in the
-        checkpoint, updated with the values given on the command line
-    '''
-    cli = argparse.ArgumentParser()
-    ## Required parameters by the model.
-    cli.add_argument("--checkpoint", type=str, default=None, required=True,
-                     help="The checkpoint of the model. ")
-    cli.add_argument('--batch-size', type=int, default=10240,
-                     help='Batch size for inference')
-    cli.add_argument('--num-batches', type=int, default=2,
-                     help='Number of batches to check accuracy on')
-    cli.add_argument("--data", type=str, default=None, required=True,
-                     help="Path to the dataset")
-    cli.add_argument('--part', type=str, default='model',
-                     choices=['encoder', 'decoder', 'model'],
-                     help='Choose the part of the model to export')
-    overrides = cli.parse_args(model_args)
-
-    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
-    checkpoint = torch.load(overrides.checkpoint, map_location='cpu')
-
-    # Start from the arguments saved with the checkpoint and overwrite the
-    # fields that are controlled from the command line.
-    restored_args = checkpoint['args']
-    restored_args.data = overrides.data
-    restored_args.num_batches = overrides.num_batches
-    restored_args.max_tokens = overrides.batch_size
-    restored_args.fuse_layer_norm = False
-    restored_args.part = overrides.part
-
-    net = TransformerModel.build_model(restored_args)
-    net.load_state_dict(checkpoint['model'], strict=True)
-    net.make_generation_fast_(need_attn=False)
-
-    return net, restored_args
-
-def get_dataloader(args, encoder=None):
-    ''' build a generator of model inputs taken from the 'valid' split
-        :: args :: namespace with part, max_tokens, max_positions, num_batches
-                   and whatever the dictionary / dataset loaders read
-        :: encoder :: encoder module, required when args.part is 'decoder'
-        returns a generator yielding one tuple of input tensors per batch
-    '''
-    decoder_without_encoder = args.part == 'decoder' and encoder is None
-    assert not decoder_without_encoder, "Cannot export decoder without providing encoder"
-
-    src_dict, tgt_dict = data_utils.load_dictionaries(args)
-    splits = load_dataset_splits(args, ['valid'], src_dict, tgt_dict)
-    epoch_itr = data.EpochBatchIterator(
-        dataset=splits['valid'],
-        max_tokens=args.max_tokens,
-        max_positions=args.max_positions,
-    )
-    batches = epoch_itr.next_epoch_itr(shuffle=False)
-
-    def input_itr():
-        for batch in batches:
-            # stop once the iterator's counter exceeds the requested number of batches
-            if batches.count > args.num_batches:
-                break
-            net_input = batch['net_input']
-            if args.part == 'encoder':
-                yield net_input['src_tokens'], net_input['src_lengths']
-                continue
-            if args.part != 'decoder':
-                yield (net_input['src_tokens'], net_input['src_lengths'],
-                       net_input['prev_output_tokens'])
-                continue
-            # decoder export: the encoder output is computed on the GPU
-            with torch.no_grad():
-                encoder_out = encoder(net_input['src_tokens'].cuda(),
-                                      net_input['src_lengths'].cuda())
-            yield net_input['prev_output_tokens'], encoder_out[0], encoder_out[1]
-
-    return input_itr()
-
-
-# Script entry point: split the command line between the deployer library and
-# this script, build the model, select the part to export and deploy it.
-if __name__=='__main__':
-    # don't touch this! 
-    deployer, model_argv = deployer_lib.create_deployer(sys.argv[1:]) # deployer and returns removed deployer arguments
-    
-    model, model_args = get_model_and_args(model_argv)
-
-    # The decoder export needs an encoder to produce its inputs (see get_dataloader).
-    if model_args.part == 'decoder':
-        encoder = model.encoder
-        # NOTE(review): presumably the copy separates the encoder embedding from
-        # one shared with the decoder before the encoder is moved to the GPU - confirm.
-        encoder.embed_tokens = deepcopy(encoder.embed_tokens)
-        encoder.cuda()
-    else:
-        encoder = None
-    
-    dataloader = get_dataloader(model_args, encoder=encoder)
-
-    # Narrow the model down to the requested part; 'model' exports it whole.
-    if model_args.part == 'encoder':
-        model = model.encoder
-    elif model_args.part == 'decoder':
-        model = model.decoder
-    
-    deployer.deploy(dataloader, model)
-
--- a/PyTorch/NLP/Transformer/scripts/deployer_lib.py
+++ b/PyTorch/NLP/Transformer/scripts/deployer_lib.py
-#!/usr/bin/python
-
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, 
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and
-# limitations under the License. 
-
-
-import os
-import sys
-import time
-import json
-import torch
-import argparse
-import statistics
-from collections import Counter
-
-
-# Maps a torch dtype to the data_type name written into the generated model
-# config (looked up in DeployerLibrary.write_config).
-torch_type_to_triton_type = {
-    torch.bool:     'TYPE_BOOL', 
-    torch.int8:     'TYPE_INT8', 
-    torch.int16:    'TYPE_INT16', 
-    torch.int32:    'TYPE_INT32', 
-    torch.int64:    'TYPE_INT64', 
-    torch.uint8:    'TYPE_UINT8', 
-    torch.float16:  'TYPE_FP16', 
-    torch.float32:  'TYPE_FP32', 
-    torch.float64:  'TYPE_FP64'
-}
-
-
-# Templates for the generated model config. They are filled with
-# str.format_map in DeployerLibrary.write_config; single braces are
-# placeholders, doubled braces render as literal braces.
-
-# Top-level config: model name, platform, batch size, input/output specs,
-# optional batching and optimization sections, and the instance group.
-CONFIG_TEMPLATE = r"""
-name: "{model_name}"
-platform: "{platform}"
-max_batch_size: {max_batch_size}
-input [
-    {spec_inputs}
-]
-output [
-    {spec_outputs}
-]
-{dynamic_batching}
-{model_optimizations}
-instance_group [
-    {{
-        count: {engine_count}
-        kind: KIND_GPU
-        gpus: [ {gpu_list} ]
-    }}
-]"""
-
-
-# One entry per model input; the trailing comma of the last entry is removed
-# by write_config.
-INPUT_TEMPLATE = r"""
-{{
-    name: "input__{num}"
-    data_type: {type}
-    dims: {dims}
-    {reshape}
-}},"""
-
-
-# One entry per model output; same placeholders as INPUT_TEMPLATE.
-OUTPUT_TEMPLATE = r""" 
-{{
-    name: "output__{num}"
-    data_type: {type}
-    dims: {dims}
-    {reshape}
-}},"""
-
-
-# Optimization section: an optional execution accelerator block followed by
-# the cuda graphs setting.
-MODEL_OPTIMIZATION_TEMPLATE = r"""
-optimization {{
-  {execution_accelerator}
-  cuda {{
-    graphs: {capture_cuda_graph}
-  }}
-}}"""
-
-
-# Accelerator block without placeholders; write_config inserts it only when
-# the platform is 'onnxruntime_onnx'.
-EXECUTION_ACCELERATOR_TEMPLATE = r"""
-  execution_accelerators {{
-    gpu_execution_accelerator: [
-      {{
-        name: "tensorrt"
-      }}
-    ]
-  }},"""
-
-
-def remove_empty_lines(text):
-    ''' strips text, then drops every line that is empty or whitespace-only;
-        the remaining lines keep their line endings '''
-    kept = []
-    for line in text.strip().splitlines(True):
-        if line.strip():
-            kept.append(line)
-    return "".join(kept)
-
-
-def create_deployer(argv):
-    ''' parses the deployer command line
-        :: argv :: list of argument strings (without the program name)
-        returns a Deployer built from the parsed arguments and the remainder
-        arguments with their first element dropped
-    '''
-    parser = argparse.ArgumentParser()
-
-    # required args: exactly one export method has to be selected
-    export_mode = parser.add_mutually_exclusive_group(required=True)
-    export_mode.add_argument('--ts-script', action='store_true',
-                             help='convert to torchscript using torch.jit.script')
-    export_mode.add_argument('--ts-trace', action='store_true',
-                             help='convert to torchscript using torch.jit.trace')
-    export_mode.add_argument('--onnx', action='store_true',
-                             help='convert to onnx using torch.onnx.export')
-    export_mode.add_argument('--trt', action='store_true',
-                             help='convert to trt using tensorrt')
-
-    # triton related args
-    triton_flags = parser.add_argument_group('triton related flags')
-    triton_flags.add_argument('--triton-no-cuda', action='store_true',
-                              help='Use the CPU for tracing.')
-    triton_flags.add_argument('--triton-model-name', type=str, default="model",
-                              help="exports to appropriate directory structure for TRTIS")
-    triton_flags.add_argument("--triton-model-version", type=int, default=1,
-                              help="exports to appropriate directory structure for TRTIS")
-    triton_flags.add_argument("--triton-server-url", type=str, default="localhost:8001",
-                              help="exports to appropriate directory structure for TRTIS")
-    # The backslash-continued help strings below contain the indentation of
-    # their continuation lines, so those lines are kept exactly as they were.
-    triton_flags.add_argument("--triton-max-batch-size", type=int, default=8,
-                            help="Specifies the 'max_batch_size' in the TRTIS model config.\
-                                  See the TRTIS documentation for more info.")
-    triton_flags.add_argument("--triton-dyn-batching-delay", type=float, default=0,
-                            help="Determines the dynamic_batching queue delay in milliseconds(ms) for\
-                                  the TRTIS model config. Use '0' or '-1' to specify static batching.\
-                                  See the TRTIS documentation for more info.")
-    triton_flags.add_argument("--triton-engine-count", type=int, default=1,
-                            help="Specifies the 'instance_group' count value in the TRTIS model config.\
-                                  See the TRTIS documentation for more info.")
-    triton_flags.add_argument('--save-dir', type=str, default='./triton_models',
-                              help='Saved model directory')
-
-    # optimization args
-    opt_flags = parser.add_argument_group('optimization flags')
-    opt_flags.add_argument("--max_workspace_size", type=int, default=512*1024*1024,
-                           help="set the size of the workspace for trt export")
-    opt_flags.add_argument("--trt-fp16", action='store_true',
-                           help="trt flag ---- export model in mixed precision mode")
-    opt_flags.add_argument("--capture-cuda-graph", type=int, default=1,
-                           help="capture cuda graph for obtaining speedup. possible values: 0, 1. default: 1. ")
-    opt_flags.add_argument('--quantize', action='store_true',
-                           help='apply quantization for supported nodes')
-    opt_flags.add_argument('--calibrate', action='store_true',
-                           help='apply calibration for supported nodes')
-
-    # remainder args
-    opt_flags.add_argument('model_arguments', nargs=argparse.REMAINDER, help='arguments that will be ignored by deployer lib and will be forwarded to your deployer script')
-
-    args = parser.parse_args(argv)
-    # NOTE(review): the first remainder element is dropped unconditionally;
-    # presumably it is the '--' separator - confirm.
-    return Deployer(args), args.model_arguments[1:]
-
-
-class DeployerLibrary:
-    def __init__(self, args):
-        ''' stores the parsed deployer arguments; the platform stays None
-            until set_platform is called '''
-        self.args = args
-        self.platform = None
-    
-    def set_platform(self, platform):
-        ''' sets the platform written into the model config
-            :: platform :: "pytorch_libtorch" or "onnxruntime_onnx" or "tensorrt_plan"
-            write_config asserts that a platform has been set
-        '''
-        self.platform = platform
-    
-    def build_trt_engine(self, model_file, shapes):
-        ''' takes a path to an onnx file, and shape information, returns a trt engine
-            :: model_file :: path to an onnx model
-            :: shapes :: iterable of dictionaries with the keys 'name', 'min',
-                         'opt' and 'max', one per entry of the optimization profile
-            returns whatever builder.build_engine returns for the parsed network
-        '''
-        # imported here so the module can be loaded without tensorrt installed
-        import tensorrt as trt
-        TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
-        builder = trt.Builder(TRT_LOGGER)
-        builder.fp16_mode = self.args.trt_fp16
-        builder.max_batch_size = self.args.triton_max_batch_size
-        # builder config: workspace size, optional FP16 flag and the shape profile
-        config = builder.create_builder_config()
-        config.max_workspace_size = self.args.max_workspace_size
-        if self.args.trt_fp16:
-            config.flags |= 1 << int(trt.BuilderFlag.FP16)
-        profile = builder.create_optimization_profile()
-        for s in shapes:
-            profile.set_shape(s['name'], min=s['min'], opt=s['opt'], max=s['max'])
-        config.add_optimization_profile(profile)
-        explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
-        network = builder.create_network(explicit_batch)
-        # parse the onnx file into the network, then build the engine
-        with trt.OnnxParser(network, TRT_LOGGER) as parser:
-            with open(model_file, 'rb') as model:
-                # NOTE(review): the result of parser.parse is not checked; parser
-                # errors are only printed and the build is attempted regardless.
-                parser.parse(model.read())
-                for i in range(parser.num_errors):
-                    e = parser.get_error(i)
-                    print("||||e", e)
-                engine = builder.build_engine(network, config=config)
-        return engine
-    
-    def load_engine(self, engine_filepath):
-        ''' deserializes the trt engine stored at engine_filepath and returns it '''
-        import tensorrt as trt
-        logger = trt.Logger(trt.Logger.WARNING)
-        with open(engine_filepath, "rb") as engine_file:
-            with trt.Runtime(logger) as runtime:
-                serialized = engine_file.read()
-                return runtime.deserialize_cuda_engine(serialized)
-    
-    def prepare_inputs(self, dataloader, device):
-        ''' moves every batch of the dataloader to device
-            :: dataloader :: iterable of batches (tensors, or dicts / lists /
-                             tuples that contain tensors)
-            :: device :: target passed to Tensor.to
-            returns the list of moved batches; lists and tuples come back as
-            lists, a non-iterable batch is wrapped in a 1-tuple
-        '''
-        def _to_device(obj):
-            if torch.is_tensor(obj):
-                return obj.to(device)
-            if isinstance(obj, dict):
-                return {name: _to_device(item) for name, item in obj.items()}
-            if isinstance(obj, (list, tuple)):
-                return [_to_device(item) for item in obj]
-            return obj
-
-        prepared = []
-        for batch in dataloader:
-            moved = _to_device(batch)
-            prepared.append(moved if hasattr(moved, '__iter__') else (moved,))
-        return prepared
-    
-    def get_list_of_shapes(self, l, fun):
-        ''' folds the shapes of all tensor tuples dimension by dimension
-            :: l :: list of tuples of tensors
-            :: fun :: min or max
-            returns one shape (list of ints) per tuple position
-        '''
-        folded = [list(tensor.shape) for tensor in l[0]]
-        for tensors in l:
-            assert len(tensors) == len(folded), "tensors with varying shape lengths are not supported"
-            for pos, tensor in enumerate(tensors):
-                for axis, size in enumerate(tensor.shape):
-                    folded[pos][axis] = fun(folded[pos][axis], size)
-        return folded # a list of shapes
-    
-    def get_tuple_of_min_shapes(self, l):
-        ''' returns the tuple of min shapes, with the first dimension of
-            every shape replaced by 1
-            :: l :: list of tuples of tensors '''
-        min_batch = 1
-        smallest = self.get_list_of_shapes(l, min)
-        return tuple([min_batch, *dims[1:]] for dims in smallest) # tuple of min shapes
-    
-    def get_tuple_of_max_shapes(self, l):
-        ''' returns the tuple of max shapes; the first dimension of every
-            shape is set to the largest first dimension of the first tensor,
-            but at least 2
-            :: l :: list of tuples of tensors '''
-        largest = self.get_list_of_shapes(l, max)
-        max_batch = max(2, largest[0][0])
-        return tuple([max_batch, *dims[1:]] for dims in largest) # tuple of max shapes
-    
-    def get_tuple_of_opt_shapes(self, l):
-        ''' returns the combination of shapes that occurs most often
-            :: l :: list of tuples of tensors '''
-        occurrences = Counter(
-            tuple(tuple(tensor.shape) for tensor in tensors)
-            for tensors in l
-        )
-        (most_common_shapes, _count), = occurrences.most_common(1)
-        return most_common_shapes # tuple of most commonly occurring shapes
-    
-    def get_tuple_of_dynamic_shapes(self, l):
-        ''' returns a tuple of dynamic shapes: the first dimension and every
-            dimension whose size varies between the tuples is reported as -1
-            :: l :: list of tuples of tensors '''
-        err_msg = "tensors with varying shape lengths are not supported"
-        dynamic = [list(tensor.shape) for tensor in l[0]]
-        for tensors in l:
-            assert len(tensors) == len(dynamic), err_msg
-            for pos, tensor in enumerate(tensors):
-                for axis, size in enumerate(tensor.shape):
-                    if dynamic[pos][axis] != size or axis == 0:
-                        dynamic[pos][axis] = -1
-        return tuple(dynamic) # tuple of dynamic shapes
-    
-    def run_models(self, models, inputs):
-        ''' run the models on inputs, return the outputs and execution times
-            :: models :: iterable of callables, each invoked as model(*input)
-            :: inputs :: list of argument tuples / lists
-            returns a flat list [outputs_0, time_0, outputs_1, time_1, ...];
-            outputs_k holds one entry per input: a tensor result is wrapped
-            in a list, a dict result becomes the list of its values ordered
-            by key, anything else is stored unchanged
-        '''
-        def _synchronize():
-            # only synchronize when CUDA is usable so CPU-only runs do not fail
-            if torch.cuda.is_available():
-                torch.cuda.synchronize()
-
-        ret = []
-        for model in models:
-            _synchronize()
-            time_start = time.time()
-            outputs = []
-            for model_input in inputs:
-                with torch.no_grad():
-                    output = model(*model_input)
-                if isinstance(output, torch.Tensor):
-                    output = [output]
-                elif isinstance(output, dict):
-                    # keep the values (not the keys), in a key-sorted order
-                    output = [value for _, value in
-                              sorted(output.items(), key=lambda item: item[0])]
-                outputs.append(output)
-            _synchronize()
-            elapsed = time.time() - time_start
-            ret.append(outputs)
-            ret.append(elapsed)
-        return ret
-
-    def compute_tensor_stats(self, tensor):
-        ''' returns a dictionary with the std, mean, max and min of tensor;
-            all four values are 0 for an empty tensor '''
-        if not tensor.numel():
-            return {'std': 0, 'mean': 0, 'max': 0, 'min': 0}
-        stats = {}
-        stats['std'] = tensor.std().item()
-        stats['mean'] = tensor.mean().item()
-        stats['max'] = tensor.max().item()
-        stats['min'] = tensor.min().item()
-        return stats
-
-    def compute_errors(self, outputs_A, outputs_B):
-        ''' returns a list with one dictionary of error statistics per model output
-            :: outputs_A :: outputs of the original model, one list of tensors per batch
-            :: outputs_B :: outputs of the converted model, same layout
-            every dictionary maps 'Original', 'Converted' and 'Absolute difference'
-            to the statistics computed by compute_tensor_stats over all batches;
-            an empty outputs_A yields an empty list
-        '''
-        if not outputs_A:
-            return []
-        num_outputs = len(outputs_A[0])
-        # flattened float chunks per output index, concatenated once at the end
-        x_chunks = [[] for _ in range(num_outputs)]
-        y_chunks = [[] for _ in range(num_outputs)]
-        d_chunks = [[] for _ in range(num_outputs)]
-        for output_A, output_B in zip(outputs_A, outputs_B):
-            for i, (x, y) in enumerate(zip(output_A, output_B)):
-                x = x.reshape(-1).float()
-                y = y.reshape(-1).float()
-                x_chunks[i].append(x)
-                y_chunks[i].append(y)
-                d_chunks[i].append((x - y).abs())
-
-        def _merge(chunks):
-            return torch.cat(chunks, 0) if chunks else torch.zeros(0)
-
-        # each output gets its own difference tensor (previously the difference
-        # of the last processed tensor was reported for every output)
-        error_stats = [{'Original': self.compute_tensor_stats(_merge(xs)),
-                        'Converted': self.compute_tensor_stats(_merge(ys)),
-                        'Absolute difference': self.compute_tensor_stats(_merge(ds)),
-                        } for xs, ys, ds in zip(x_chunks, y_chunks, d_chunks)]
-        return error_stats
-    
-    def print_errors(self, Error_stats):
-        ''' prints the error statistics of every output as a table
-            :: Error_stats :: list of dictionaries as returned by compute_errors '''
-        print("",
-              "conversion correctness test results",
-              "-----------------------------------",
-              sep="\n")
-        # imported after the header is printed, as before
-        import pandas as pd
-        for i, e in enumerate(Error_stats):
-            table = pd.DataFrame(e)
-            print(f'Output {i}:')
-            print(table)
-    
-    def write_config(self, config_filename,
-                     input_shapes, input_types,
-                     output_shapes, output_types):
-        ''' writes TRTIS config file
-            :: config_filename :: the file to write the config file into
-            :: input_shapes :: tuple of dynamic shapes of the input tensors
-            :: input_types :: tuple of torch types of the input tensors
-            :: output_shapes :: tuple of dynamic shapes of the output tensors
-            :: output_types :: tuple of torch types of the output tensors
-            raises AssertionError when no platform has been set and KeyError
-            for a torch type without an entry in torch_type_to_triton_type
-        '''
-        assert self.platform is not None, "error - platform is not set"
-
-        def _render_specs(template, shapes, types):
-            ''' renders one template entry per tensor and drops the trailing comma '''
-            rendered = ""
-            for i, (shape, typ) in enumerate(zip(shapes, types)):
-                batch_only = len(shape) == 1
-                rendered += template.format_map({
-                    'num': str(i),
-                    'type': torch_type_to_triton_type[typ],
-                    # first dimension is the batch size
-                    'dims': str([1]) if batch_only else str(list(shape)[1:]),
-                    'reshape': 'reshape: { shape: [ ] }' if batch_only else '',
-                })
-            return rendered[:-1]
-
-        spec_inputs = _render_specs(INPUT_TEMPLATE, input_shapes, input_types)
-        spec_outputs = _render_specs(OUTPUT_TEMPLATE, output_shapes, output_types)
-
-        batching_str = ""
-        max_batch_size = self.args.triton_max_batch_size
-
-        if self.args.triton_dyn_batching_delay > 0:
-            # Use only full and half full batches; the half batch is at least 1
-            # and is not repeated when it equals the full batch (max_batch_size
-            # of 1 used to produce a preferred size of 0).
-            pref_batch_size = sorted({max(1, max_batch_size // 2), max_batch_size})
-
-            # the delay argument is given in milliseconds, the config expects microseconds
-            batching_str = r"""
-dynamic_batching {{
-    preferred_batch_size: [{0}]
-    max_queue_delay_microseconds: {1}
-}}""".format(", ".join([str(x) for x in pref_batch_size]),
-                        int(self.args.triton_dyn_batching_delay * 1000.0))
-
-        accelerator_str = ""
-        if self.platform == 'onnxruntime_onnx':
-            accelerator_str = EXECUTION_ACCELERATOR_TEMPLATE.format_map({})
-
-        optimization_str = MODEL_OPTIMIZATION_TEMPLATE.format_map({
-            "execution_accelerator":  accelerator_str,
-            "capture_cuda_graph":     str(self.args.capture_cuda_graph)
-        })
-
-        config_values = {
-            "model_name":           self.args.triton_model_name,
-            "platform":             self.platform,
-            "max_batch_size":       max_batch_size,
-            "spec_inputs":          spec_inputs,
-            "spec_outputs":         spec_outputs,
-            "dynamic_batching":     batching_str,
-            "model_optimizations" : optimization_str,
-            "gpu_list":         ", ".join([str(x) for x in range(torch.cuda.device_count())]),
-            "engine_count":     self.args.triton_engine_count
-        }
-
-        # render first so that a formatting error does not leave a truncated file
-        final_config_str = remove_empty_lines(CONFIG_TEMPLATE.format_map(config_values))
-
-        # write config
-        with open(config_filename, "w") as config_file:
-            config_file.write(final_config_str)
-
-
-class Deployer:
-    def __init__(self, args):
-        self.args = args
-        self.lib = DeployerLibrary(args)
-    
-    def deploy(self, dataloader, model):
-        ''' deploy the model and test for correctness with dataloader '''
-        if self.args.ts_script or self.args.ts_trace:
-            self.lib.set_platform("pytorch_libtorch")
-            print("deploying model " + self.args.triton_model_name + " in format " + self.lib.platform)
-            self.to_triton_torchscript(dataloader, model)
-        elif self.args.onnx:
-            self.lib.set_platform("onnxruntime_onnx")
-            print("deploying model " + self.args.triton_model_name + " in format " + self.lib.platform)
-            self.to_triton_onnx(dataloader, model)
-        elif self.args.trt:
-            self.lib.set_platform("tensorrt_plan")
-            print("deploying model " + self.args.triton_model_name + " in format " + self.lib.platform)
-            self.to_triton_trt(dataloader, model)
-        else:
-            assert False, "error"
-        print("done")
-    
-    def to_triton_trt(self, dataloader, model):
-        ''' export the model to trt and test correctness on dataloader '''
-        import tensorrt as trt
-        # setup device
-        if self.args.triton_no_cuda:
-            device = torch.device('cpu')
-        else:
-            device = torch.device('cuda')
-
-        assert not self.args.quantize, 'quantize flag not supported by trt'
-        assert not self.args.calibrate, 'calibrate flag not supported by trt'
-
-        # prepare model 
-        model.to(device)
-        model.eval()
-        assert not model.training, "internal error - model should be in eval() mode! "
-        
-        # prepare inputs
-        inputs = self.lib.prepare_inputs(dataloader, device)
-        
-        # generate outputs
-        outputs = []
-        for input in inputs:
-            with torch.no_grad():
-                output = model(*input)
-            if type(output) is torch.Tensor:
-                output = [output]
-            outputs.append(output)
-        
-        # generate input shapes - dynamic tensor shape support 
-        input_shapes = self.lib.get_tuple_of_dynamic_shapes(inputs)
-        
-        # generate output shapes - dynamic tensor shape support 
-        output_shapes = self.lib.get_tuple_of_dynamic_shapes(outputs)
-        
-        # generate input types 
-        input_types = [x.dtype for x in inputs[0]]
-        
-        # generate output types
-        output_types = [x.dtype for x in outputs[0]]
-        
-        # get input names
-        rng = range(len(input_types))
-        input_names = ["input__" + str(num) for num in rng]
-        
-        # get output names
-        rng = range(len(output_types))
-        output_names = ["output__" + str(num) for num in rng]
-        
-        # prepare save path
-        model_folder = os.path.join(self.args.save_dir, self.args.triton_model_name)
-        version_folder = os.path.join(model_folder, str(self.args.triton_model_version))
-        if not os.path.exists(version_folder):
-            os.makedirs(version_folder)
-        final_model_path = os.path.join(version_folder, 'model.plan')
-        
-        # get indices of dynamic input and output shapes
-        dynamic_axes = {}
-        for input_name,shape in zip(input_names,input_shapes):
-            dynamic_axes[input_name] = [i for i,x in enumerate(shape) if x == -1]
-        for output_name,shape in zip(output_names,output_shapes):
-            dynamic_axes[output_name] = [i for i,x in enumerate(shape) if x == -1]
-        
-        # export the model to onnx first
-        with torch.no_grad():
-            torch.onnx.export(model, inputs[0], final_model_path, verbose=False, 
-                              input_names=input_names, output_names=output_names, 
-                              dynamic_axes=dynamic_axes, opset_version=11)
-      
-        # get shapes
-        min_shapes = self.lib.get_tuple_of_min_shapes(inputs)
-        opt_shapes = self.lib.get_tuple_of_opt_shapes(inputs)
-        max_shapes = self.lib.get_tuple_of_max_shapes(inputs)
-        
-        zipped = zip(input_names, min_shapes, opt_shapes, max_shapes)
-        shapes = []
-        for name,min_shape,opt_shape,max_shape in zipped:
-            d = {
-                 "name":name, 
-                 "min": min_shape, 
-                 "opt": opt_shape, 
-                 "max": max_shape
-                }
-            shapes.append(d)
-        
-        # build trt engine
-        engine = self.lib.build_trt_engine(final_model_path, shapes)
-        assert engine is not None, " trt export failure "
-        
-        # write trt engine
-        with open(final_model_path, 'wb') as f:
-            f.write(engine.serialize())
-        
-        # load the model
-        engine = self.lib.load_engine(final_model_path)
-        
-        class TRT_model:
-            def __init__(self, engine, input_names, output_names, output_types, device):
-                self.engine = engine
-                self.context = self.engine.create_execution_context()
-                self.input_names = input_names
-                self.output_names = output_names
-                self.output_types = output_types
-                self.device = device
-            
-            def is_dimension_dynamic(self, dim):
-                return dim is None or dim <= 0
-            
-            def is_shape_dynamic(self, shape):
-                return any([self.is_dimension_dynamic(dim) for dim in shape])
-            
-            def __call__(self, *inputs):
-                # get input shapes
-                input_shapes = [x.shape for x in inputs]
-                # bindings
-                bindings = [None] * self.engine.num_bindings
-                # set input shapes, bind input tensors
-                zipped = zip(self.input_names, inputs)
-                for key,input in zipped:
-                    idx = self.engine.get_binding_index(key)
-                    bindings[idx] = input.data_ptr()
-                    if self.engine.is_shape_binding(idx) and self.is_shape_dynamic(self.context.get_shape(idx)):
-                        self.context.set_shape_input(idx, input)
-                    elif self.is_shape_dynamic(self.engine.get_binding_shape(idx)):
-                        self.context.set_binding_shape(idx, input.shape)
-                assert self.context.all_binding_shapes_specified, "trt error"
-                assert self.context.all_shape_inputs_specified, "trt error"
-                # calculate output shapes, allocate output tensors and bind them
-                outputs = []
-                zipped = zip(self.output_names, self.output_types)
-                for key,dtype in zipped:
-                    idx = self.engine.get_binding_index(key)
-                    shape = self.context.get_binding_shape(idx)
-                    shape = tuple(shape)
-                    assert -1 not in shape, "trt error"
-                    tensor = torch.zeros(shape, dtype=dtype, device=self.device)
-                    outputs.append(tensor)
-                    bindings[idx] = outputs[-1].data_ptr()
-                # run inference
-                self.context.execute_v2(bindings=bindings)
-                # return the result
-                if len(outputs) == 1:
-                    outputs = outputs[0]
-                return outputs
-        
-        model_trt = TRT_model(engine, input_names, output_names, output_types, device)
-        
-        # run both models on inputs
-        assert not model.training, "internal error - model should be in eval() mode! "
-        models = (model, model_trt)
-        outputs, time_model, outputs_trt, time_model_trt = self.lib.run_models(models, inputs)
-        
-        # check for errors
-        Error_stats = self.lib.compute_errors(outputs, outputs_trt)
-        self.lib.print_errors(Error_stats)
-        print('time of error check of native model: ', time_model, 'seconds')
-        print('time of error check of trt model: ', time_model_trt, 'seconds')
-        print()
-        
-        # write TRTIS config
-        config_filename = os.path.join(model_folder, "config.pbtxt")
-        self.lib.write_config(config_filename, 
-                              input_shapes, input_types, 
-                              output_shapes, output_types)
-    
-    def name_onnx_nodes(self, model_path):
-        '''
-        Name all unnamed nodes in ONNX model
-            parameter model_path: path  ONNX model
-            return: none
-        '''
-        model = onnx.load(model_path)
-        node_id = 0
-        for node in model.graph.node:
-            if len(node.name) == 0:
-                node.name = "unnamed_node_%d" % node_id
-            node_id += 1
-        # This check partially validates model
-        onnx.checker.check_model(model)
-        onnx.save(model, model_path)
-        # Only inference really checks ONNX model for some issues
-        # like duplicated node names
-        onnxruntime.InferenceSession(model_path, None)
-    
-    def to_triton_onnx(self, dataloader, model):
-        ''' export the model to onnx and test correctness on dataloader '''
-        import onnx as local_onnx
-        global onnx
-        onnx = local_onnx
-        import onnxruntime as local_onnxruntime
-        global onnxruntime
-        onnxruntime = local_onnxruntime
-        # setup device
-        if self.args.triton_no_cuda:
-            device = torch.device('cpu')
-        else:
-            device = torch.device('cuda')
-        
-        if self.args.calibrate:
-            assert self.args.quantize, ("calibrate flag not supported "
-                                        "without quantize")
-        if self.args.quantize:
-           try:
-               from quantize import quantize, QuantizationMode
-           except ImportError as error:
-               print('quantize scripts are not present')
-               raise error
-        
-        if self.args.calibrate:
-            try:
-                import calibrate
-            except ImportError as error:
-                print('calibrate scripts are not present')
-                raise error
-        
-        # prepare model 
-        model.to(device)
-        model.eval()
-        assert not model.training, "internal error - model should be in eval() mode! "
-        
-        # prepare inputs
-        inputs = self.lib.prepare_inputs(dataloader, device)
-        
-        # generate outputs
-        outputs = []
-        for input in inputs:
-            with torch.no_grad():
-                output = model(*input)
-            if type(output) is torch.Tensor:
-                output = [output]
-            outputs.append(output)
-        
-        # generate input shapes - dynamic tensor shape support 
-        input_shapes = self.lib.get_tuple_of_dynamic_shapes(inputs)
-        
-        # generate output shapes - dynamic tensor shape support 
-        output_shapes = self.lib.get_tuple_of_dynamic_shapes(outputs)
-        
-        # generate input types 
-        input_types = [x.dtype for x in inputs[0]]
-        
-        # generate output types
-        output_types = [x.dtype for x in outputs[0]]
-        
-        # get input names
-        rng = range(len(input_types))
-        input_names = ["input__" + str(num) for num in rng]
-        
-        # get output names
-        rng = range(len(output_types))
-        output_names = ["output__" + str(num) for num in rng]
-        
-        # prepare save path
-        model_folder = os.path.join(self.args.save_dir, self.args.triton_model_name)
-        version_folder = os.path.join(model_folder, str(self.args.triton_model_version))
-        if not os.path.exists(version_folder):
-            os.makedirs(version_folder)
-        final_model_path = os.path.join(version_folder, 'model.onnx')
-        
-        # get indices of dynamic input and output shapes
-        dynamic_axes = {}
-        for input_name,input_shape in zip(input_names,input_shapes):
-            dynamic_axes[input_name] = [i for i,x in enumerate(input_shape) if x == -1]
-        for output_name,output_shape in zip(output_names,output_shapes):
-            dynamic_axes[output_name] = [i for i,x in enumerate(output_shape) if x == -1]
-        
-        # export the model
-        assert not model.training, "internal error - model should be in eval() mode! "
-        with torch.no_grad():
-            torch.onnx.export(model, inputs[0], final_model_path, verbose=False, 
-                              input_names=input_names, output_names=output_names, 
-                              dynamic_axes=dynamic_axes, opset_version=11)
-        
-        # syntactic error check
-        converted_model = onnx.load(final_model_path)
-        # check that the IR is well formed
-        onnx.checker.check_model(converted_model)
-
-        # Name unnamed nodes - it helps for some other processing tools
-        self.name_onnx_nodes(final_model_path)
-        converted_model = onnx.load(final_model_path)
-        
-        # quantize model
-        if self.args.quantize:
-            if not self.args.calibrate:
-                quantized_model = quantize(
-                    converted_model,
-                    quantization_mode = QuantizationMode.IntegerOps,
-                )
-                # check that the IR is well formed
-                try:
-                    onnx.checker.check_model(quantized_model)
-                except onnx.onnx_cpp2py_export.checker.ValidationError as error:
-                    # FIXME: It is unclear, why checker fails for quantized model so
-                    # this error is ignored currently. Inference works for
-                    # some quantized models so lets show warning here
-                    print("model check failed with warning: [", error, "]")
-                    print("Warning during onnx.checker.check_model in quantized model ignored")
-                onnx.save(quantized_model, final_model_path)
-            else:
-
-                #assert not self.args.calibrate, 'calibrate flag not supported by ONNX'
-                # Parsing command-line arguments
-                #parser = argparse.ArgumentParser(description='parsing model and test data set paths')
-                #parser.add_argument('--model_path', required=True)
-                #parser.add_argument('--dataset_path', required=True)
-                #parser.add_argument('--output_model_path', type=str, default='calibrated_quantized_model.onnx')
-                #parser.add_argument('--dataset_size', type=int, default=0, help="Number of images or tensors to load. Default is 0 which means all samples")
-                #parser.add_argument('--data_preprocess', type=str, required=True, choices=['preprocess_method1', 'preprocess_method2', 'None'], help="Refer to Readme.md for guidance on choosing this option.")
-                #args = parser.parse_args()
-                #model_path = args.model_path
-                #output_model_path = args.output_model_path
-                #images_folder = args.dataset_path
-                calib_mode = "naive"
-                size_limit = 0 # int(args.dataset_size)
-                
-                # Generating augmented ONNX model
-                # FIXME: use proper temporary file path
-                augmented_model_path = 'augmented_model.onnx'
-                #model = onnx.load(model_path)
-                augmented_model = calibrate.augment_graph(converted_model)
-                onnx.checker.check_model(augmented_model)
-                #onnx.save(augmented_model, final_model_path)
-                onnx.save(augmented_model, augmented_model_path)
-                
-                # Conducting inference
-                #session = onnxruntime.InferenceSession(final_model_path, None)
-                print(augmented_model_path)
-                session = onnxruntime.InferenceSession(augmented_model_path, None)
-                #session = onnxruntime.InferenceSession('augmented_modelv3.onnx', None)
-                (samples, channels, height, width) = session.get_inputs()[0].shape
-                print(session.get_inputs()[0].shape)
-                #return
-                
-                # Generating inputs for quantization
-                #if args.data_preprocess == "None":
-                #    inputs = load_pb_file(images_folder, args.dataset_size, samples, channels, height, width)
-                #else:
-                #    inputs = load_batch(images_folder, height, width, args.data_preprocess, size_limit)
-                
-                import numpy as np
-                inputs_calibrate_tmp = inputs[0][0].cpu().numpy()
-                
-                dict_for_quantization = calibrate.get_intermediate_outputs(
-                    final_model_path,
-                    session,
-                    inputs_calibrate_tmp,
-                    calib_mode,
-                )
-                quantization_params_dict = calibrate.calculate_quantization_params(
-                    augmented_model,
-                    quantization_thresholds = dict_for_quantization,
-                )
-                calibrated_quantized_model = quantize(
-                    converted_model,
-                    quantization_mode = QuantizationMode.QLinearOps,
-                    quantization_params = quantization_params_dict,
-                )
-                onnx.save(calibrated_quantized_model, final_model_path)
-                
-                print("Calibrated, quantized model saved.")
-        
-        # load the model
-        session = onnxruntime.InferenceSession(final_model_path, None)
-        
-        class ONNX_model:
-            def __init__(self, session, input_names, device):
-                self.session = session
-                self.input_names = input_names
-                        
-            def to_numpy(self, tensor):
-                return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
-            
-            def __call__(self, *inputs):
-                inp = [(input_name, inputs[i]) for i,input_name in enumerate(self.input_names)]
-                inp = {input_name : self.to_numpy(x) for input_name,x in inp}
-                outputs = self.session.run(None, inp)
-                outputs = [torch.from_numpy(output) for output in outputs]
-                outputs = [output.to(device) for output in outputs]
-                if len(outputs) == 1:
-                    outputs = outputs[0]
-                return outputs
-        
-        # switch to eval mode
-        model_onnx = ONNX_model(session, input_names, device)
-        
-        # run both models on inputs
-        assert not model.training, "internal error - model should be in eval() mode! "
-        models = (model, model_onnx)
-        outputs, time_model, outputs_onnx, time_model_onnx = self.lib.run_models(models, inputs)
-        
-        # check for errors
-        Error_stats = self.lib.compute_errors(outputs, outputs_onnx)
-        self.lib.print_errors(Error_stats)
-        print('time of error check of native model: ', time_model, 'seconds')
-        print('time of error check of onnx model: ', time_model_onnx, 'seconds')
-        print()
-        
-        # write TRTIS config
-        config_filename = os.path.join(model_folder, "config.pbtxt")
-        self.lib.write_config(config_filename, 
-                              input_shapes, input_types, 
-                              output_shapes, output_types)
-    
-    def to_triton_torchscript(self, dataloader, model):
-        ''' export the model to torchscript and test correctness on dataloader '''
-        # setup device
-        if self.args.triton_no_cuda:
-            device = torch.device('cpu')
-        else:
-            device = torch.device('cuda')
-        
-        # prepare model 
-        model.to(device)
-        model.eval()
-        assert not model.training, "internal error - model should be in eval() mode! "
-
-        #TODO: support quantize
-        assert not self.args.quantize, 'quantize flag not supported by torchscript yet'
-        
-        # prepare inputs
-        inputs = self.lib.prepare_inputs(dataloader, device)
-        
-        # generate input shapes - dynamic tensor shape support 
-        input_shapes = self.lib.get_tuple_of_dynamic_shapes(inputs)
-        
-        # generate input types 
-        input_types = [x.dtype for x in inputs[0]]
-        
-        # prepare save path 
-        model_folder = os.path.join(self.args.save_dir, self.args.triton_model_name)
-        version_folder = os.path.join(model_folder, str(self.args.triton_model_version))
-        if not os.path.exists(version_folder):
-            os.makedirs(version_folder)
-        final_model_path = os.path.join(version_folder, 'model.pt')
-        
-        # convert the model 
-        with torch.no_grad():
-            if self.args.ts_trace: # trace it 
-                model_ts = torch.jit.trace(model, inputs[0])
-            if self.args.ts_script: # script it 
-                model_ts = torch.jit.script(model)
-        
-        # save the model 
-        torch.jit.save(model_ts, final_model_path)
-        
-        # load the model 
-        model_ts = torch.jit.load(final_model_path)
-        model_ts.eval() # WAR for bug : by default, model_ts gets loaded in training mode
-        
-        # run both models on inputs
-        assert not model.training, "internal error - model should be in eval() mode! "
-        assert not model_ts.training, "internal error - converted model should be in eval() mode! "
-        models = (model, model_ts)
-        outputs, time_model, outputs_ts, time_model_ts = self.lib.run_models(models, inputs)
-        
-        # check for errors
-        Error_stats = self.lib.compute_errors(outputs, outputs_ts)
-        self.lib.print_errors(Error_stats)
-        print('time of error check of native model: ', time_model, 'seconds')
-        print('time of error check of ts model: ', time_model_ts, 'seconds')
-        print()
-        
-        # generate output shapes - dynamic tensor shape support 
-        output_shapes = self.lib.get_tuple_of_dynamic_shapes(outputs)
-        
-        # generate output types 
-        output_types = [x.dtype for x in outputs[0]]
-        
-        # now we build the config for TRTIS 
-        config_filename = os.path.join(model_folder, "config.pbtxt")
-        self.lib.write_config(config_filename, 
-                              input_shapes, input_types, 
-                              output_shapes, output_types)
-
- 
--- a/PyTorch/NLP/Transformer/scripts/docker/build.sh
+++ b/PyTorch/NLP/Transformer/scripts/docker/build.sh
-docker build . --network=host -t transformer_pyt
--- a/PyTorch/NLP/Transformer/scripts/docker/launch.sh
+++ b/PyTorch/NLP/Transformer/scripts/docker/launch.sh
-#!/bin/bash
-
-CMD=${1:-/bin/bash}
-NV_VISIBLE_DEVICES=${2:-"0,1,2,3,4,5,6,7,8"}
-DOCKER_BRIDGE=${3:-"host"}
-
-nvidia-docker run -it --rm \
-  --net=$DOCKER_BRIDGE \
-  --shm-size=1g \
-  --ulimit memlock=-1 \
-  --ulimit stack=67108864 \
-  -e NVIDIA_VISIBLE_DEVICES=${NV_VISIBLE_DEVICES} \
-  -v $PWD/results:/results \
-  -v $PWD/data:/data \
-  transformer_pyt $CMD
--- a/PyTorch/NLP/Transformer/scripts/draw_summary.py
+++ b/PyTorch/NLP/Transformer/scripts/draw_summary.py
-import json
-import argparse
-from collections import defaultdict, OrderedDict
-import matplotlib.pyplot as plt
-import numpy as np
-
-def smooth_moving_average(x, n):
-    fil = np.ones(n)/n
-    smoothed = np.convolve(x, fil, mode='valid')
-    smoothed = np.concatenate((x[:n-1], smoothed), axis=0)
-    
-    return smoothed
-
-def moving_stdev(x, n):
-    fil = np.ones(n)/n
-    avg_sqare = np.convolve(np.power(x, 2), fil, mode='valid')
-    squared_avg = np.power(np.convolve(x, fil, mode='valid'), 2)
-    var = avg_sqare - squared_avg
-    stdev = np.sqrt(var)
-    #pad first few values
-    stdev = np.concatenate(([0]*(n-1), stdev), axis=0)
-    
-    return stdev
-
-def get_plot(log):
-    steps = [x[0] for x in log if isinstance(x[0], int)]
-    values = [x[2] for x in log if isinstance(x[0], int)]
-    return steps, values
-
-def highlight_max_point(plot, color):
-    point = max(zip(*plot), key=lambda x: x[1])
-    plt.plot(point[0], point[1], 'bo-', color=color)
-    plt.annotate("{:.2f}".format(point[1]), point)
-    return point
-
-def main(args):
-    jlog = defaultdict(list)
-    jlog['parameters'] = {}
-
-    with open(args.log_file, 'r') as f:
-        for line in f.readlines():
-            line_dict = json.loads(line[5:])
-            if line_dict['type'] == 'LOG':
-                if line_dict['step'] == 'PARAMETER':
-                    jlog['parameters'].update(line_dict['data'])
-                elif line_dict['step'] == [] and 'training_summary' not in jlog:
-                    jlog['training_summary']=line_dict['data']
-                else:
-                    for k, v in line_dict['data'].items():
-                        jlog[k].append((line_dict['step'], line_dict['elapsedtime'], v))
-
-    fig, ax1 = plt.subplots(figsize=(20,5))
-    fig.suptitle(args.title, fontsize=16)
-    ax1.set_xlabel('steps')
-    ax1.set_ylabel('loss')
-
-    # Define colors for specific curves
-    VAL_LOSS_COLOR = 'blue'
-    VAL_BLEU_COLOR = 'red'
-    TEST_BLEU_COLOR = 'pink'
-
-    # Plot smoothed loss curve
-    steps, loss = get_plot(jlog['loss'])
-    smoothed_loss = smooth_moving_average(loss, 150)
-    stdev = moving_stdev(loss, 150)
-
-    ax1.plot(steps, smoothed_loss, label='Training loss')
-    ax1.plot(steps, smoothed_loss + stdev, '--', color='orange', linewidth=0.3, label='Stdev')
-    ax1.plot(steps, smoothed_loss - stdev, '--', color='orange', linewidth=0.3)
-
-    # Plot validation loss curve
-    val_steps, val_loss = get_plot(jlog['val_loss'])
-    ax1.plot(val_steps, val_loss, color='blue', label='Validation loss')
-
-    min_val_loss_step = val_steps[np.argmin(val_loss)]
-    ax1.axvline(min_val_loss_step, linestyle='dashed', color=VAL_LOSS_COLOR, linewidth=0.5, label='Validation loss minimum')
-
-    # Plot BLEU curves
-    ax2 = ax1.twinx()
-    ax2.set_ylabel('BLEU')
-    val_steps, val_bleu = get_plot(jlog['val_bleu'])
-    ax2.plot(val_steps, val_bleu, color=VAL_BLEU_COLOR, label='Validation BLEU')
-    mvb_step, _ =highlight_max_point((val_steps,val_bleu), color=VAL_BLEU_COLOR)
-
-    # values to be labeled on plot
-    max_val_bleu_step = val_steps[np.argmax(val_bleu)]
-    max_val_bleu = val_bleu[val_steps.index(max_val_bleu_step)]
-    min_loss_bleu = val_bleu[val_steps.index(min_val_loss_step)]
-
-
-    if 'test_bleu' in jlog:
-        test_steps, test_bleu = get_plot(jlog['test_bleu'])
-        ax2.plot(val_steps, test_bleu, color=TEST_BLEU_COLOR, label='Test BLEU')
-        highlight_max_point((test_steps, test_bleu), color=TEST_BLEU_COLOR)
-    ax2.tick_params(axis='y')
-
-    # Annotate points with highest BLEU score as well as those for minimal validation loss
-    ax2.plot(min_val_loss_step, min_loss_bleu, 'bo-', color=VAL_BLEU_COLOR)
-    ax2.annotate("{:.2f}".format(min_loss_bleu), (min_val_loss_step, min_loss_bleu))
-
-    if 'test_bleu' in jlog:
-        min_loss_test_bleu = test_bleu[val_steps.index(min_val_loss_step)] #BLEU score on test set when validation loss is minimal
-        ax2.plot(min_val_loss_step, min_loss_test_bleu, 'bo-', color=TEST_BLEU_COLOR)
-        ax2.annotate("{:.2f}".format(min_loss_test_bleu), (min_val_loss_step, min_loss_test_bleu))
-
-        max_val_bleu_test = test_bleu[val_steps.index(max_val_bleu_step)] #BLEU score on test set when BLEU score on dev set is maximal
-        ax2.plot(mvb_step, max_val_bleu_test, 'bo-', color=TEST_BLEU_COLOR)
-        ax2.annotate("{:.2f}".format(max_val_bleu_test), (max_val_bleu_step, max_val_bleu_test))
-
-    ax1.legend(loc='lower left', bbox_to_anchor=(1,0))
-    ax2.legend(loc='upper left', bbox_to_anchor=(1,1))
-    plt.grid()
-    plt.savefig(args.output)
-
-    # Produce json with training summary
-    if args.dump_json:
-        summary = OrderedDict()
-        summary['args'] = OrderedDict(jlog['parameters'])
-        summary['min_val_loss'] = min(val_loss)
-        summary['max_val_bleu'] = max(val_bleu)
-        summary['max_test_bleu'] = max(test_bleu)
-        summary['final_values'] = jlog['training_summary']
-        summary['avg_epoch_loss'] = [x.mean() for x in np.array_split(np.array(loss), jlog['parameters']['max_epoch'])]
-        summary['min_val_loss_step'] = min_val_loss_step
-        json.dump(summary, open(args.dump_json, 'w'))
-
-if __name__=='__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--title', type=str)
-    parser.add_argument('--log-file', type=str)
-    parser.add_argument('--output' ,'-o', type=str)
-    parser.add_argument('--dump-json', '-j', type=str)
-    args = parser.parse_args()
-    main(args)
--- a/PyTorch/NLP/Transformer/scripts/export_model.sh
+++ b/PyTorch/NLP/Transformer/scripts/export_model.sh
-#!/bin/bash
-
-# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License. 
-
-# Positional arguments; each falls back to the default shown when omitted:
-#   $1  NV_VISIBLE_DEVICES         forwarded to scripts/docker/launch.sh
-#   $2  DOCKER_BRIDGE              forwarded to scripts/docker/launch.sh
-#   $3  checkpoint                 passed to the export as --checkpoint
-#   $4  batch_size                 used for --triton-max-batch-size and --batch-size
-#   $5  WORKSPACE                  root of the --save-dir path
-#   $6  triton_model_version
-#   $7  triton_model_name          base name; "-encoder"/"-decoder" is appended below
-#   $8  triton_dyn_batching_delay
-#   $9  triton_engine_count
-#   $10 triton_model_overwrite     NOTE(review): read here but never referenced
-#                                  again in this script
-NV_VISIBLE_DEVICES=${1:-"0"}
-DOCKER_BRIDGE=${2:-"host"}
-checkpoint=${3:-"/checkpoints/checkpoint_jit.pt"}
-batch_size=${4:-"5120"}
-WORKSPACE=${5:-"/workspace/translation"}
-triton_model_version=${6:-1}
-triton_model_name=${7:-"transformer"}
-triton_dyn_batching_delay=${8:-0}
-triton_engine_count=${9:-1}
-triton_model_overwrite=${10:-"False"}
-
-DEPLOYER="deployer.py"
-
-#TODO: add fp16 option
-
-# Deployer options shared by the encoder and decoder exports. The
-# backslash-newlines are inside the double quotes, so CMD is one logical line.
-CMD="python triton/${DEPLOYER} \
-    --ts-script \
-    --save-dir ${WORKSPACE}/triton/triton_models \
-    --triton-model-name ${triton_model_name} \
-    --triton-model-version ${triton_model_version} \
-    --triton-max-batch-size ${batch_size} \
-    --triton-dyn-batching-delay ${triton_dyn_batching_delay} \
-    --triton-engine-count ${triton_engine_count} "
-
-# Each part-specific command appends a second --triton-model-name after the one
-# already present in CMD.
-# NOTE(review): presumably the later occurrence takes effect in deployer.py - confirm.
-ENCODER_EXPORT_CMD="$CMD --triton-model-name ${triton_model_name}-encoder"
-DECODER_EXPORT_CMD="$CMD --triton-model-name ${triton_model_name}-decoder"
-
-# Arguments placed after a bare "--".
-# NOTE(review): presumably these are handed to the model-side parser rather
-# than the deployer's own options - confirm against deployer.py.
-MODEL_ARGS=" -- --checkpoint ${checkpoint} \
-    --batch-size=${batch_size} \
-    --num-batches=2 \
-    --data /data "
-
-ENCODER_EXPORT_CMD+="${MODEL_ARGS} --part encoder"
-DECODER_EXPORT_CMD+="${MODEL_ARGS} --part decoder"
-
-# Run the two exports one after the other through the docker launch helper,
-# passing the full command as a single quoted argument.
-echo Exporting encoder...
-bash scripts/docker/launch.sh "${ENCODER_EXPORT_CMD}" ${NV_VISIBLE_DEVICES} ${DOCKER_BRIDGE}
-echo Exporting decoder...
-bash scripts/docker/launch.sh "${DECODER_EXPORT_CMD}" ${NV_VISIBLE_DEVICES} ${DOCKER_BRIDGE}
--- a/PyTorch/NLP/Transformer/scripts/run_DGX1_AMP_8GPU.sh
+++ b/PyTorch/NLP/Transformer/scripts/run_DGX1_AMP_8GPU.sh
-#! /bin/bash
-#
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Print GPU status before the run.
-nvidia-smi
-
-# Output locations. CHECKPOINTS_DIR is only created here; the training command
-# below is given RESULTS_DIR as its --save-dir.
-RESULTS_DIR='/results'
-CHECKPOINTS_DIR='/results/checkpoints'
-STAT_FILE=${RESULTS_DIR}/DGX1_amp_8GPU.json
-mkdir -p $CHECKPOINTS_DIR
-
-# Positional arguments; each falls back to the default shown when omitted:
-#   $1 seed, $2 learning rate, $3 warmup updates, $4 number of epochs,
-#   $5 value for --max-tokens, $6 processes started on this node.
-SEED=${1:-1}
-LR=${2:-0.000846}
-WARMUP=${3:-4000}
-NUM_EPOCHS=${4:-40}
-BATCH_SIZE=${5:-10240}
-NUM_GPU=${6:-8}
-
-# Left unquoted at the call site on purpose so it splits into separate words.
-DISTRIBUTED="-m torch.distributed.launch --nproc_per_node=${NUM_GPU}"
-
-# Start training through torch.distributed.launch with --amp enabled.
-# Every option sits on its own continuation line: a backslash followed by a
-# space escapes that space instead of continuing the line, which would glue
-# the blank onto the next flag (" --dropout") and stop it being parsed as an
-# option. Keep "\" as the very last character of each line.
-python ${DISTRIBUTED} /workspace/translation/train.py \
-  /data/wmt14_en_de_joined_dict \
-  --arch transformer_wmt_en_de_big_t2t \
-  --share-all-embeddings \
-  --optimizer adam \
-  --adam-betas 0.9 0.997 \
-  --adam-eps 1e-9 \
-  --clip-norm 0.0 \
-  --lr-scheduler inverse_sqrt \
-  --warmup-init-lr 0.0 \
-  --warmup-updates ${WARMUP} \
-  --lr $LR \
-  --min-lr 0.0 \
-  --dropout 0.1 \
-  --weight-decay 0.0 \
-  --criterion label_smoothed_cross_entropy \
-  --label-smoothing 0.1 \
-  --max-tokens ${BATCH_SIZE} \
-  --seed ${SEED} \
-  --max-epoch ${NUM_EPOCHS} \
-  --no-epoch-checkpoints \
-  --fuse-layer-norm \
-  --online-eval \
-  --log-interval 500 \
-  --save-dir ${RESULTS_DIR} \
-  --stat-file ${STAT_FILE} \
-  --amp
--- a/PyTorch/NLP/Transformer/scripts/run_DGX1_FP32_8GPU.sh
+++ b/PyTorch/NLP/Transformer/scripts/run_DGX1_FP32_8GPU.sh
-#! /bin/bash
-#
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Print GPU status before the run.
-nvidia-smi
-
-# Output locations. CHECKPOINTS_DIR is only created here; the training command
-# below is given RESULTS_DIR as its --save-dir.
-RESULTS_DIR='/results'
-CHECKPOINTS_DIR='/results/checkpoints'
-STAT_FILE=${RESULTS_DIR}/DGX1_fp32_8GPU.json
-mkdir -p $CHECKPOINTS_DIR
-
-# Positional arguments; each falls back to the default shown when omitted:
-#   $1 seed, $2 learning rate, $3 warmup updates, $4 number of epochs,
-#   $5 value for --max-tokens, $6 processes started on this node.
-SEED=${1:-1}
-LR=${2:-0.0006}
-WARMUP=${3:-4000}
-NUM_EPOCHS=${4:-40}
-BATCH_SIZE=${5:-5120}
-NUM_GPU=${6:-8}
-
-# Left unquoted at the call site on purpose so it splits into separate words.
-DISTRIBUTED="-m torch.distributed.launch --nproc_per_node=${NUM_GPU}"
-
-# Start training through torch.distributed.launch. No --amp flag is passed in
-# this script. Do not insert comment lines between the continuation lines
-# below: a "#" there would comment out the rest of the joined command.
-python ${DISTRIBUTED} /workspace/translation/train.py \
-  /data/wmt14_en_de_joined_dict \
-  --arch transformer_wmt_en_de_big_t2t \
-  --share-all-embeddings \
-  --optimizer adam \
-  --adam-betas 0.9 0.997 \
-  --adam-eps 1e-9 \
-  --clip-norm 0.0 \
-  --lr-scheduler inverse_sqrt \
-  --warmup-init-lr 0.0 \
-  --warmup-updates ${WARMUP} \
-  --lr $LR \
-  --min-lr 0.0 \
-  --dropout 0.1 \
-  --weight-decay 0.0 \
-  --criterion label_smoothed_cross_entropy \
-  --label-smoothing 0.1 \
-  --max-tokens ${BATCH_SIZE} \
-  --seed ${SEED} \
-  --max-epoch ${NUM_EPOCHS} \
-  --no-epoch-checkpoints \
-  --fuse-layer-norm \
-  --online-eval \
-  --log-interval 500 \
-  --save-dir ${RESULTS_DIR} \
-  --stat-file ${STAT_FILE}
--- a/PyTorch/NLP/Transformer/scripts/run_DGXA100_AMP_8GPU.sh
+++ b/PyTorch/NLP/Transformer/scripts/run_DGXA100_AMP_8GPU.sh
-#! /bin/bash
-#
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Print GPU status before the run.
-nvidia-smi
-
-# Output locations. CHECKPOINTS_DIR is only created here; the training command
-# below is given RESULTS_DIR as its --save-dir.
-RESULTS_DIR='/results'
-CHECKPOINTS_DIR='/results/checkpoints'
-STAT_FILE=${RESULTS_DIR}/DGXA100_amp_8GPU_log.json
-mkdir -p $CHECKPOINTS_DIR
-
-# Positional arguments; each falls back to the default shown when omitted:
-#   $1 seed, $2 learning rate, $3 warmup updates, $4 number of epochs,
-#   $5 value for --max-tokens, $6 processes started on this node.
-SEED=${1:-1}
-LR=${2:-0.000846}
-WARMUP=${3:-4000}
-NUM_EPOCHS=${4:-40}
-BATCH_SIZE=${5:-10240}
-NUM_GPU=${6:-8}
-
-# Left unquoted at the call site on purpose so it splits into separate words.
-DISTRIBUTED="-m torch.distributed.launch --nproc_per_node=${NUM_GPU}"
-
-# Start training through torch.distributed.launch with --amp enabled.
-# Do not insert comment lines between the continuation lines below: a "#"
-# there would comment out the rest of the joined command.
-python ${DISTRIBUTED} /workspace/translation/train.py \
-  /data/wmt14_en_de_joined_dict \
-  --arch transformer_wmt_en_de_big_t2t \
-  --share-all-embeddings \
-  --optimizer adam \
-  --adam-betas 0.9 0.997 \
-  --adam-eps 1e-9 \
-  --clip-norm 0.0 \
-  --lr-scheduler inverse_sqrt \
-  --warmup-init-lr 0.0 \
-  --warmup-updates ${WARMUP} \
-  --lr $LR \
-  --min-lr 0.0 \
-  --dropout 0.1 \
-  --weight-decay 0.0 \
-  --criterion label_smoothed_cross_entropy \
-  --label-smoothing 0.1 \
-  --max-tokens ${BATCH_SIZE} \
-  --seed ${SEED} \
-  --max-epoch ${NUM_EPOCHS} \
-  --no-epoch-checkpoints \
-  --fuse-layer-norm \
-  --online-eval \
-  --log-interval 500 \
-  --save-dir ${RESULTS_DIR} \
-  --stat-file ${STAT_FILE} \
-  --amp
--- a/PyTorch/NLP/Transformer/scripts/run_DGXA100_TF32_8GPU.sh
+++ b/PyTorch/NLP/Transformer/scripts/run_DGXA100_TF32_8GPU.sh
-#! /bin/bash
-#
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Print GPU status before the run.
-nvidia-smi
-
-# Output locations. CHECKPOINTS_DIR is only created here; the training command
-# below is given RESULTS_DIR as its --save-dir.
-RESULTS_DIR='/results'
-CHECKPOINTS_DIR='/results/checkpoints'
-STAT_FILE=${RESULTS_DIR}/DGXA100_tf32_8GPU_log.json
-mkdir -p $CHECKPOINTS_DIR
-
-# Positional arguments; each falls back to the default shown when omitted:
-#   $1 precision selector ('tf32' by default; 'fp32' is the only other value
-#      the script distinguishes), $2 seed, $3 learning rate, $4 warmup updates,
-#   $5 number of epochs, $6 value for --max-tokens, $7 processes on this node.
-PREC=${1:-'tf32'}
-SEED=${2:-1}
-LR=${3:-0.000846}
-WARMUP=${4:-4000}
-NUM_EPOCHS=${5:-40}
-BATCH_SIZE=${6:-10240}
-NUM_GPU=${7:-8}
-
-# Left unquoted at the call site on purpose so it splits into separate words.
-DISTRIBUTED="-m torch.distributed.launch --nproc_per_node=${NUM_GPU}"
-
-# The only effect of selecting fp32 is exporting NVIDIA_TF32_OVERRIDE=0 to the
-# training process.
-# NOTE(review): presumably this disables TF32 math in the NVIDIA libraries -
-# confirm against NVIDIA documentation.
-# NOTE(review): both branches reset PREC to an empty string and PREC is not
-# referenced again in this script, so the else branch has no visible effect.
-if [ "$PREC" = "fp32" ];
-then
-    PREC=''
-    export NVIDIA_TF32_OVERRIDE=0
-else
-    PREC=''
-fi
-
-# Start training through torch.distributed.launch. No --amp flag is passed in
-# this script. Do not insert comment lines between the continuation lines
-# below: a "#" there would comment out the rest of the joined command.
-python ${DISTRIBUTED} /workspace/translation/train.py \
-  /data/wmt14_en_de_joined_dict \
-  --arch transformer_wmt_en_de_big_t2t \
-  --share-all-embeddings \
-  --optimizer adam \
-  --adam-betas 0.9 0.997 \
-  --adam-eps 1e-9 \
-  --clip-norm 0.0 \
-  --lr-scheduler inverse_sqrt \
-  --warmup-init-lr 0.0 \
-  --warmup-updates ${WARMUP} \
-  --lr $LR \
-  --min-lr 0.0 \
-  --dropout 0.1 \
-  --weight-decay 0.0 \
-  --criterion label_smoothed_cross_entropy \
-  --label-smoothing 0.1 \
-  --max-tokens ${BATCH_SIZE} \
-  --seed ${SEED} \
-  --max-epoch ${NUM_EPOCHS} \
-  --no-epoch-checkpoints \
-  --fuse-layer-norm \
-  --online-eval \
-  --log-interval 500 \
-  --save-dir ${RESULTS_DIR} \
-  --stat-file ${STAT_FILE}
--- a/PyTorch/NLP/Transformer/scripts/run_preprocessing.sh
+++ b/PyTorch/NLP/Transformer/scripts/run_preprocessing.sh
-# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# DATASET_DIR receives the preprocessed output; TEXT is where the preparation
-# step below is expected to leave the train/valid/test files it reads.
-DATASET_DIR=/data/wmt14_en_de_joined_dict
-TEXT=examples/translation/wmt14_en_de
-
-# Run the data preparation script from inside examples/translation. The
-# parentheses start a subshell, so the working directory of the rest of this
-# script is unchanged. All paths are relative to the directory the script is
-# started from.
-(
-  cd examples/translation
-  bash prepare-wmt14en2de.sh --scaling18
-)
-
-# Preprocess the en->de train/valid/test prefixes into DATASET_DIR using a
-# joined dictionary, with 33712 given for both --nwordssrc and --nwordstgt.
-python preprocess.py \
-  --source-lang en \
-  --target-lang de \
-  --trainpref $TEXT/train \
-  --validpref $TEXT/valid \
-  --testpref $TEXT/test \
-  --destdir ${DATASET_DIR} \
-  --nwordssrc 33712 \
-  --nwordstgt 33712 \
-  --joined-dictionary
-
-# Copy the "code" file and the raw German validation text next to the
-# preprocessed data, then write sacrebleu's output for wmt14/full en-de to
-# test.raw.de.
-# NOTE(review): "--echo ref" presumably prints the reference translations -
-# confirm against the installed sacrebleu version.
-cp $TEXT/code $DATASET_DIR/code
-cp $TEXT/tmp/valid.raw.de $DATASET_DIR/valid.raw.de
-sacrebleu -t wmt14/full -l en-de --echo ref > $DATASET_DIR/test.raw.de
--- a/PyTorch/NLP/Transformer/scripts/run_training.sh
+++ b/PyTorch/NLP/Transformer/scripts/run_training.sh
-#! /bin/bash
-#
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Print GPU status before the run.
-nvidia-smi
-
-# Output locations. CHECKPOINTS_DIR is only created here; the training command
-# below is given RESULTS_DIR as its --save-dir and is also passed --no-save.
-RESULTS_DIR='/results'
-CHECKPOINTS_DIR='/results/checkpoints'
-STAT_FILE=${RESULTS_DIR}/run_log.json
-mkdir -p $CHECKPOINTS_DIR
-
-# Configuration comes from environment variables, not positional arguments.
-# Each ": ${VAR:=default}" assigns the default only when VAR is unset or empty.
-: ${PREC:='amp'}
-: ${SEED:=1}
-: ${LR:=0.000846}
-: ${WARMUP:=4000}
-: ${NUM_EPOCHS:=40}
-: ${BS:=5120}
-: ${NUM_GPU:=8}
-: ${USE_SLURM:=0}
-: ${USE_DISTRIBUTED:=1}
-
-# Build the launcher options. With USE_DISTRIBUTED=1 the run goes through
-# torch.distributed.launch; when USE_SLURM=1 as well, multi-node options are
-# appended from WORLD_SIZE, SLURM_NODEID, MASTER_ADDR and MASTER_PORT. This
-# script does not set those four variables, so they must already be present in
-# the environment. With USE_DISTRIBUTED unset to 1, DISTRIBUTED stays empty.
-DISTRIBUTED=""
-[ ${USE_DISTRIBUTED} = 1 ] && DISTRIBUTED+="-m torch.distributed.launch --nproc_per_node=${NUM_GPU}"
-[ ${USE_DISTRIBUTED} = 1 ] && [ ${USE_SLURM} = 1 ] && DISTRIBUTED+=" --nnodes ${WORLD_SIZE} --node_rank ${SLURM_NODEID}  \
-            --master_addr ${MASTER_ADDR} --master_port ${MASTER_PORT} "
-
-# Turn the precision selector into a command-line flag: "amp" becomes --amp,
-# any other value results in no extra flag.
-if [ "$PREC" = "amp" ];
-then
-    PREC='--amp '
-else
-    PREC=''
-fi
-
-# Start training on /data/. PREC is expanded unquoted as the last word, so an
-# empty value adds no argument. Do not insert comment lines between the
-# continuation lines below: a "#" there would comment out the rest of the
-# joined command.
-python ${DISTRIBUTED} /workspace/translation/train.py \
-  /data/ \
-  --arch transformer_wmt_en_de_big_t2t \
-  --share-all-embeddings \
-  --optimizer adam \
-  --adam-betas 0.9 0.997 \
-  --adam-eps 1e-9 \
-  --clip-norm 0.0 \
-  --lr-scheduler inverse_sqrt \
-  --warmup-init-lr 0.0 \
-  --warmup-updates ${WARMUP} \
-  --lr $LR \
-  --min-lr 0.0 \
-  --dropout 0.1 \
-  --weight-decay 0.0 \
-  --criterion label_smoothed_cross_entropy \
-  --label-smoothing 0.1 \
-  --max-tokens ${BS} \
-  --seed ${SEED} \
-  --max-epoch ${NUM_EPOCHS} \
-  --no-save \
-  --fuse-layer-norm \
-  --online-eval \
-  --log-interval 500 \
-  --save-dir ${RESULTS_DIR} \
-  --stat-file ${STAT_FILE} \
-  ${PREC}
--- a/PyTorch/NLP/Transformer/setup.py
+++ b/PyTorch/NLP/Transformer/setup.py
-#!/usr/bin/env python3
-# Copyright (c) 2017-present, Facebook, Inc.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the LICENSE file in
-# the root directory of this source tree. An additional grant of patent rights
-# can be found in the PATENTS file in the same directory.
-#
-#-------------------------------------------------------------------------
-#
-# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from setuptools import setup, find_packages, Extension
-from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CppExtension
-import sys
-
-
-# Refuse to run on anything older than Python 3.
-if sys.version_info[0] < 3:
-    sys.exit('Sorry, Python3 is required for fairseq.')
-
-
-def _read_text(path):
-    """Return the entire contents of the text file at ``path``."""
-    with open(path) as handle:
-        return handle.read()
-
-
-# Metadata files are looked up relative to the current working directory.
-readme = _read_text('README.md')
-license = _read_text('LICENSE')
-reqs = _read_text('requirements.txt')
-
-
-# One "-gencode" pair per (compute capability, code target) combination, in
-# the order 70/sm, 70/compute, 80/sm, 80/compute.
-gencode_flags = []
-for capability in ('70', '80'):
-    for target in ('sm', 'compute'):
-        gencode_flags += [
-            '-gencode',
-            'arch=compute_{0},code={1}_{0}'.format(capability, target),
-        ]
-
-# Compiler flags keyed by toolchain: host C++ compiler and nvcc.
-extra_compile_args = {
-    'cxx': ['-O2'],
-    'nvcc': [
-        '-O3',
-        '-I./cutlass/',
-        '-U__CUDA_NO_HALF_OPERATORS__',
-        '-U__CUDA_NO_HALF_CONVERSIONS__',
-    ] + gencode_flags,
-}
-
-# CUDA extension built from the strided batched GEMM sources.
-strided_batched_gemm = CUDAExtension(
-    name='strided_batched_gemm',
-    sources=[
-        'fairseq/modules/strided_batched_gemm/strided_batched_gemm.cpp',
-        'fairseq/modules/strided_batched_gemm/strided_batched_gemm_cuda.cu',
-    ],
-    extra_compile_args=extra_compile_args,
-)
-
-# C++-only extension exposed as fairseq.data.batch_C.
-batch_utils = CppExtension(
-    name='fairseq.data.batch_C',
-    sources=['fairseq/data/csrc/make_batches.cpp'],
-    extra_compile_args={'cxx': ['-O2']},
-)
-
-# Requirements are taken one per line from requirements.txt.
-install_requires = reqs.strip().split('\n')
-
-setup(
-    name='fairseq',
-    version='0.5.0',
-    description='Facebook AI Research Sequence-to-Sequence Toolkit',
-    long_description=readme,
-    license=license,
-    install_requires=install_requires,
-    packages=find_packages(),
-    ext_modules=[strided_batched_gemm, batch_utils],
-    # Extensions are built with ninja turned off.
-    cmdclass={'build_ext': BuildExtension.with_options(use_ninja=False)},
-    test_suite='tests',
-)