Commit 01bc05b7 authored by Pan,Huiwen

update GNMT-v2

parent 20291e9d
# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3
FROM ${FROM_IMAGE_NAME}

ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8

RUN pip install --global-option="--cpp_ext" --global-option="--cuda_ext" git+https://github.com/NVIDIA/apex.git#egg=apex

WORKDIR /workspace/gnmt

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install --no-cache-dir git+https://github.com/NVIDIA/dllogger.git#egg=dllogger

ADD . /workspace/gnmt
MIT License

Copyright (c) 2017 Elad Hoffer
Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......
# GNMT v2 For PyTorch

This repository provides a script and recipe to train the GNMT v2 model to
achieve state of the art accuracy, and is tested and maintained by NVIDIA.

## Table Of Contents

<!-- TOC GFM -->

* [Model overview](#model-overview)
  * [Model architecture](#model-architecture)
  * [Default configuration](#default-configuration)
  * [Feature support matrix](#feature-support-matrix)
    * [Features](#features)
  * [Mixed precision training](#mixed-precision-training)
    * [Enabling mixed precision](#enabling-mixed-precision)
    * [Enabling TF32](#enabling-tf32)
* [Setup](#setup)
  * [Requirements](#requirements)
* [Quick Start Guide](#quick-start-guide)
* [Advanced](#advanced)
  * [Scripts and sample code](#scripts-and-sample-code)
  * [Parameters](#parameters)
  * [Command-line options](#command-line-options)
  * [Getting the data](#getting-the-data)
    * [Dataset guidelines](#dataset-guidelines)
  * [Training process](#training-process)
  * [Inference process](#inference-process)
* [Performance](#performance)
  * [Benchmarking](#benchmarking)
    * [Training performance benchmark](#training-performance-benchmark)
    * [Inference performance benchmark](#inference-performance-benchmark)
  * [Results](#results)
    * [Training accuracy results](#training-accuracy-results)
      * [Training accuracy: NVIDIA DGX A100 (8x A100 40GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-40gb)
      * [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb)
      * [Training accuracy: NVIDIA DGX-2H (16x V100 32GB)](#training-accuracy-nvidia-dgx-2h-16x-v100-32gb)
      * [Training stability test](#training-stability-test)
    * [Training throughput results](#training-throughput-results)
      * [Training throughput: NVIDIA DGX A100 (8x A100 40GB)](#training-throughput-nvidia-dgx-a100-8x-a100-40gb)
      * [Training throughput: NVIDIA DGX-1 (8x V100 16GB)](#training-throughput-nvidia-dgx-1-8x-v100-16gb)
      * [Training throughput: NVIDIA DGX-2H (16x V100 32GB)](#training-throughput-nvidia-dgx-2h-16x-v100-32gb)
    * [Inference accuracy results](#inference-accuracy-results)
      * [Inference accuracy: NVIDIA A100 40GB](#inference-accuracy-nvidia-a100-40gb)
      * [Inference accuracy: NVIDIA Tesla V100 16GB](#inference-accuracy-nvidia-tesla-v100-16gb)
      * [Inference accuracy: NVIDIA T4](#inference-accuracy-nvidia-t4)
    * [Inference throughput results](#inference-throughput-results)
      * [Inference throughput: NVIDIA A100 40GB](#inference-throughput-nvidia-a100-40gb)
      * [Inference throughput: NVIDIA T4](#inference-throughput-nvidia-t4)
    * [Inference latency results](#inference-latency-results)
      * [Inference latency: NVIDIA A100 40GB](#inference-latency-nvidia-a100-40gb)
      * [Inference latency: NVIDIA T4](#inference-latency-nvidia-t4)
* [Release notes](#release-notes)
  * [Changelog](#changelog)
  * [Known issues](#known-issues)

<!-- /TOC -->

## Model overview

The GNMT v2 model is similar to the one discussed in the [Google's Neural
Machine Translation System: Bridging the Gap between Human and Machine
Translation](https://arxiv.org/abs/1609.08144) paper.

The most important difference between the two models is in the attention
mechanism. In our model, the output from the first LSTM layer of the decoder
goes into the attention module, then the re-weighted context is concatenated
with inputs to all subsequent LSTM layers in the decoder at the current
time step.

The same attention mechanism is also implemented in the default GNMT-like
models from [TensorFlow Neural Machine Translation
Tutorial](https://github.com/tensorflow/nmt) and [NVIDIA OpenSeq2Seq
Toolkit](https://github.com/NVIDIA/OpenSeq2Seq).
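
To make the wiring concrete, here is a minimal sketch of the data flow
described above. It is illustrative only: `dot_attention` is a plain
dot-product stand-in for the model's normalized Bahdanau attention, and the
class names and shapes are hypothetical, not the actual modules in
`seq2seq/model`:

```
import torch
import torch.nn as nn

def dot_attention(query, keys):
    # plain dot-product attention, a stand-in for the model's
    # normalized Bahdanau attention
    scores = torch.bmm(query, keys.transpose(1, 2))   # (batch, tgt_len, src_len)
    weights = torch.softmax(scores, dim=-1)
    return torch.bmm(weights, keys)                   # (batch, tgt_len, hidden)

class GNMTv2DecoderSketch(nn.Module):
    """Illustrative wiring of gnmt_v2 attention, not the repository's decoder."""

    def __init__(self, hidden=1024, num_layers=4):
        super().__init__()
        self.first_lstm = nn.LSTM(hidden, hidden, batch_first=True)
        # upper layers consume their input concatenated with the attention context
        self.upper_lstms = nn.ModuleList(
            nn.LSTM(2 * hidden, hidden, batch_first=True)
            for _ in range(num_layers - 1)
        )

    def forward(self, x, encoder_out):
        h, _ = self.first_lstm(x)                 # 1) first decoder LSTM layer
        context = dot_attention(h, encoder_out)   # 2) its output queries attention
        for i, lstm in enumerate(self.upper_lstms):
            # 3) re-weighted context is concatenated with every upper layer's input
            out, _ = lstm(torch.cat((h, context), dim=2))
            h = out + h if i > 0 else out         # residuals start from the 3rd layer
        return h
```
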
### Model architecture

![ModelArchitecture](./img/diagram.png)

### Default configuration
The following features were implemented in this model:
* general:
  * encoder and decoder are using shared embeddings
  * data-parallel multi-GPU training
  * dynamic loss scaling with backoff for Tensor Cores (mixed precision)
    training
  * trained with label smoothing loss (smoothing factor 0.1)
* encoder:
  * 4-layer LSTM, hidden size 1024, first layer is bidirectional, the rest are
    unidirectional
  * with residual connections starting from 3rd layer
  * uses standard PyTorch nn.LSTM layer
  * dropout is applied on input to all LSTM layers, probability of dropout is
    set to 0.2
  * hidden state of LSTM layers is initialized with zeros
  * weights and bias of LSTM layers is initialized with uniform(-0.1,0.1)
    distribution
* decoder:
  * 4-layer unidirectional LSTM with hidden size 1024 and fully-connected
    classifier
  * with residual connections starting from 3rd layer
  * uses standard PyTorch nn.LSTM layer
  * dropout is applied on input to all LSTM layers, probability of dropout is
    set to 0.2
  * hidden state of LSTM layers is initialized with zeros
  * weights and bias of LSTM layers is initialized with uniform(-0.1,0.1)
    distribution
  * weights and bias of fully-connected classifier is initialized with
    uniform(-0.1,0.1) distribution
* attention:
  * normalized Bahdanau attention
  * output from first LSTM layer of decoder goes into attention, then
    re-weighted context is concatenated with the input to all subsequent LSTM
    layers of the decoder at the current timestep
  * linear transform of keys and queries is initialized with uniform(-0.1,0.1),
    normalization scalar is initialized with 1.0/sqrt(1024), normalization
    bias is initialized with zero
* inference:
  * beam search with default beam size of 5
  * with coverage penalty and length normalization, coverage penalty factor is
    set to 0.1, length normalization factor is set to 0.6 and length
    normalization constant is set to 5.0 (see the worked example after this
    list)
  * de-tokenized BLEU computed by
    [SacreBLEU](https://github.com/mjpost/sacrebleu)
  * [motivation](https://github.com/mjpost/sacrebleu#motivation) for choosing
    SacreBLEU
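
As an illustration of the length normalization above, the following is a small
sketch of the GNMT-style length penalty from the
[paper](https://arxiv.org/abs/1609.08144), plugged with the defaults listed
here (constant 5.0, factor 0.6). It is a simplified stand-in for the scoring
in `seq2seq/inference/beam_search.py` (the coverage penalty is omitted), not a
copy of it:

```
def length_penalty(length, norm_const=5.0, norm_factor=0.6):
    # GNMT-style length normalization:
    # lp(Y) = ((norm_const + |Y|) / (norm_const + 1)) ** norm_factor
    return ((norm_const + length) / (norm_const + 1.0)) ** norm_factor

# Candidate scores are divided by the penalty, so longer hypotheses are
# not unfairly dominated by shorter ones:
log_prob = -4.2                                # hypothetical sum of token log-probs
score = log_prob / length_penalty(length=12)   # ≈ -2.2484
```
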
When comparing the BLEU score, there are various tokenization approaches and
BLEU calculation methodologies; therefore, ensure you align similar metrics.
Code from this repository can be used to train a larger, 8-layer GNMT v2 model.
Our experiments show that a 4-layer model is significantly faster to train and
yields comparable accuracy on the public [WMT16
English-German](http://www.statmt.org/wmt16/translation-task.html) dataset. The
number of LSTM layers is controlled by the `--num-layers` parameter in the
`train.py` training script.
### Feature support matrix
The following features are supported by this model.
| **Feature** | **GNMT v2** |
|:------------|------------:|
|[Apex AMP](https://nvidia.github.io/apex/amp.html) | Yes |
|[Apex DistributedDataParallel](https://nvidia.github.io/apex/parallel.html#apex.parallel.DistributedDataParallel) | Yes |
#### Features
[Apex AMP](https://nvidia.github.io/apex/amp.html) - a tool that enables Tensor
Core-accelerated training. Refer to the [Enabling mixed
precision](#enabling-mixed-precision) section for more details.
[Apex
DistributedDataParallel](https://nvidia.github.io/apex/parallel.html#apex.parallel.DistributedDataParallel) -
a module wrapper that enables easy multiprocess distributed data parallel
training, similar to
[torch.nn.parallel.DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel).
`DistributedDataParallel` is optimized for use with
[NCCL](https://github.com/NVIDIA/nccl). It achieves high performance by
overlapping communication with computation during `backward()` and bucketing
smaller gradient transfers to reduce the total number of transfers required.
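
As a brief illustration of the feature above, a model can be wrapped in Apex
`DistributedDataParallel` in a few lines; a minimal sketch, assuming one
process per GPU with the process group environment set up by
`torch.distributed.launch` and a placeholder network:

```
import torch
import torch.nn as nn
from apex.parallel import DistributedDataParallel as DDP

# one process per GPU; torch.distributed.launch provides the rank and
# world-size environment variables consumed here
torch.distributed.init_process_group(backend='nccl')

model = nn.Linear(1024, 1024).cuda()   # placeholder network
model = DDP(model)                     # gradients are all-reduced across
                                       # ranks during backward()
```
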
### Mixed precision training
Mixed precision is the combined use of different numerical precisions in a
computational method.
[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant
computational speedup by performing operations in half-precision format, while
storing minimal information in single-precision to retain as much information
as possible in critical parts of the network. Since the introduction of [Tensor
Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with
both the Turing and Ampere architectures, significant training speedups are
experienced by switching to mixed precision -- up to 3x overall speedup on the
most arithmetically intense model architectures. Using mixed precision training
previously required two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Manually adding loss scaling to preserve small gradient values.
The ability to train deep learning networks with lower precision was introduced
in the Pascal architecture and first supported in [CUDA
8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep
Learning SDK.
For information about:
* How to train using mixed precision, see the [Mixed Precision
Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed
Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html)
documentation.
* Techniques used for mixed precision training, see the [Mixed-Precision
Training of Deep Neural
Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/)
blog.
* APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy
Mixed-Precision Training in
PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/)
blog.
#### Enabling mixed precision
Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision
(AMP) library from [APEX](https://github.com/NVIDIA/apex), which casts variables
to half-precision upon retrieval, while storing variables in single-precision
format. Furthermore, to preserve small gradient magnitudes in backpropagation,
a [loss
scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling)
step must be included when applying gradients. In PyTorch, loss scaling can be
easily applied by using the `scale_loss()` method provided by AMP. The scaling
value to be used can be
[dynamic](https://nvidia.github.io/apex/amp.html#apex.amp.initialize) or fixed.
For an in-depth walkthrough of AMP, check out the sample usage
[here](https://nvidia.github.io/apex/amp.html#).
[APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains
utility libraries, such as AMP, which require minimal network code changes to
leverage Tensor Cores performance.
The following steps were needed to enable mixed precision training in GNMT:
* Import AMP from APEX (file: `seq2seq/train/trainer.py`):
```
from apex import amp
```
* Initialize AMP and wrap the model and the optimizer (file:
`seq2seq/train/trainer.py`, class: `Seq2SeqTrainer`):
```
self.model, self.optimizer = amp.initialize(
self.model,
self.optimizer,
cast_model_outputs=torch.float16,
keep_batchnorm_fp32=False,
opt_level='O2')
```
* Apply `scale_loss` context manager (file: `seq2seq/train/fp_optimizers.py`,
class: `AMPOptimizer`):
```
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
```
* Apply gradient clipping on single precision master weights (file:
`seq2seq/train/fp_optimizers.py`, class: `AMPOptimizer`):
```
if self.grad_clip != float('inf'):
clip_grad_norm_(amp.master_params(optimizer), self.grad_clip)
```
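
Putting the steps above together, a single training iteration with AMP looks
roughly as follows. This is a condensed, hypothetical sketch of the pattern
shown above, with a placeholder network, batch and loss, not the repository's
actual trainer loop:

```
import torch
from apex import amp
from torch.nn.utils import clip_grad_norm_

model = torch.nn.Linear(1024, 1024).cuda()        # placeholder network
optimizer = torch.optim.Adam(model.parameters(), lr=2e-3)
grad_clip = 5.0                                   # matches the --grad-clip default

model, optimizer = amp.initialize(
    model, optimizer, opt_level='O2',
    cast_model_outputs=torch.float16, keep_batchnorm_fp32=False)

src = torch.randn(128, 1024).cuda()               # stand-in for a real batch
loss = model(src).float().pow(2).mean()           # stand-in for the real loss

optimizer.zero_grad()
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()                        # backward on the scaled loss
if grad_clip != float('inf'):
    clip_grad_norm_(amp.master_params(optimizer), grad_clip)
optimizer.step()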
#### Enabling TF32
TensorFloat-32 (TF32) is the new math mode in [NVIDIA
A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the
matrix math, also called tensor operations. TF32 running on Tensor Cores in A100
GPUs can provide up to 10x speedups compared to single-precision floating-point
math (FP32) on Volta GPUs.
TF32 Tensor Cores can speed up networks using FP32, typically with no loss of
accuracy. It is more robust than FP16 for models which require high dynamic
range for weights or activations.
For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates
AI Training, HPC up to
20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/)
blog post.
TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by
default.
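
TF32 behavior can also be inspected or toggled from Python. A minimal sketch,
assuming a PyTorch build that exposes the TF32 switches (PyTorch 1.7 and
later, and NGC containers with Ampere support); defaults vary by PyTorch
version:

```
import torch

torch.backends.cuda.matmul.allow_tf32 = True   # FP32 matmuls may run in TF32
torch.backends.cudnn.allow_tf32 = True         # cuDNN convolutions may use TF32

# set both flags to False to force strict FP32 arithmetic
```
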
## Setup
The following section lists the requirements in order to start training the
GNMT v2 model.
### Requirements
This repository contains `Dockerfile` which extends the PyTorch NGC container
and encapsulates some dependencies. Aside from these dependencies, ensure you
have the following components:
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
* [PyTorch 20.06-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch)
* GPU architecture:
* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
* [NVIDIA Turing](https://www.nvidia.com/en-us/geforce/turing/)
* [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
For more information about how to get started with NGC containers, see the
following sections from the NVIDIA GPU Cloud Documentation and the Deep
Learning DGX Documentation:
* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html),
* [Accessing And Pulling From The NGC container registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry),
* [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running).
For those unable to use the PyTorch NGC container, to set up the required
environment or create your own container, see the versioned [NVIDIA Container
Support
Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
## Quick Start Guide
To train your model using mixed or TF32 precision with Tensor Cores or using
FP32, perform the following steps using the default parameters of the GNMT v2
model on the WMT16 English-German dataset. For the specifics concerning
training and inference, see the [Advanced](#advanced) section.
**1. Clone the repository.**
```
git clone https://github.com/NVIDIA/DeepLearningExamples
cd DeepLearningExamples/PyTorch/Translation/GNMT
```
**2. Build the GNMT v2 Docker container.**
```
bash scripts/docker/build.sh
```
**3. Start an interactive session in the container to run training/inference.**
```
bash scripts/docker/interactive.sh
```
**4. Download and preprocess the dataset.**
Data will be downloaded to the `data` directory (on the host). The `data`
directory is mounted to the `/workspace/gnmt/data` location in the Docker
container.
```
bash scripts/wmt16_en_de.sh
```
**5. Start training.**
The training script saves only one checkpoint with the lowest value of the loss
function on the validation dataset. All results and logs are saved to the
`gnmt` directory (on the host) or to the `/workspace/gnmt/gnmt` directory
(in the container). By default, the `train.py` script will launch mixed
precision training with Tensor Cores. You can change this behavior by setting:
* the `--math fp32` flag to launch single precision training (for NVIDIA Volta
and NVIDIA Turing architectures) or
* the `--math tf32` flag to launch TF32 training with Tensor Cores (for NVIDIA
Ampere architecture)
for the `train.py` training script.
To launch mixed precision training on 1, 4 or 8 GPUs, run:
```
python3 -m torch.distributed.launch --nproc_per_node=<#GPUs> train.py --seed 2 --train-global-batch-size 1024
```
To launch mixed precision training on 16 GPUs, run:
```
python3 -m torch.distributed.launch --nproc_per_node=16 train.py --seed 2 --train-global-batch-size 2048
```
By default, the training script will launch training with batch size 128 per
GPU. If `--train-global-batch-size` is specified and is larger than 128 times
the number of GPUs available for the training, then the training script will
accumulate gradients over consecutive iterations and then perform the weight
update. For example, 1 GPU training with `--train-global-batch-size 1024` will
accumulate gradients over 8 iterations before doing the weight update with
accumulated gradients.
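
The number of accumulation steps follows the formula given in the
`--train-global-batch-size` help text; a quick check of the example above:

```
# train_iter_size = train_global_batch_size // (train_batch_size * world_size)
train_global_batch_size = 1024
train_batch_size = 128   # per-GPU batch size
world_size = 1           # number of GPUs

train_iter_size = train_global_batch_size // (train_batch_size * world_size)
print(train_iter_size)   # 8 -> gradients are accumulated over 8 iterations
```
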
**6. Start evaluation.**
The training process automatically runs evaluation and outputs the BLEU score
after each training epoch. Additionally, after the training is done, you can
manually run inference on the test dataset with the checkpoint saved during the
training.
To launch FP16 inference on the `newstest2014.en` test set, run:
```
python3 translate.py \
--input data/wmt16_de_en/newstest2014.en \
--reference data/wmt16_de_en/newstest2014.de \
--output /tmp/output \
--model gnmt/model_best.pth
```
The script will load the checkpoint specified by the `--model` option, then it
will launch inference on the file specified by the `--input` option, and
compute BLEU score against the reference translation specified by the
`--reference` option. Outputs will be stored to the location specified by the
`--output` option.
Additionally, one can pass the input text directly from the command-line:
```
python3 translate.py \
--input-text "The quick brown fox jumps over the lazy dog" \
--model gnmt/model_best.pth
```
Translated output will be printed to the console:
```
(...)
0: Translated output:
Der schnelle braune Fuchs springt über den faulen Hund
```
By default, the `translate.py` script will launch FP16 inference with Tensor
Cores. You can change this behavior by setting:
* the `--math fp32` flag to launch single precision inference (for NVIDIA Volta
and NVIDIA Turing architectures) or
* the `--math tf32` flag to launch TF32 inference with Tensor Cores (for NVIDIA
Ampere architecture)
for the `translate.py` inference script.
## Advanced
The following sections provide greater details of the dataset, running training
and inference, and the training results.
### Scripts and sample code
In the `root` directory, the most important files are:
* `train.py`: serves as the entry point to launch the training
* `translate.py`: serves as the entry point to launch inference
* `Dockerfile`: container with the basic set of dependencies to run GNMT v2
* `requirements.txt`: set of extra requirements for running GNMT v2
The `seq2seq/model` directory contains the implementation of GNMT v2 building
blocks:
* `attention.py`: implementation of normalized Bahdanau attention
* `encoder.py`: implementation of recurrent encoder
* `decoder.py`: implementation of recurrent decoder with attention
* `seq2seq_base.py`: base class for seq2seq models
* `gnmt.py`: implementation of GNMT v2 model
The `seq2seq/train` directory encapsulates the necessary tools to execute
training:
* `trainer.py`: implementation of training loop
* `smoothing.py`: implementation of cross-entropy with label smoothing (see
the sketch after this list)
* `lr_scheduler.py`: implementation of exponential learning rate warmup and
step decay
* `fp_optimizers.py`: implementation of optimizers for various floating point
precisions
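
For reference, the label smoothing loss (smoothing factor 0.1, as listed in
[Default configuration](#default-configuration)) can be written compactly. A
minimal sketch of the common formulation, not the exact code in
`smoothing.py`:

```
import torch
import torch.nn.functional as F

def smoothed_cross_entropy(logits, target, smoothing=0.1):
    # mix the one-hot target with a uniform distribution over the vocabulary
    log_probs = F.log_softmax(logits, dim=-1)
    nll = -log_probs.gather(dim=-1, index=target.unsqueeze(-1)).squeeze(-1)
    uniform = -log_probs.mean(dim=-1)   # cross-entropy against uniform targets
    return ((1.0 - smoothing) * nll + smoothing * uniform).mean()
```
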
The `seq2seq/inference` directory contains scripts required to run inference:
* `beam_search.py`: implementation of beam search with length normalization and
length penalty
* `translator.py`: implementation of auto-regressive inference
The `seq2seq/data` directory contains implementation of components needed for
data loading:
* `dataset.py`: implementation of text datasets
* `sampler.py`: implementation of batch samplers with bucketing by sequence
length (see the sketch after this list)
* `tokenizer.py`: implementation of tokenizer (maps integer vocabulary indices
to text)
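
The idea behind the bucketing sampler (selected with `--batching bucketing`,
`--num-buckets 5` by default) is to group sentences of similar length so that
batches contain fewer pad tokens. A simplified sketch of the technique,
assuming `lengths` holds per-sentence token counts; the real `sampler.py` is
more involved (shuffling across buckets, distributed sharding):

```
import random

def bucketed_batches(lengths, num_buckets=5, batch_size=128):
    # assign each sample to one of num_buckets buckets by sequence length
    max_len = max(lengths)
    buckets = [[] for _ in range(num_buckets)]
    for idx, length in enumerate(lengths):
        b = min(length * num_buckets // (max_len + 1), num_buckets - 1)
        buckets[b].append(idx)
    # draw batches from within a bucket, so lengths inside a batch are similar
    for bucket in buckets:
        random.shuffle(bucket)
        for i in range(0, len(bucket), batch_size):
            yield bucket[i:i + batch_size]
```
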
### Parameters
Training
The complete list of available parameters for the `train.py` training script
contains:
```
dataset setup:
--dataset-dir DATASET_DIR
path to the directory with training/test data
(default: data/wmt16_de_en)
--src-lang SRC_LANG source language (default: en)
--tgt-lang TGT_LANG target language (default: de)
--vocab VOCAB path to the vocabulary file (relative to DATASET_DIR
directory) (default: vocab.bpe.32000)
-bpe BPE_CODES, --bpe-codes BPE_CODES
path to the file with bpe codes (relative to
DATASET_DIR directory) (default: bpe.32000)
--train-src TRAIN_SRC
path to the training source data file (relative to
DATASET_DIR directory) (default:
train.tok.clean.bpe.32000.en)
--train-tgt TRAIN_TGT
path to the training target data file (relative to
DATASET_DIR directory) (default:
train.tok.clean.bpe.32000.de)
--val-src VAL_SRC path to the validation source data file (relative to
DATASET_DIR directory) (default:
newstest_dev.tok.clean.bpe.32000.en)
--val-tgt VAL_TGT path to the validation target data file (relative to
DATASET_DIR directory) (default:
newstest_dev.tok.clean.bpe.32000.de)
--test-src TEST_SRC path to the test source data file (relative to
DATASET_DIR directory) (default:
newstest2014.tok.bpe.32000.en)
--test-tgt TEST_TGT path to the test target data file (relative to
DATASET_DIR directory) (default: newstest2014.de)
--train-max-size TRAIN_MAX_SIZE
use at most TRAIN_MAX_SIZE elements from training
dataset (useful for benchmarking), by default uses
entire dataset (default: None)
results setup:
--save-dir SAVE_DIR path to directory with results, it will be
automatically created if it does not exist (default:
gnmt)
--print-freq PRINT_FREQ
print log every PRINT_FREQ batches (default: 10)
model setup:
--hidden-size HIDDEN_SIZE
hidden size of the model (default: 1024)
--num-layers NUM_LAYERS
number of RNN layers in encoder and in decoder
(default: 4)
--dropout DROPOUT dropout applied to input of RNN cells (default: 0.2)
--share-embedding use shared embeddings for encoder and decoder (use '--
no-share-embedding' to disable) (default: True)
--smoothing SMOOTHING
label smoothing, if equal to zero model will use
CrossEntropyLoss, if not zero model will be trained
with label smoothing loss (default: 0.1)
general setup:
--math {fp16,fp32,tf32,manual_fp16}
precision (default: fp16)
--seed SEED master seed for random number generators, if "seed" is
undefined then the master seed will be sampled from
random.SystemRandom() (default: None)
--prealloc-mode {off,once,always}
controls preallocation (default: always)
--dllog-file DLLOG_FILE
Name of the DLLogger output file (default:
train_log.json)
--eval run validation and test after every epoch (use '--no-
eval' to disable) (default: True)
--env print info about execution env (use '--no-env' to
disable) (default: True)
--cuda enables cuda (use '--no-cuda' to disable) (default:
True)
--cudnn enables cudnn (use '--no-cudnn' to disable) (default:
True)
--log-all-ranks enables logging from all distributed ranks, if
disabled then only logs from rank 0 are reported (use
'--no-log-all-ranks' to disable) (default: True)
training setup:
--train-batch-size TRAIN_BATCH_SIZE
training batch size per worker (default: 128)
--train-global-batch-size TRAIN_GLOBAL_BATCH_SIZE
global training batch size, this argument does not
have to be defined, if it is defined it will be used
to automatically compute train_iter_size using the
equation: train_iter_size = train_global_batch_size //
(train_batch_size * world_size) (default: None)
--train-iter-size N training iter size, training loop will accumulate
gradients over N iterations and execute optimizer
every N steps (default: 1)
--epochs EPOCHS max number of training epochs (default: 6)
--grad-clip GRAD_CLIP
enables gradient clipping and sets maximum norm of
gradients (default: 5.0)
--train-max-length TRAIN_MAX_LENGTH
maximum sequence length for training (including
special BOS and EOS tokens) (default: 50)
--train-min-length TRAIN_MIN_LENGTH
minimum sequence length for training (including
special BOS and EOS tokens) (default: 0)
--train-loader-workers TRAIN_LOADER_WORKERS
number of workers for training data loading (default:
2)
--batching {random,sharding,bucketing}
select batching algorithm (default: bucketing)
--shard-size SHARD_SIZE
shard size for "sharding" batching algorithm, in
multiples of global batch size (default: 80)
--num-buckets NUM_BUCKETS
number of buckets for "bucketing" batching algorithm
(default: 5)
optimizer setup:
--optimizer OPTIMIZER
training optimizer (default: Adam)
--lr LR learning rate (default: 0.002)
--optimizer-extra OPTIMIZER_EXTRA
extra options for the optimizer (default: {})
mixed precision loss scaling setup:
--init-scale INIT_SCALE
initial loss scale (default: 8192)
--upscale-interval UPSCALE_INTERVAL
loss upscaling interval (default: 128)
learning rate scheduler setup:
--warmup-steps WARMUP_STEPS
number of learning rate warmup iterations (default:
200)
--remain-steps REMAIN_STEPS
starting iteration for learning rate decay (default:
0.666)
--decay-interval DECAY_INTERVAL
interval between learning rate decay steps (default:
None)
--decay-steps DECAY_STEPS
max number of learning rate decay steps (default: 4)
--decay-factor DECAY_FACTOR
learning rate decay factor (default: 0.5)
validation setup:
--val-batch-size VAL_BATCH_SIZE
batch size for validation (default: 64)
--val-max-length VAL_MAX_LENGTH
maximum sequence length for validation (including
special BOS and EOS tokens) (default: 125)
--val-min-length VAL_MIN_LENGTH
minimum sequence length for validation (including
special BOS and EOS tokens) (default: 0)
--val-loader-workers VAL_LOADER_WORKERS
number of workers for validation data loading
(default: 0)
test setup:
--test-batch-size TEST_BATCH_SIZE
batch size for test (default: 128)
--test-max-length TEST_MAX_LENGTH
maximum sequence length for test (including special
BOS and EOS tokens) (default: 150)
--test-min-length TEST_MIN_LENGTH
minimum sequence length for test (including special
BOS and EOS tokens) (default: 0)
--beam-size BEAM_SIZE
beam size (default: 5)
--len-norm-factor LEN_NORM_FACTOR
length normalization factor (default: 0.6)
--cov-penalty-factor COV_PENALTY_FACTOR
coverage penalty factor (default: 0.1)
--len-norm-const LEN_NORM_CONST
length normalization constant (default: 5.0)
--intra-epoch-eval N evaluate within training epoch, this option will
enable extra N equally spaced evaluations executed
during each training epoch (default: 0)
--test-loader-workers TEST_LOADER_WORKERS
number of workers for test data loading (default: 0)
checkpointing setup:
--start-epoch START_EPOCH
manually set initial epoch counter (default: 0)
--resume PATH resumes training from checkpoint from PATH (default:
None)
--save-all saves checkpoint after every epoch (default: False)
--save-freq SAVE_FREQ
save checkpoint every SAVE_FREQ batches (default:
5000)
--keep-checkpoints KEEP_CHECKPOINTS
keep only last KEEP_CHECKPOINTS checkpoints, affects
only checkpoints controlled by --save-freq option
(default: 0)
benchmark setup:
--target-perf TARGET_PERF
target training performance (in tokens per second)
(default: None)
--target-bleu TARGET_BLEU
target accuracy (default: None)
```
Inference
The complete list of available parameters for the `translate.py` inference
script contains:
```
data setup:
-o OUTPUT, --output OUTPUT
full path to the output file if not specified, then
the output will be printed (default: None)
-r REFERENCE, --reference REFERENCE
full path to the file with reference translations (for
sacrebleu, raw text) (default: None)
-m MODEL, --model MODEL
full path to the model checkpoint file (default: None)
--synthetic use synthetic dataset (default: False)
--synthetic-batches SYNTHETIC_BATCHES
number of synthetic batches to generate (default: 64)
--synthetic-vocab SYNTHETIC_VOCAB
size of synthetic vocabulary (default: 32320)
--synthetic-len SYNTHETIC_LEN
sequence length of synthetic samples (default: 50)
-i INPUT, --input INPUT
full path to the input file (raw text) (default: None)
-t INPUT_TEXT [INPUT_TEXT ...], --input-text INPUT_TEXT [INPUT_TEXT ...]
raw input text (default: None)
--sort sorts dataset by sequence length (use '--no-sort' to
disable) (default: False)
inference setup:
--batch-size BATCH_SIZE [BATCH_SIZE ...]
batch size per GPU (default: [128])
--beam-size BEAM_SIZE [BEAM_SIZE ...]
beam size (default: [5])
--max-seq-len MAX_SEQ_LEN
maximum generated sequence length (default: 80)
--len-norm-factor LEN_NORM_FACTOR
length normalization factor (default: 0.6)
--cov-penalty-factor COV_PENALTY_FACTOR
coverage penalty factor (default: 0.1)
--len-norm-const LEN_NORM_CONST
length normalization constant (default: 5.0)
general setup:
--math {fp16,fp32,tf32} [{fp16,fp32,tf32} ...]
precision (default: ['fp16'])
--env print info about execution env (use '--no-env' to
disable) (default: False)
--bleu compares with reference translation and computes BLEU
(use '--no-bleu' to disable) (default: True)
--cuda enables cuda (use '--no-cuda' to disable) (default:
True)
--cudnn enables cudnn (use '--no-cudnn' to disable) (default:
True)
--batch-first uses (batch, seq, feature) data format for RNNs
(default: True)
--seq-first uses (seq, batch, feature) data format for RNNs
(default: True)
--save-dir SAVE_DIR path to directory with results, it will be
automatically created if it does not exist (default:
gnmt)
--dllog-file DLLOG_FILE
Name of the DLLogger output file (default:
eval_log.json)
--print-freq PRINT_FREQ, -p PRINT_FREQ
print log every PRINT_FREQ batches (default: 1)
benchmark setup:
--target-perf TARGET_PERF
target inference performance (in tokens per second)
(default: None)
--target-bleu TARGET_BLEU
target accuracy (default: None)
--repeat REPEAT [REPEAT ...]
loops over the dataset REPEAT times, flag accepts
multiple arguments, one for each specified batch size
(default: [1])
--warmup WARMUP warmup iterations for performance counters (default:
0)
--percentiles PERCENTILES [PERCENTILES ...]
Percentiles for confidence intervals for
throughput/latency benchmarks (default: (90, 95, 99))
--tables print accuracy, throughput and latency results in
tables (use '--no-tables' to disable) (default: False)
```
### Command-line options
To see the full list of available options and their descriptions, use the `-h`
or `--help` command line option. For example, for training:
```
python3 train.py --help
usage: train.py [-h] [--dataset-dir DATASET_DIR] [--src-lang SRC_LANG]
[--tgt-lang TGT_LANG] [--vocab VOCAB] [-bpe BPE_CODES]
[--train-src TRAIN_SRC] [--train-tgt TRAIN_TGT]
[--val-src VAL_SRC] [--val-tgt VAL_TGT] [--test-src TEST_SRC]
[--test-tgt TEST_TGT] [--save-dir SAVE_DIR]
[--print-freq PRINT_FREQ] [--hidden-size HIDDEN_SIZE]
[--num-layers NUM_LAYERS] [--dropout DROPOUT]
[--share-embedding] [--smoothing SMOOTHING]
[--math {fp16,fp32,tf32,manual_fp16}] [--seed SEED]
[--prealloc-mode {off,once,always}] [--dllog-file DLLOG_FILE]
[--eval] [--env] [--cuda] [--cudnn] [--log-all-ranks]
[--train-max-size TRAIN_MAX_SIZE]
[--train-batch-size TRAIN_BATCH_SIZE]
[--train-global-batch-size TRAIN_GLOBAL_BATCH_SIZE]
[--train-iter-size N] [--epochs EPOCHS]
[--grad-clip GRAD_CLIP] [--train-max-length TRAIN_MAX_LENGTH]
[--train-min-length TRAIN_MIN_LENGTH]
[--train-loader-workers TRAIN_LOADER_WORKERS]
[--batching {random,sharding,bucketing}]
[--shard-size SHARD_SIZE] [--num-buckets NUM_BUCKETS]
[--optimizer OPTIMIZER] [--lr LR]
[--optimizer-extra OPTIMIZER_EXTRA] [--init-scale INIT_SCALE]
[--upscale-interval UPSCALE_INTERVAL]
[--warmup-steps WARMUP_STEPS] [--remain-steps REMAIN_STEPS]
[--decay-interval DECAY_INTERVAL] [--decay-steps DECAY_STEPS]
[--decay-factor DECAY_FACTOR]
[--val-batch-size VAL_BATCH_SIZE]
[--val-max-length VAL_MAX_LENGTH]
[--val-min-length VAL_MIN_LENGTH]
[--val-loader-workers VAL_LOADER_WORKERS]
[--test-batch-size TEST_BATCH_SIZE]
[--test-max-length TEST_MAX_LENGTH]
[--test-min-length TEST_MIN_LENGTH] [--beam-size BEAM_SIZE]
[--len-norm-factor LEN_NORM_FACTOR]
[--cov-penalty-factor COV_PENALTY_FACTOR]
[--len-norm-const LEN_NORM_CONST] [--intra-epoch-eval N]
[--test-loader-workers TEST_LOADER_WORKERS]
[--start-epoch START_EPOCH] [--resume PATH] [--save-all]
[--save-freq SAVE_FREQ] [--keep-checkpoints KEEP_CHECKPOINTS]
[--target-perf TARGET_PERF] [--target-bleu TARGET_BLEU]
[--local_rank LOCAL_RANK]
```
For example, for inference:
```
python3 translate.py --help
usage: translate.py [-h] [-o OUTPUT] [-r REFERENCE] [-m MODEL] [--synthetic]
[--synthetic-batches SYNTHETIC_BATCHES]
[--synthetic-vocab SYNTHETIC_VOCAB]
[--synthetic-len SYNTHETIC_LEN]
[-i INPUT | -t INPUT_TEXT [INPUT_TEXT ...]] [--sort]
[--batch-size BATCH_SIZE [BATCH_SIZE ...]]
[--beam-size BEAM_SIZE [BEAM_SIZE ...]]
[--max-seq-len MAX_SEQ_LEN]
[--len-norm-factor LEN_NORM_FACTOR]
[--cov-penalty-factor COV_PENALTY_FACTOR]
[--len-norm-const LEN_NORM_CONST]
[--math {fp16,fp32,tf32} [{fp16,fp32,tf32} ...]] [--env]
[--bleu] [--cuda] [--cudnn] [--batch-first | --seq-first]
[--save-dir SAVE_DIR] [--dllog-file DLLOG_FILE]
[--print-freq PRINT_FREQ] [--target-perf TARGET_PERF]
[--target-bleu TARGET_BLEU] [--repeat REPEAT [REPEAT ...]]
[--warmup WARMUP]
[--percentiles PERCENTILES [PERCENTILES ...]] [--tables]
[--local_rank LOCAL_RANK]
```
### Getting the data
The GNMT v2 model was trained on the [WMT16
English-German](http://www.statmt.org/wmt16/translation-task.html) dataset.
The concatenation of the newstest2015 and newstest2016 test sets is used as
the validation dataset, and newstest2014 is used as the test dataset.
This repository contains the `scripts/wmt16_en_de.sh` download script which
automatically downloads and preprocesses the training, validation and test
datasets. By default, data is downloaded to the `data` directory.
Our download script is very similar to the `wmt16_en_de.sh` script from the
[tensorflow/nmt](https://github.com/tensorflow/nmt/blob/master/nmt/scripts/wmt16_en_de.sh)
repository; ours contains an extra preprocessing step, which discards all
sentence pairs that can't be decoded with the *latin-1* encoding.
The `scripts/wmt16_en_de.sh` script uses the
[subword-nmt](https://github.com/rsennrich/subword-nmt) package to segment text
into subword units (Byte Pair Encodings -
[BPE](https://en.wikipedia.org/wiki/Byte_pair_encoding)). By default, the
script builds the shared vocabulary of 32,000 tokens.
In order to test with other datasets, the script needs to be customized
accordingly.
#### Dataset guidelines
The process of downloading and preprocessing the data can be found in the
`scripts/wmt16_en_de.sh` script.
Initially, data is downloaded from [www.statmt.org](http://www.statmt.org).
Then the `europarl-v7`, `commoncrawl` and `news-commentary` corpora are
concatenated to form the training dataset; similarly, `newstest2015` and
`newstest2016` are concatenated to form the validation dataset. Raw data is
preprocessed with [Moses](https://github.com/moses-smt/mosesdecoder), first by
launching the [Moses
tokenizer](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl)
(the tokenizer breaks up text into individual words), then by launching
[clean-corpus-n.perl](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/training/clean-corpus-n.perl),
which removes invalid sentences and does initial filtering by sequence length.

The second stage of preprocessing is done by launching the
`scripts/filter_dataset.py` script, which discards all sentence pairs that
can't be decoded with the *latin-1* encoding.
The third stage of preprocessing uses the
[subword-nmt](https://github.com/rsennrich/subword-nmt) package. First, it
builds a shared [byte pair
encoding](https://en.wikipedia.org/wiki/Byte_pair_encoding) vocabulary with
32,000 merge operations (command `subword-nmt learn-bpe`); then it applies the
generated vocabulary to the training, validation and test corpora (command
`subword-nmt apply-bpe`).
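
The two subword-nmt commands can also be driven from Python. A small sketch,
assuming the `subword-nmt` package is installed; the file names are
illustrative, not the exact paths used by `scripts/wmt16_en_de.sh` (which
learns a single vocabulary shared by the English and German sides):

```
from subword_nmt.learn_bpe import learn_bpe
from subword_nmt.apply_bpe import BPE

# learn 32,000 BPE merge operations from a tokenized training corpus
with open('train.tok.clean.en', encoding='utf-8') as infile, \
        open('bpe.32000', 'w', encoding='utf-8') as codes:
    learn_bpe(infile, codes, num_symbols=32000)

# apply the learned codes to segment text into subword units
with open('bpe.32000', encoding='utf-8') as codes:
    bpe = BPE(codes)
print(bpe.process_line('municipalities'))   # e.g. 'municip@@ alities'
```
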
### Training process
The default training configuration can be launched by running the `train.py`
training script. By default, the training script saves only one checkpoint with
the lowest value of the loss function on the validation dataset. An evaluation
is then performed after each training epoch. Results are stored in the
`gnmt` directory.
The training script launches data-parallel training with batch size 128 per GPU
on all available GPUs. We have tested training on up to 16 GPUs on a single
node.
After each training epoch, the script runs an evaluation on the validation
dataset and outputs a BLEU score on the test dataset (newstest2014). BLEU is
computed by the [SacreBLEU](https://github.com/mjpost/sacreBLEU) package. Logs
from the training and evaluation are saved to the `gnmt` directory.
The summary after each training epoch is printed in the following format:
```
0: Summary: Epoch: 3 Training Loss: 3.1336 Validation Loss: 2.9587 Test BLEU: 23.18
0: Performance: Epoch: 3 Training: 418772 Tok/s Validation: 1445331 Tok/s
```
The training loss is averaged over an entire training epoch, the validation
loss is averaged over the validation dataset and the BLEU score is computed on
the test dataset. Performance is reported in total tokens per second. The
result is averaged over an entire training epoch and summed over all GPUs
participating in the training.
By default, the `train.py` script will launch mixed precision training with
Tensor Cores. You can change this behavior by setting:
* the `--math fp32` flag to launch single precision training (for NVIDIA Volta
and NVIDIA Turing architectures) or
* the `--math tf32` flag to launch TF32 training with Tensor Cores (for NVIDIA
Ampere architecture)
for the `train.py` training script.
To view all available options for training, run `python3 train.py --help`.
### Inference process
Inference can be run by launching the `translate.py` inference script; it
requires a pre-trained model checkpoint and tokenized input.
The inference script, `translate.py`, supports batched inference. By default,
it launches beam search with beam size of 5, coverage penalty term and length
normalization term. Greedy decoding can be enabled by setting the beam size to
1.
To view all available options for inference, run `python3 translate.py --help`.
## Performance
The performance measurements in this document were conducted at the time of
publication and may not reflect the performance achieved from NVIDIA’s latest
software release. For the most up-to-date performance measurements, go to
[NVIDIA Data Center Deep Learning Product
Performance](https://developer.nvidia.com/deep-learning-performance-training-inference).
### Benchmarking
The following section shows how to run benchmarks measuring the model
performance in training and inference modes.
#### Training performance benchmark
Training is launched on batches of text data, and different batches have
different sequence lengths (the number of tokens in the longest sequence).
Sequence length and batch efficiency (the ratio of non-pad tokens to the total
number of tokens) affect training performance; therefore, it's recommended to
run the training on a large chunk of the training dataset to get a stable and
reliable average training performance. Ideally, at least one full epoch of
training should be launched to get a good estimate of training performance.
The following commands will launch one epoch of training:
To launch mixed precision training on 1, 4 or 8 GPUs, run:
```
python3 -m torch.distributed.launch --nproc_per_node=<#GPUs> train.py --seed 2 --train-global-batch-size 1024 --epochs 1 --math fp16
```
To launch mixed precision training on 16 GPUs, run:
```
python3 -m torch.distributed.launch --nproc_per_node=16 train.py --seed 2 --train-global-batch-size 2048 --epochs 1 --math fp16
```
Change `--math fp16` to `--math fp32` to launch single precision training (for
NVIDIA Volta and NVIDIA Turing architectures) or to `--math tf32` to launch
TF32 training with Tensor Cores (for NVIDIA Ampere architecture).
After the training is completed, the `train.py` script prints a summary to
standard output. Performance results are printed in the following format:
```
(...)
0: Performance: Epoch: 0 Training: 418926 Tok/s Validation: 1430828 Tok/s
(...)
```
`Training: 418926 Tok/s` represents training throughput averaged over an entire
training epoch and summed over all GPUs participating in the training.
#### Inference performance benchmark
The inference performance and accuracy benchmarks require a checkpoint from a
fully trained model.
Command to launch the inference accuracy benchmark on NVIDIA Volta or on NVIDIA
Turing architectures:
```
python3 translate.py \
--model gnmt/model_best.pth \
--input data/wmt16_de_en/newstest2014.en \
--reference data/wmt16_de_en/newstest2014.de \
--output /tmp/output \
--math fp16 fp32 \
--batch-size 128 \
--beam-size 1 2 5 \
--tables
```
Command to launch the inference accuracy benchmark on NVIDIA Ampere architecture:
```
python3 translate.py \
--model gnmt/model_best.pth \
--input data/wmt16_de_en/newstest2014.en \
--reference data/wmt16_de_en/newstest2014.de \
--output /tmp/output \
--math fp16 tf32 \
--batch-size 128 \
--beam-size 1 2 5 \
--tables
```
Command to launch the inference throughput and latency benchmarks on NVIDIA
Volta or NVIDIA Turing architectures:
```
python3 translate.py \
--model gnmt/model_best.pth \
--input data/wmt16_de_en/newstest2014.en \
--reference data/wmt16_de_en/newstest2014.de \
--output /tmp/output \
--math fp16 fp32 \
--batch-size 1 2 4 8 32 128 512 \
--repeat 1 1 1 1 2 8 16 \
--beam-size 1 2 5 \
--warmup 5 \
--tables
```
Command to launch the inference throughput and latency benchmarks on NVIDIA
Ampere architecture:
```
python3 translate.py \
--model gnmt/model_best.pth \
--input data/wmt16_de_en/newstest2014.en \
--reference data/wmt16_de_en/newstest2014.de \
--output /tmp/output \
--math fp16 tf32 \
--batch-size 1 2 4 8 32 128 512 \
--repeat 1 1 1 1 2 8 16 \
--beam-size 1 2 5 \
--warmup 5 \
--tables
```
### Results
The following sections provide details on how we achieved our performance and
accuracy in training and inference.
#### Training accuracy results
##### Training accuracy: NVIDIA DGX A100 (8x A100 40GB)
Our results were obtained by running the `train.py` script with the default
batch size = 128 per GPU in the pytorch-20.06-py3 NGC container on NVIDIA DGX
A100 with 8x A100 40GB GPUs.
Command to launch the training:
```
python3 -m torch.distributed.launch --nproc_per_node=<#GPUs> train.py --seed 2 --train-global-batch-size 1024 --math fp16
```
Change `--math fp16` to `--math tf32` to launch TF32 training with Tensor Cores.
| **GPUs** | **Batch Size / GPU** | **Accuracy - TF32 (BLEU)** | **Accuracy - Mixed precision (BLEU)** | **Time to Train - TF32 (minutes)** | **Time to Train - Mixed precision (minutes)** | **Time to Train Speedup (TF32 to Mixed precision)** |
| --- | --- | ----- | ----- | ----- | ------ | ---- |
| 8 | 128 | 24.46 | 24.60 | 34.7 | 22.7 | 1.53 |
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
outlined above.
##### Training accuracy: NVIDIA DGX-1 (8x V100 16GB)
Our results were obtained by running the `train.py` script with the default
batch size = 128 per GPU in the pytorch-20.06-py3 NGC container on NVIDIA DGX-1
with 8x V100 16GB GPUs.
Command to launch the training:
```
python3 -m torch.distributed.launch --nproc_per_node=<#GPUs> train.py --seed 2 --train-global-batch-size 1024 --math fp16
```
Change `--math fp16` to `--math fp32` to launch single precision training.
| **GPUs** | **Batch Size / GPU** | **Accuracy - FP32 (BLEU)** | **Accuracy - Mixed precision (BLEU)** | **Time to Train - FP32 (minutes)** | **Time to Train - Mixed precision (minutes)** | **Time to Train Speedup (FP32 to Mixed precision)** |
| --- | --- | ----- | ----- | ----- | ------ | ---- |
| 1 | 128 | 24.41 | 24.42 | 810.0 | 224.0 | 3.62 |
| 4 | 128 | 24.40 | 24.33 | 218.2 | 69.5 | 3.14 |
| 8 | 128 | 24.45 | 24.38 | 112.0 | 38.6 | 2.90 |
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
outlined above.
##### Training accuracy: NVIDIA DGX-2H (16x V100 32GB)
Our results were obtained by running the `train.py` script with the default
batch size = 128 per GPU in the pytorch-20.06-py3 NGC container on NVIDIA DGX-2H
with 16x V100 32GB GPUs.
To launch mixed precision training on 16 GPUs, run:
```
python3 -m torch.distributed.launch --nproc_per_node=16 train.py --seed 2 --train-global-batch-size 2048 --math fp16
```
Change `--math fp16` to `--math fp32` to launch single precision training.
| **GPUs** | **Batch Size / GPU** | **Accuracy - FP32 (BLEU)** | **Accuracy - Mixed precision (BLEU)** | **Time to Train - FP32 (minutes)** | **Time to Train - Mixed precision (minutes)** | **Time to Train Speedup (FP32 to Mixed precision)** |
| --- | --- | ----- | ----- | ------ | ----- | ---- |
| 16 | 128 | 24.41 | 24.38 | 52.1 | 19.4 | 2.69 |
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
outlined above.
![TrainingLoss](./img/training_loss.png)
##### Training stability test
The GNMT v2 model was trained for 6 epochs, starting from 32 different initial
random seeds. After each training epoch, the model was evaluated on the test
dataset and the BLEU score was recorded. The training was performed in the
pytorch-20.06-py3 Docker container on NVIDIA DGX A100 with 8x A100 40GB GPUs.
The following table summarizes the results of the stability test: it shows the
distribution of BLEU scores after each training epoch across the 32 initial
random seeds.
| **Epoch** | **Average** | **Standard deviation** | **Minimum** | **Maximum** | **Median** |
| --- | ------ | ----- | ------ | ------ | ------ |
| 1 | 19.959 | 0.238 | 19.410 | 20.390 | 19.970 |
| 2 | 21.772 | 0.293 | 20.960 | 22.280 | 21.820 |
| 3 | 22.435 | 0.264 | 21.740 | 22.870 | 22.465 |
| 4 | 23.167 | 0.166 | 22.870 | 23.620 | 23.195 |
| 5 | 24.233 | 0.149 | 23.820 | 24.530 | 24.235 |
| 6 | 24.416 | 0.131 | 24.140 | 24.660 | 24.390 |
#### Training throughput results
##### Training throughput: NVIDIA DGX A100 (8x A100 40GB)
Our results were obtained by running the `train.py` training script in the
pytorch-20.06-py3 NGC container on NVIDIA DGX A100 with 8x A100 40GB GPUs.
Throughput performance numbers (in tokens per second) were averaged over an
entire training epoch.
| **GPUs** | **Batch size / GPU** | **Throughput - TF32 (tok/s)** | **Throughput - Mixed precision (tok/s)** | **Throughput speedup (TF32 to Mixed precision)** | **Strong Scaling - TF32** | **Strong Scaling - Mixed precision** |
| --- | --- | ------ | ------ | ----- | ----- | ----- |
| 1 | 128 | 83214 | 140909 | 1.693 | 1.000 | 1.000 |
| 4 | 128 | 278576 | 463144 | 1.663 | 3.348 | 3.287 |
| 8 | 128 | 519952 | 822024 | 1.581 | 6.248 | 5.834 |
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
outlined above.
##### Training throughput: NVIDIA DGX-1 (8x V100 16GB)
Our results were obtained by running the `train.py` training script in the
pytorch-20.06-py3 NGC container on NVIDIA DGX-1 with 8x V100 16GB GPUs.
Throughput performance numbers (in tokens per second) were averaged over an
entire training epoch.
| **GPUs** | **Batch size / GPU** | **Throughput - FP32 (tok/s)** | **Throughput - Mixed precision (tok/s)** | **Throughput speedup (FP32 to Mixed precision)** | **Strong Scaling - FP32** | **Strong Scaling - Mixed precision** |
| --- | --- | ------ | ------ | ----- | ----- | ----- |
| 1 | 128 | 21860 | 76438 | 3.497 | 1.000 | 1.000 |
| 4 | 128 | 80224 | 249168 | 3.106 | 3.670 | 3.260 |
| 8 | 128 | 154168 | 447832 | 2.905 | 7.053 | 5.859 |
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
outlined above.
##### Training throughput: NVIDIA DGX-2H (16x V100 32GB)
Our results were obtained by running the `train.py` training script in the
pytorch-20.06-py3 NGC container on NVIDIA DGX-2H with 16x V100 32GB GPUs.
Throughput performance numbers (in tokens per second) were averaged over an
entire training epoch.
| **GPUs** | **Batch size / GPU** | **Throughput - FP32 (tok/s)** | **Throughput - Mixed precision (tok/s)** | **Throughput speedup (FP32 to Mixed precision)** | **Strong Scaling - FP32** | **Strong Scaling - Mixed precision** |
| --- | --- | ------ | ------ | ----- | ------ | ------ |
| 1 | 128 | 25583 | 87829 | 3.433 | 1.000 | 1.000 |
| 4 | 128 | 91400 | 290640 | 3.180 | 3.573 | 3.309 |
| 8 | 128 | 176616 | 522008 | 2.956 | 6.904 | 5.943 |
| 16 | 128 | 351792 | 1010880 | 2.874 | 13.751 | 11.510 |
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
outlined above.
#### Inference accuracy results
##### Inference accuracy: NVIDIA A100 40GB
Our results were obtained by running the `translate.py` script in the
pytorch-20.06-py3 NGC Docker container with NVIDIA A100 40GB GPU. Full
command to launch the inference accuracy benchmark was provided in the
[Inference performance benchmark](#inference-performance-benchmark) section.
| **Batch Size** | **Beam Size** | **Accuracy - TF32 (BLEU)** | **Accuracy - FP16 (BLEU)** |
| -------------: | ------------: | -------------------------: | -------------------------: |
| 128 | 1 | 23.07 | 23.07 |
| 128 | 2 | 23.81 | 23.81 |
| 128 | 5 | 24.41 | 24.43 |
##### Inference accuracy: NVIDIA Tesla V100 16GB
Our results were obtained by running the `translate.py` script in the
pytorch-20.06-py3 NGC Docker container with NVIDIA Tesla V100 16GB GPU. Full
command to launch the inference accuracy benchmark was provided in the
[Inference performance benchmark](#inference-performance-benchmark) section.
| **Batch Size** | **Beam Size** | **Accuracy - FP32 (BLEU)** | **Accuracy - FP16 (BLEU)** |
| -------------: | ------------: | -------------------------: | -------------------------: |
| 128 | 1 | 23.07 | 23.07 |
| 128 | 2 | 23.81 | 23.79 |
| 128 | 5 | 24.40 | 24.43 |
##### Inference accuracy: NVIDIA T4
Our results were obtained by running the `translate.py` script in the
pytorch-20.06-py3 NGC Docker container with NVIDIA Tesla T4. Full command to
launch the inference accuracy benchmark was provided in the [Inference
performance benchmark](#inference-performance-benchmark) section.
| **Batch Size** | **Beam Size** | **Accuracy - FP32 (BLEU)** | **Accuracy - FP16 (BLEU)** |
| -------------: | ------------: | -------------------------: | -------------------------: |
| 128 | 1 | 23.07 | 23.08 |
| 128 | 2 | 23.81 | 23.80 |
| 128 | 5 | 24.40 | 24.39 |
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
outlined above.
#### Inference throughput results
Tables presented in this section show the average inference throughput (columns
**Avg (tok/s)**) and inference throughput for various confidence intervals
(columns **N% (tok/s)**, where `N` denotes the confidence interval). Inference
throughput is measured in tokens per second. Speedups reported in FP16
subsections are relative to FP32 (for NVIDIA Volta and NVIDIA Turing) and
relative to TF32 (for NVIDIA Ampere) numbers for the corresponding configuration.
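As a point of reference, the percentile and speedup columns can be derived from
raw per-iteration measurements along these lines (a minimal sketch with made-up
sample values; the actual benchmark harness in `translate.py` may aggregate
differently):
```
import numpy as np

def summarize(tok_per_sec):
    # The N% columns report throughput sustained in N% of iterations,
    # i.e. the (100 - N)th percentile of the per-iteration samples.
    s = np.asarray(tok_per_sec, dtype=np.float64)
    return {'avg': s.mean(),
            '90%': np.percentile(s, 10),
            '95%': np.percentile(s, 5),
            '99%': np.percentile(s, 1)}

# Hypothetical per-iteration samples (tokens/s) for one configuration:
baseline = summarize([9200.0, 9450.0, 9100.0, 9300.0, 8900.0])   # FP32/TF32
fp16 = summarize([12600.0, 13100.0, 12400.0, 12800.0, 12100.0])
speedup = {k: fp16[k] / baseline[k] for k in baseline}
print(speedup)
```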
##### Inference throughput: NVIDIA A100 40GB
Our results were obtained by running the `translate.py` script in the
pytorch-20.06-py3 NGC Docker container with NVIDIA A100 40GB.
Full command to launch the inference throughput benchmark was provided in the
[Inference performance benchmark](#inference-performance-benchmark) section.
**FP16**
|**Batch Size**|**Beam Size**|**Avg (tok/s)**|**Speedup**|**90% (tok/s)**|**Speedup**|**95% (tok/s)**|**Speedup**|**99% (tok/s)**|**Speedup**|
|-------------:|------------:|--------------:|----------:|--------------:|----------:|--------------:|----------:|--------------:|----------:|
| 1| 1| 1291.6| 1.031| 1195.7| 1.029| 1165.8| 1.029| 1104.7| 1.030|
| 1| 2| 882.7| 1.019| 803.4| 1.015| 769.2| 1.015| 696.7| 1.017|
| 1| 5| 848.3| 1.042| 753.0| 1.037| 715.0| 1.043| 636.4| 1.033|
| 2| 1| 2060.5| 1.034| 1700.8| 1.032| 1621.8| 1.032| 1487.4| 1.022|
| 2| 2| 1445.7| 1.026| 1197.6| 1.024| 1132.5| 1.023| 1043.7| 1.033|
| 2| 5| 1402.3| 1.063| 1152.4| 1.056| 1100.5| 1.053| 992.9| 1.053|
| 4| 1| 3465.6| 1.046| 2838.3| 1.040| 2672.7| 1.043| 2392.8| 1.043|
| 4| 2| 2425.4| 1.041| 2002.5| 1.028| 1898.3| 1.033| 1690.2| 1.028|
| 4| 5| 2364.4| 1.075| 1930.0| 1.067| 1822.0| 1.065| 1626.1| 1.058|
| 8| 1| 6151.1| 1.099| 5078.0| 1.087| 4786.5| 1.096| 4206.9| 1.090|
| 8| 2| 4241.9| 1.075| 3494.1| 1.066| 3293.6| 1.066| 2970.9| 1.064|
| 8| 5| 4117.7| 1.118| 3430.9| 1.103| 3224.5| 1.104| 2833.5| 1.110|
| 32| 1| 18830.4| 1.147| 16210.0| 1.152| 15563.9| 1.138| 13973.2| 1.135|
| 32| 2| 12698.2| 1.133| 10812.3| 1.114| 10256.1| 1.145| 9330.2| 1.101|
| 32| 5| 11802.6| 1.355| 9998.8| 1.318| 9671.6| 1.329| 9058.4| 1.335|
| 128| 1| 53394.5| 1.350| 48867.6| 1.342| 46898.5| 1.414| 40670.6| 1.305|
| 128| 2| 34876.4| 1.483| 31687.4| 1.491| 30025.4| 1.505| 27677.1| 1.421|
| 128| 5| 28201.3| 1.986| 25660.5| 1.997| 24306.0| 1.967| 23326.2| 2.007|
| 512| 1| 119675.3| 1.904| 112400.5| 1.971| 109694.8| 1.927| 108781.3| 1.919|
| 512| 2| 74514.7| 2.126| 69578.9| 2.209| 69348.1| 2.210| 69253.7| 2.212|
| 512| 5| 47003.2| 2.760| 43348.2| 2.893| 43080.3| 2.884| 42878.4| 2.881|
##### Inference throughput: NVIDIA T4
Our results were obtained by running the `translate.py` script in the
pytorch-20.06-py3 NGC Docker container with NVIDIA T4.
Full command to launch the inference throughput benchmark was provided in the
[Inference performance benchmark](#inference-performance-benchmark) section.
**FP16**
|**Batch Size**|**Beam Size**|**Avg (tok/s)**|**Speedup**|**90% (tok/s)**|**Speedup**|**95% (tok/s)**|**Speedup**|**99% (tok/s)**|**Speedup**|
|-------------:|------------:|--------------:|----------:|--------------:|----------:|--------------:|----------:|--------------:|----------:|
| 1| 1| 1133.8| 1.266| 1059.1| 1.253| 1036.6| 1.251| 989.5| 1.242|
| 1| 2| 793.9| 1.169| 728.3| 1.165| 698.1| 1.163| 637.1| 1.157|
| 1| 5| 766.8| 1.343| 685.6| 1.335| 649.3| 1.335| 584.1| 1.318|
| 2| 1| 1759.8| 1.233| 1461.6| 1.239| 1402.3| 1.242| 1302.1| 1.242|
| 2| 2| 1313.3| 1.186| 1088.7| 1.185| 1031.6| 1.180| 953.2| 1.178|
| 2| 5| 1257.2| 1.301| 1034.1| 1.316| 990.3| 1.313| 886.3| 1.265|
| 4| 1| 2974.0| 1.261| 2440.3| 1.255| 2294.6| 1.257| 2087.7| 1.261|
| 4| 2| 2204.7| 1.320| 1826.3| 1.283| 1718.9| 1.260| 1548.4| 1.260|
| 4| 5| 2106.1| 1.340| 1727.8| 1.345| 1625.7| 1.353| 1467.7| 1.346|
| 8| 1| 5076.6| 1.423| 4207.9| 1.367| 3904.4| 1.360| 3475.3| 1.355|
| 8| 2| 3761.7| 1.311| 3108.1| 1.285| 2931.6| 1.300| 2628.7| 1.300|
| 8| 5| 3578.2| 1.660| 2998.2| 1.614| 2812.1| 1.609| 2447.6| 1.523|
| 32| 1| 14637.8| 1.636| 12702.5| 1.644| 12070.3| 1.634| 11036.9| 1.647|
| 32| 2| 10627.3| 1.818| 9198.3| 1.818| 8431.6| 1.725| 8000.0| 1.773|
| 32| 5| 8205.7| 2.598| 7117.6| 2.476| 6825.2| 2.497| 6293.2| 2.437|
| 128| 1| 33800.5| 2.755| 30824.5| 2.816| 27685.2| 2.661| 26580.9| 2.694|
| 128| 2| 20829.4| 2.795| 18665.2| 2.778| 17372.1| 2.639| 16820.5| 2.821|
| 128| 5| 11753.9| 3.309| 10658.1| 3.273| 10308.7| 3.205| 9630.7| 3.328|
| 512| 1| 44474.6| 3.327| 40108.1| 3.394| 39816.6| 3.378| 39708.0| 3.381|
| 512| 2| 26057.9| 3.295| 23197.3| 3.294| 23019.8| 3.284| 22951.4| 3.284|
| 512| 5| 12161.5| 3.428| 10777.5| 3.418| 10733.1| 3.414| 10710.5| 3.420|
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
outlined above.
#### Inference latency results
Tables presented in this section show the average inference latency (columns **Avg
(ms)**) and inference latency for various confidence intervals (columns **N%
(ms)**, where `N` denotes the confidence interval). Inference latency is
measured in milliseconds. Speedups reported in FP16 subsections are relative to
FP32 (for NVIDIA Volta and NVIDIA Turing) and relative to TF32 (for NVIDIA
Ampere) numbers for the corresponding configuration.
##### Inference latency: NVIDIA A100 40GB
Our results were obtained by running the `translate.py` script in the
pytorch-20.06-py3 NGC Docker container with NVIDIA A100 40GB.
Full command to launch the inference latency benchmark was provided in the
[Inference performance benchmark](#inference-performance-benchmark) section.
**FP16**
|**Batch Size**|**Beam Size**|**Avg (ms)**|**Speedup**|**90% (ms)**|**Speedup**|**95% (ms)**|**Speedup**|**99% (ms)**|**Speedup**|
|-------------:|------------:|-----------:|----------:|-----------:|----------:|-----------:|----------:|-----------:|----------:|
| 1| 1| 44.69| 1.032| 74.04| 1.035| 84.61| 1.034| 99.14| 1.042|
| 1| 2| 64.76| 1.020| 105.18| 1.018| 118.92| 1.019| 139.42| 1.023|
| 1| 5| 67.06| 1.043| 107.56| 1.049| 121.82| 1.054| 143.85| 1.054|
| 2| 1| 56.57| 1.034| 85.59| 1.037| 92.55| 1.038| 107.59| 1.046|
| 2| 2| 80.22| 1.027| 119.22| 1.027| 128.43| 1.030| 150.06| 1.028|
| 2| 5| 82.54| 1.063| 121.37| 1.067| 132.35| 1.069| 156.34| 1.059|
| 4| 1| 67.29| 1.047| 92.69| 1.048| 100.08| 1.056| 112.63| 1.064|
| 4| 2| 95.86| 1.041| 129.83| 1.040| 139.48| 1.044| 162.34| 1.045|
| 4| 5| 98.34| 1.075| 133.83| 1.076| 142.70| 1.068| 168.30| 1.075|
| 8| 1| 75.60| 1.099| 97.87| 1.103| 104.13| 1.099| 117.40| 1.102|
| 8| 2| 109.38| 1.074| 137.71| 1.079| 147.69| 1.069| 168.79| 1.065|
| 8| 5| 112.71| 1.116| 143.50| 1.104| 153.17| 1.118| 172.60| 1.113|
| 32| 1| 98.40| 1.146| 117.02| 1.153| 123.42| 1.150| 129.01| 1.128|
| 32| 2| 145.87| 1.133| 171.71| 1.159| 184.01| 1.127| 188.64| 1.141|
| 32| 5| 156.82| 1.357| 189.10| 1.374| 194.95| 1.392| 196.65| 1.419|
| 128| 1| 137.97| 1.350| 150.04| 1.348| 151.52| 1.349| 154.52| 1.434|
| 128| 2| 211.58| 1.484| 232.96| 1.490| 237.46| 1.505| 239.86| 1.567|
| 128| 5| 261.44| 1.990| 288.54| 2.017| 291.63| 2.052| 298.73| 2.136|
| 512| 1| 245.93| 1.906| 262.51| 1.998| 264.24| 1.999| 265.23| 2.000|
| 512| 2| 395.61| 2.129| 428.54| 2.219| 431.58| 2.224| 433.86| 2.227|
| 512| 5| 627.21| 2.767| 691.72| 2.878| 696.01| 2.895| 702.13| 2.887|
##### Inference latency: NVIDIA T4
Our results were obtained by running the `translate.py` script in the
pytorch-20.06-py3 NGC Docker container with NVIDIA T4.
Full command to launch the inference latency benchmark was provided in the
[Inference performance benchmark](#inference-performance-benchmark) section.
**FP16**
|**Batch Size**|**Beam Size**|**Avg (ms)**|**Speedup**|**90% (ms)**|**Speedup**|**95% (ms)**|**Speedup**|**99% (ms)**|**Speedup**|
|-------------:|------------:|-----------:|----------:|-----------:|----------:|-----------:|----------:|-----------:|----------:|
| 1| 1| 51.08| 1.261| 84.82| 1.254| 97.45| 1.251| 114.6| 1.257|
| 1| 2| 72.05| 1.168| 117.41| 1.165| 132.33| 1.170| 155.8| 1.174|
| 1| 5| 74.20| 1.345| 119.45| 1.352| 135.07| 1.354| 160.3| 1.354|
| 2| 1| 66.31| 1.232| 100.90| 1.232| 108.52| 1.235| 126.9| 1.238|
| 2| 2| 88.35| 1.185| 131.47| 1.188| 141.46| 1.185| 164.7| 1.191|
| 2| 5| 92.12| 1.305| 136.30| 1.310| 148.66| 1.309| 174.8| 1.320|
| 4| 1| 78.54| 1.260| 108.53| 1.256| 117.19| 1.259| 133.7| 1.259|
| 4| 2| 105.54| 1.315| 142.74| 1.317| 154.36| 1.307| 178.7| 1.303|
| 4| 5| 110.43| 1.351| 150.62| 1.388| 161.61| 1.397| 191.2| 1.427|
| 8| 1| 91.65| 1.418| 117.92| 1.421| 126.60| 1.405| 144.0| 1.411|
| 8| 2| 123.39| 1.315| 156.00| 1.337| 167.34| 1.347| 193.4| 1.340|
| 8| 5| 129.69| 1.666| 165.01| 1.705| 178.18| 1.723| 200.3| 1.765|
| 32| 1| 126.53| 1.641| 153.23| 1.689| 159.58| 1.692| 167.0| 1.700|
| 32| 2| 174.37| 1.822| 209.04| 1.899| 219.59| 1.877| 228.6| 1.878|
| 32| 5| 226.15| 2.598| 277.38| 2.636| 290.27| 2.648| 299.4| 2.664|
| 128| 1| 218.29| 2.755| 238.94| 2.826| 243.18| 2.843| 267.1| 2.828|
| 128| 2| 354.83| 2.796| 396.63| 2.832| 410.53| 2.803| 433.2| 2.866|
| 128| 5| 628.32| 3.311| 699.57| 3.353| 723.98| 3.323| 771.0| 3.337|
| 512| 1| 663.07| 3.330| 748.62| 3.388| 753.20| 3.388| 758.0| 3.378|
| 512| 2| 1134.04| 3.295| 1297.85| 3.283| 1302.25| 3.304| 1306.9| 3.308|
| 512| 5| 2428.82| 3.428| 2771.72| 3.415| 2801.32| 3.427| 2817.6| 3.422|
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
outlined above.
## Release notes
### Changelog
* July 2020
* Added support for NVIDIA DGX A100
* Default container updated to NGC PyTorch 20.06-py3
* June 2019
* Default container updated to NGC PyTorch 19.05-py3
* Mixed precision training implemented using APEX AMP
* Added inference throughput and latency results on NVIDIA T4 and NVIDIA
Tesla V100 16GB
* Added option to run inference on user-provided raw input text from command
line
* February 2019
* Different batching algorithm (bucketing with 5 equal-width buckets)
* Additional dropouts before first LSTM layer in encoder and in decoder
* Weight initialization changed to uniform (-0.1,0.1)
* Switched order of dropout and concatenation with attention in decoder
* Default container updated to NGC PyTorch 19.01-py3
* December 2018
* Added exponential warm-up and step learning rate decay
* Multi-GPU (distributed) inference and validation
* Default container updated to NGC PyTorch 18.11-py3
* General performance improvements
* August 2018
* Initial release
### Known issues
There are no known issues in this release.
# 1. Problem
This problem uses a recurrent neural network to perform language translation.
## Requirements
* [nvidia-docker](https://github.com/NVIDIA/nvidia-docker)
* [PyTorch 20.06-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch)
* Slurm with [Pyxis](https://github.com/NVIDIA/pyxis) and [Enroot](https://github.com/NVIDIA/enroot)
# 2. Directions
## Steps to download and verify data
Download the data using the following command:
```
cd ..
bash download_dataset.sh
cd -
```
Verify data with:
```
cd ..
bash verify_dataset.sh
cd -
```
## Steps to launch training
### NVIDIA DGX A100 (single node)
Launch configuration and system-specific hyperparameters for the NVIDIA DGX A100
single node submission are in the `config_DGXA100.sh` script.
Steps required to launch single node training on NVIDIA DGX A100:
1. Build the container and push to a docker registry:
```
docker build --pull -t <docker/registry>/mlperf-nvidia:rnn_translator .
docker push <docker/registry>/mlperf-nvidia:rnn_translator
```
2. Launch the training:
```
source config_DGXA100.sh
CONT="<docker/registry>/mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
#### Alternative launch with nvidia-docker
When generating results for the official v0.7 submission with one node, the
benchmark was launched onto a cluster managed by a SLURM scheduler. The
instructions in [NVIDIA DGX A100 (single node)](#nvidia-dgx-a100-single-node) explain
how that is done.
However, to make it easier to run this benchmark on a wider set of machine
environments, we are providing here an alternate set of launch instructions
that can be run using nvidia-docker. Note that performance or functionality may
vary from the tested SLURM instructions.
```
docker build --pull -t mlperf-nvidia:rnn_translator .
source config_DGXA100.sh
CONT="mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> ./run_with_docker.sh
```
### NVIDIA DGX-2H (single node)
Launch configuration and system-specific hyperparameters for the NVIDIA DGX-2H
single node submission are in the `config_DGX2.sh` script.
Steps required to launch single node training on NVIDIA DGX-2H:
1. Build the container and push to a docker registry:
```
docker build --pull -t <docker/registry>/mlperf-nvidia:rnn_translator .
docker push <docker/registry>/mlperf-nvidia:rnn_translator
```
2. Launch the training:
```
source config_DGX2.sh
CONT="<docker/registry>/mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
#### Alternative launch with nvidia-docker
When generating results for the official v0.7 submission with one node, the
benchmark was launched onto a cluster managed by a SLURM scheduler. The
instructions in [NVIDIA DGX-2H (single node)](#nvidia-dgx-2h-single-node) explain
how that is done.
However, to make it easier to run this benchmark on a wider set of machine
environments, we are providing here an alternate set of launch instructions
that can be run using nvidia-docker. Note that performance or functionality may
vary from the tested SLURM instructions.
```
docker build --pull -t mlperf-nvidia:rnn_translator .
source config_DGX2.sh
CONT="mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> ./run_with_docker.sh
```
### NVIDIA DGX-1 (single node)
Launch configuration and system-specific hyperparameters for the NVIDIA DGX-1
single node submission are in the `config_DGX1.sh` script.
Steps required to launch single node training on NVIDIA DGX-1:
1. Build the container and push to a docker registry:
```
docker build --pull -t <docker/registry>/mlperf-nvidia:rnn_translator .
docker push <docker/registry>/mlperf-nvidia:rnn_translator
```
2. Launch the training:
```
source config_DGX1.sh
CONT="<docker/registry>/mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
#### Alternative launch with nvidia-docker
When generating results for the official v0.7 submission with one node, the
benchmark was launched onto a cluster managed by a SLURM scheduler. The
instructions in [NVIDIA DGX-1 (single node)](#nvidia-dgx-1-single-node) explain
how that is done.
However, to make it easier to run this benchmark on a wider set of machine
environments, we are providing here an alternate set of launch instructions
that can be run using nvidia-docker. Note that performance or functionality may
vary from the tested SLURM instructions.
```
docker build --pull -t mlperf-nvidia:rnn_translator .
source config_DGX1.sh
CONT="mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> ./run_with_docker.sh
```
### NVIDIA DGX A100 (multi node)
Launch configuration and system-specific hyperparameters for the NVIDIA DGX A100
multi node submission are in the following scripts:
* for the 2-node NVIDIA DGX A100 submission: `config_DGXA100_multi_2x8x192_dist.sh`
* for the 32-node NVIDIA DGX A100 submission: `config_DGXA100_multi_32x8x32_dist.sh`
* for the 128-node NVIDIA DGX A100 submission: `config_DGXA100_multi_128x8x16_dist.sh`
Steps required to launch multi node training on NVIDIA DGX A100:
1. Build the docker container and push to a docker registry
```
docker build --pull -t <docker/registry>/mlperf-nvidia:rnn_translator .
docker push <docker/registry>/mlperf-nvidia:rnn_translator
```
2. Launch the training
2-node NVIDIA DGX A100 training:
```
source config_DGXA100_multi_2x8x192_dist.sh
CONT="<docker/registry>/mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME --ntasks-per-node $DGXNGPU run.sub
```
32-node NVIDIA DGX A100 training:
```
source config_DGXA100_multi_32x8x32_dist.sh
CONT="<docker/registry>/mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME --ntasks-per-node $DGXNGPU run.sub
```
128-node NVIDIA DGX A100 training:
```
source config_DGXA100_multi_128x8x16_dist.sh
CONT="<docker/registry>/mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME --ntasks-per-node $DGXNGPU run.sub
```
### NVIDIA DGX-2H (multi node)
Launch configuration and system-specific hyperparameters for the NVIDIA DGX-2H
multi node submission are in the following scripts:
* for the 16-node NVIDIA DGX-2H submission: `config_DGX2_multi_16x16x32.sh`
* for the 64-node NVIDIA DGX-2H submission: `config_DGX2_multi_64x16x16.sh`
Steps required to launch multi node training on NVIDIA DGX-2H:
1. Build the docker container and push to a docker registry
```
docker build --pull -t <docker/registry>/mlperf-nvidia:rnn_translator .
docker push <docker/registry>/mlperf-nvidia:rnn_translator
```
2. Launch the training
16-node NVIDIA DGX-2H training:
```
source config_DGX2_multi_16x16x32.sh
CONT="<docker/registry>/mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME --ntasks-per-node $DGXNGPU run.sub
```
64-node NVIDIA DGX-2H training:
```
source config_DGX2_multi_64x16x16.sh
CONT="<docker/registry>/mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME --ntasks-per-node $DGXNGPU run.sub
```
# 3. Dataset/Environment
### Publication/Attribution
We use [WMT16 English-German](http://www.statmt.org/wmt16/translation-task.html)
for training.
### Data preprocessing
The script uses the [subword-nmt](https://github.com/rsennrich/subword-nmt)
package to segment text into subword units (BPE); by default it builds a shared
vocabulary of 32,000 tokens.
Preprocessing removes all pairs of sentences that can't be decoded with the
latin-1 encoding.
### Vocabulary
Vocabulary is generated by the following lines from the `download_dataset.sh`
script:
```
# Create vocabulary file for BPE
cat "${OUTPUT_DIR}/train.tok.clean.bpe.${merge_ops}.en" "${OUTPUT_DIR}/train.tok.clean.bpe.${merge_ops}.de" | \
${OUTPUT_DIR}/subword-nmt/get_vocab.py | cut -f1 -d ' ' > "${OUTPUT_DIR}/vocab.bpe.${merge_ops}"
```
The vocabulary is stored in the `rnn_translator/data/vocab.bpe.32000` plain
text file, one token per line (tokens are separated with a newline character).
The vocabulary file doesn't contain special tokens such as BOS
(begin-of-sentence) or EOS (end-of-sentence).
Here are the first 10 lines from the `rnn_translator/data/vocab.bpe.32000` file:
```
,
.
the
in
of
and
die
der
to
und
```
### Text datasets
The `download_dataset.sh` script automatically creates training, validation and
test datasets. Datasets are stored as plain text files. Sentences are separated
with a newline character, and tokens within each sentence are separated with a
single space character.
Training data:
* source language (English): `rnn_translator/data/train.tok.clean.bpe.32000.en`
* target language (German): `rnn_translator/data/train.tok.clean.bpe.32000.de`
Validation data:
* source language (English): `rnn_translator/data/newstest_dev.tok.clean.bpe.32000.en`
* target language (German): `rnn_translator/data/newstest_dev.tok.clean.bpe.32000.de`
Test data:
* source language (English): `rnn_translator/data/newstest2014.tok.bpe.32000.en`
* target language (German): `rnn_translator/data/newstest2014.de`
* note that the `newstest2014.de` file isn't tokenized; BLEU evaluation is
  performed by the sacrebleu package, which expects raw plain text
  (tokenization is performed internally by sacrebleu)
Here are the first 5 lines from the `rnn_translator/data/train.tok.clean.bpe.32000.en` file:
```
Res@@ um@@ ption of the session
I declare resumed the session of the European Parliament ad@@ jour@@ ned on Friday 17 December 1999 , and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant fes@@ tive period .
Although , as you will have seen , the d@@ read@@ ed &apos; millenn@@ ium bug &apos; failed to materi@@ alise , still the people in a number of countries suffered a series of natural disasters that truly were d@@ read@@ ful .
You have requested a debate on this subject in the course of the next few days , during this part-session .
In the meantime , I should like to observe a minute &apos; s silence , as a number of Members have requested , on behalf of all the victims concerned , particularly those of the terrible stor@@ ms , in the various countries of the European Union .
```
And here are the first 5 lines from the `rnn_translator/data/train.tok.clean.bpe.32000.de` file:
```
Wiederaufnahme der Sitzungsperiode
Ich erkläre die am Freitag , dem 17. Dezember unterbro@@ ch@@ ene Sitzungsperiode des Europäischen Parlaments für wieder@@ aufgenommen , wünsche Ihnen nochmals alles Gute zum Jahres@@ wechsel und hoffe , daß Sie schöne Ferien hatten .
Wie Sie feststellen konnten , ist der ge@@ für@@ ch@@ tete &quot; Mill@@ en@@ i@@ um-@@ Bu@@ g &quot; nicht eingetreten . Doch sind Bürger einiger unserer Mitgliedstaaten Opfer von schrecklichen Naturkatastrophen geworden .
Im Parlament besteht der Wunsch nach einer Aussprache im Verlauf dieser Sitzungsperiode in den nächsten Tagen .
Heute möchte ich Sie bitten - das ist auch der Wunsch einiger Kolleginnen und Kollegen - , allen Opfern der St@@ ür@@ me , insbesondere in den verschiedenen Ländern der Europäischen Union , in einer Schwei@@ ge@@ minute zu ge@@ denken .
```
### Training and test data separation
Training uses the WMT16 English-German dataset; validation is on the
concatenation of newstest2015 and newstest2016; BLEU evaluation is done on
newstest2014.
### Data filtering
Training is executed only on pairs of sentences which satisfy the following equation:
```
min_len <= src sentence sequence length <= max_len AND
min_len <= tgt sentence sequence length <= max_len
```
`min_len` is set to 0, `max_len` is set to 75. Source and target sequence
lengths include special BOS (begin-of-sentence) and EOS (end-of-sentence)
tokens.
Filtering is implemented in `pytorch/seq2seq/data/dataset.py`, class
`LazyParallelDataset`.
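A minimal sketch of the filtering predicate described above (the real check
lives in `LazyParallelDataset`; sequence lengths here already include the
special BOS and EOS tokens):
```
def keep_pair(src_len, tgt_len, min_len=0, max_len=75):
    # Keep a sentence pair only if both sides fall within [min_len, max_len];
    # lengths count the special BOS and EOS tokens.
    return min_len <= src_len <= max_len and min_len <= tgt_len <= max_len

print(keep_pair(10, 12))   # True
print(keep_pair(10, 80))   # False: target side is too long
```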
### Training data order
The training script buckets data by sequence length. The bucketing algorithm
uses 5 equal-width buckets (`num_buckets = 5`). Pairs of training sentences are
assigned to buckets by the value of
`max(src_sentence_len // bucket_width, tgt_sentence_len // bucket_width)`, where
`bucket_width = (max_len + num_buckets - 1) // num_buckets`.
Before each training epoch, batches are randomly sampled from the buckets (the
last incomplete batch is dropped for each bucket), then all batches are
reshuffled.
Bucketing is implemented in `pytorch/seq2seq/data/sampler.py`, class
`BucketingSampler`.
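The bucket assignment amounts to the following (a sketch of the formula above,
not the exact `BucketingSampler` code):
```
max_len = 75
num_buckets = 5
bucket_width = (max_len + num_buckets - 1) // num_buckets   # 15

def bucket_id(src_sentence_len, tgt_sentence_len):
    # A pair lands in the bucket of its longer side.
    return max(src_sentence_len // bucket_width,
               tgt_sentence_len // bucket_width)

print(bucket_id(12, 40))   # 2
```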
# 4. Model
### Publication/Attribution
The implemented model is similar to the one from [Google's Neural Machine
Translation System: Bridging the Gap between Human and Machine
Translation](https://arxiv.org/abs/1609.08144) paper.
The most important difference is in the attention mechanism. This repository
implements `gnmt_v2` attention: the output from the first LSTM layer of the
decoder goes into the attention module, then the re-weighted context is
concatenated with the inputs to all subsequent LSTM layers in the decoder at
the current timestep.
The same attention mechanism is also implemented in default
GNMT-like models from [tensorflow/nmt](https://github.com/tensorflow/nmt) and
[NVIDIA/OpenSeq2Seq](https://github.com/NVIDIA/OpenSeq2Seq).
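Schematically, a single decoder timestep with `gnmt_v2` attention looks like
the sketch below. It uses a plain dot-product attention as a stand-in for the
normalized Bahdanau attention and omits residual connections and dropout, so it
illustrates only the data flow, not the repository's decoder:
```
import torch

hidden, batch, src_len = 1024, 2, 7        # illustrative sizes
lstm1 = torch.nn.LSTMCell(hidden, hidden)
lstm2 = torch.nn.LSTMCell(2 * hidden, hidden)   # input is concat(h1, context)

x = torch.randn(batch, hidden)             # embedded decoder input, one step
encoder_out = torch.randn(batch, src_len, hidden)

h1, c1 = lstm1(x)                          # first decoder LSTM layer
# stand-in attention: score encoder outputs against h1 and re-weight them
scores = torch.bmm(encoder_out, h1.unsqueeze(2)).softmax(dim=1)
context = (scores * encoder_out).sum(dim=1)
# the context is concatenated with the input of every subsequent LSTM layer
h2, c2 = lstm2(torch.cat([h1, context], dim=1))
```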
### Structure
* general:
  * encoder and decoder use shared embeddings
  * data-parallel multi-gpu training
  * trained with label smoothing loss (smoothing factor 0.1)
* encoder:
  * 4-layer LSTM, hidden size 1024, first layer is bidirectional, the
    remaining layers are unidirectional
  * with residual connections starting from the 3rd LSTM layer
  * uses the standard PyTorch nn.LSTM layer
  * dropout is applied on input to all LSTM layers, probability of dropout is
    set to 0.2
  * hidden state of LSTM layers is initialized with zeros
  * weights and biases of LSTM layers are initialized with the
    uniform(-0.1, 0.1) distribution
* decoder:
  * 4-layer unidirectional LSTM with hidden size 1024 and a fully-connected
    classifier
  * with residual connections starting from the 3rd LSTM layer
  * uses the standard PyTorch nn.LSTM layer
  * dropout is applied on input to all LSTM layers, probability of dropout is
    set to 0.2
  * hidden state of LSTM layers is initialized with zeros
  * weights and biases of LSTM layers are initialized with the
    uniform(-0.1, 0.1) distribution
  * weights and biases of the fully-connected classifier are initialized with
    the uniform(-0.1, 0.1) distribution
* attention:
  * normalized Bahdanau attention
  * model uses the `gnmt_v2` attention mechanism
  * output from the first LSTM layer of the decoder goes into attention, then
    the re-weighted context is concatenated with the input to all subsequent
    LSTM layers in the decoder at the current timestep
  * linear transform of keys and queries is initialized with uniform(-0.1, 0.1),
    the normalization scalar is initialized with 1.0 / sqrt(1024), and the
    normalization bias is initialized with zero
* inference:
  * beam search with beam size of 5
  * with coverage penalty and length normalization (see the sketch after this
    list); the coverage penalty factor is set to 0.1, the length normalization
    factor to 0.6, and the length normalization constant to 5.0
  * BLEU computed by [sacrebleu](https://pypi.org/project/sacrebleu/)
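As referenced in the list above, here is a sketch of the candidate scoring rule
with these settings, following the formulas from the GNMT paper; the
repository's `SequenceGenerator` may differ in details:
```
import math

def length_penalty(length, norm_const=5.0, norm_factor=0.6):
    # GNMT-style length normalization: ((C + |Y|) / (C + 1)) ** alpha
    return ((norm_const + length) / (norm_const + 1.0)) ** norm_factor

def coverage_penalty(attn_sums, cov_factor=0.1):
    # attn_sums[j] is the total attention mass received by source token j
    return cov_factor * sum(math.log(min(p, 1.0)) for p in attn_sums)

def candidate_score(log_prob, length, attn_sums):
    return log_prob / length_penalty(length) + coverage_penalty(attn_sums)

print(candidate_score(-4.2, 12, [0.9, 1.1, 0.7, 1.3]))
```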
Implementation:
* base Seq2Seq model: `pytorch/seq2seq/models/seq2seq_base.py`, class `Seq2Seq`
* GNMT model: `pytorch/seq2seq/models/gnmt.py`, class `GNMT`
* encoder: `pytorch/seq2seq/models/encoder.py`, class `ResidualRecurrentEncoder`
* decoder: `pytorch/seq2seq/models/decoder.py`, class `ResidualRecurrentDecoder`
* attention: `pytorch/seq2seq/models/attention.py`, class `BahdanauAttention`
* inference (including BLEU evaluation and detokenization): `pytorch/seq2seq/inference/inference.py`, class `Translator`
* beam search: `pytorch/seq2seq/inference/beam_search.py`, class `SequenceGenerator`
### Loss function
Cross-entropy loss with label smoothing (smoothing factor = 0.1); padding is
not considered part of the loss.
Loss function is implemented in `pytorch/seq2seq/train/smoothing.py`, class
`LabelSmoothing`.
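A compact sketch of label-smoothed cross entropy with padding excluded
(illustrative only, not the exact `LabelSmoothing` class):
```
import torch
import torch.nn.functional as F

def smoothed_loss(logits, target, padding_idx=0, smoothing=0.1):
    # Mix the one-hot NLL with a uniform distribution over the vocabulary.
    log_probs = F.log_softmax(logits, dim=-1)
    nll = -log_probs.gather(-1, target.unsqueeze(-1)).squeeze(-1)
    uniform = -log_probs.mean(dim=-1)
    loss = (1.0 - smoothing) * nll + smoothing * uniform
    non_pad = target.ne(padding_idx)       # padding is not part of the loss
    return loss[non_pad].sum()

logits = torch.randn(6, 32000)             # (tokens, vocab)
target = torch.randint(0, 32000, (6,))
print(smoothed_loss(logits, target))
```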
### Optimizer
Adam optimizer with learning rate 1e-3, beta1 = 0.9, beta2 = 0.999, epsilon =
1e-8, and no weight decay.
The network is trained with gradient clipping; the max L2 norm of gradients is
set to 5.0.
Optimizer is implemented in `pytorch/seq2seq/train/fp_optimizers.py`, class
`Fp32Optimizer`.
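In PyTorch terms, that setup corresponds roughly to the following (a sketch
with a stand-in model; the repository wraps this in `Fp32Optimizer` for
precision handling):
```
import torch

model = torch.nn.Linear(8, 8)              # stand-in model for illustration
opt = torch.optim.Adam(model.parameters(), lr=1e-3,
                       betas=(0.9, 0.999), eps=1e-8, weight_decay=0)

loss = model(torch.randn(4, 8)).pow(2).mean()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)  # L2 clip at 5.0
opt.step()
opt.zero_grad()
```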
### Learning rate schedule
The model is trained with exponential learning rate warmup for 200 steps,
followed by step learning rate decay. Decay starts after 2/3 of the training
steps and is applied a total of 4 times at regularly spaced intervals; the
decay factor is 0.5.
Learning rate scheduler is implemented in
`pytorch/seq2seq/train/lr_scheduler.py`, class `WarmupMultiStepLR`.
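A sketch of the schedule as described, using the `REMAIN_STEPS` and
`DECAY_INTERVAL` values from the config scripts in this repository; the actual
`WarmupMultiStepLR` class may differ in details:
```
import math

def learning_rate(step, base_lr=1e-3, warmup_steps=200,
                  remain_steps=6453, decay_interval=809,
                  decay_factor=0.5, max_decay_steps=4):
    if step < warmup_steps:
        # exponential warmup from ~0.01 * base_lr up to base_lr
        warmup_factor = math.exp(math.log(0.01) / warmup_steps)
        return base_lr * warmup_factor ** (warmup_steps - step)
    if step >= remain_steps:
        # step decay: halve the rate at regularly spaced intervals, 4 times max
        num_decays = min((step - remain_steps) // decay_interval + 1,
                         max_decay_steps)
        return base_lr * decay_factor ** num_decays
    return base_lr
```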
# 5. Quality
### Quality metric
Uncased BLEU score on newstest2014 en-de dataset.
BLEU scores are reported by the [sacrebleu](https://pypi.org/project/sacrebleu/)
package (version 1.2.10). Sacrebleu is executed with the following flags:
`--score-only -lc --tokenize intl`.
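The same scoring can be reproduced from Python (assuming sacrebleu 1.2.10's
`corpus_bleu` API; the hypothesis and reference below are illustrative):
```
import sacrebleu

hypotheses = ['The cat sat on the mat .']
references = ['The cat sat on the mat .']
# Mirrors `sacrebleu --score-only -lc --tokenize intl`
bleu = sacrebleu.corpus_bleu(hypotheses, [references],
                             lowercase=True, tokenize='intl')
print(round(bleu.score, 2))
```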
### Quality target
Uncased BLEU score of 24.00.
### Evaluation frequency
Evaluation of BLEU score is done after every epoch.
### Evaluation thoroughness
Evaluation uses all of `newstest2014.en` (3003 sentences).
#! /bin/bash
set -euo pipefail
print_usage() {
cat << EOF
${0} [options] [--] COMMAND [ARG...]
Control binding policy for each task. Assumes one rank will be launched for each GPU.
Options:
--cpu=MODE
* exclusive -- bind each rank to an exclusive set of cores near its GPU
* exclusive,nosmt -- bind each rank to an exclusive set of cores near its GPU, without hyperthreading
* node -- bind each rank to all cores in the NUMA node nearest its GPU [default]
* *.sh -- bind each rank using the bash associative array bind_cpu_cores or bind_cpu_nodes from a file
* off -- don't bind
--mem=MODE
* node -- bind each rank to the nearest NUMA node [default]
* *.sh -- bind each rank using the bash associative array bind_mem from a file
* off -- don't bind
--ib=MODE
* single -- bind each rank to a single IB device near its GPU
* off -- don't bind [default]
--cluster=CLUSTER
Select which cluster is being used. May be required if system params cannot be detected.
EOF
}
################################################################################
# Argument parsing
################################################################################
cpu_mode='node'
mem_mode='node'
ib_mode='off'
cluster=''
while [ $# -gt 0 ]; do
case "$1" in
-h|--help) print_usage ; exit 0 ;;
--cpu=*) cpu_mode="${1/*=/}"; shift ;;
--cpu) cpu_mode="$2"; shift 2 ;;
--mem=*) mem_mode="${1/*=/}"; shift ;;
--mem) mem_mode="$2"; shift 2 ;;
--ib=*) ib_mode="${1/*=/}"; shift ;;
--ib) ib_mode="$2"; shift 2 ;;
--cluster=*) cluster="${1/*=/}"; shift ;;
--cluster) cluster="$2"; shift 2 ;;
--) shift; break ;;
*) break ;;
esac
done
if [ $# -lt 1 ]; then
    echo 'ERROR: no command given' >&2
print_usage
exit 1
fi
################################################################################
# Get system params
################################################################################
# LOCAL_RANK is set with an enroot hook for Pytorch containers
# SLURM_LOCALID is set by Slurm
# OMPI_COMM_WORLD_LOCAL_RANK is set by mpirun
readonly local_rank="${LOCAL_RANK:=${SLURM_LOCALID:=${OMPI_COMM_WORLD_LOCAL_RANK:-}}}"
if [ -z "${local_rank}" ]; then
echo 'ERROR: cannot read LOCAL_RANK from env' >&2
exit 1
fi
num_gpus=$(nvidia-smi -i 0 --query-gpu=count --format=csv,noheader,nounits)
if [ "${local_rank}" -ge "${num_gpus}" ]; then
echo "ERROR: local rank is ${local_rank}, but there are only ${num_gpus} gpus available" >&2
exit 1
fi
get_lscpu_value() {
awk -F: "(\$1 == \"${1}\"){gsub(/ /, \"\", \$2); print \$2; found=1} END{exit found!=1}"
}
lscpu_out=$(lscpu)
num_sockets=$(get_lscpu_value 'Socket(s)' <<< "${lscpu_out}")
num_nodes=$(get_lscpu_value 'NUMA node(s)' <<< "${lscpu_out}")
cores_per_socket=$(get_lscpu_value 'Core(s) per socket' <<< "${lscpu_out}")
echo "num_sockets = ${num_sockets} num_nodes=${num_nodes} cores_per_socket=${cores_per_socket}"
readonly cores_per_node=$(( (num_sockets * cores_per_socket) / num_nodes ))
if [ ${num_gpus} -gt 1 ]; then
readonly gpus_per_node=$(( num_gpus / num_nodes ))
else
readonly gpus_per_node=1
fi
readonly cores_per_gpu=$(( cores_per_node / gpus_per_node ))
readonly local_node=$(( local_rank / gpus_per_node ))
declare -a ibdevs=()
case "${cluster}" in
circe)
# Need to specialize for circe because IB detection is hard
ibdevs=(mlx5_1 mlx5_2 mlx5_3 mlx5_4 mlx5_7 mlx5_8 mlx5_9 mlx5_10)
;;
selene)
# Need to specialize for selene because IB detection is hard
ibdevs=(mlx5_0 mlx5_1 mlx5_2 mlx5_3 mlx5_6 mlx5_7 mlx5_8 mlx5_9)
;;
'')
if ibstat_out="$(ibstat -l 2>/dev/null | sort -V)" ; then
mapfile -t ibdevs <<< "${ibstat_out}"
fi
;;
*)
echo "ERROR: Unknown cluster '${cluster}'" >&2
exit 1
;;
esac
readonly num_ibdevs="${#ibdevs[@]}"
################################################################################
# Setup for exec
################################################################################
declare -a numactl_args=()
case "${cpu_mode}" in
exclusive)
numactl_args+=( "$(printf -- "--physcpubind=%u-%u,%u-%u" \
$(( local_rank * cores_per_gpu )) \
$(( (local_rank + 1) * cores_per_gpu - 1 )) \
$(( local_rank * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) )) \
$(( (local_rank + 1) * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) - 1 )) \
)" )
;;
exclusive,nosmt)
numactl_args+=( "$(printf -- "--physcpubind=%u-%u" \
$(( local_rank * cores_per_gpu )) \
$(( (local_rank + 1) * cores_per_gpu - 1 )) \
)" )
;;
node)
numactl_args+=( "--cpunodebind=${local_node}" )
;;
*.sh)
source "${cpu_mode}"
if [ -n "${bind_cpu_cores:-}" ]; then
numactl_args+=( "--physcpubind=${bind_cpu_cores[${local_rank}]}" )
elif [ -n "${bind_cpu_nodes:-}" ]; then
numactl_args+=( "--cpunodebind=${bind_cpu_nodes[${local_rank}]}" )
else
echo "ERROR: invalid CPU affinity file ${cpu_mode}." >&2
exit 1
fi
;;
off|'')
;;
*)
echo "ERROR: invalid cpu mode '${cpu_mode}'" 2>&1
print_usage
exit 1
;;
esac
case "${mem_mode}" in
node)
numactl_args+=( "--membind=${local_node}" )
;;
*.sh)
source "${mem_mode}"
if [ -z "${bind_mem:-}" ]; then
echo "ERROR: invalid memory affinity file ${mem_mode}." >&2
exit 1
fi
numactl_args+=( "--membind=${bind_mem[${local_rank}]}" )
;;
off|'')
;;
*)
echo "ERROR: invalid mem mode '${mem_mode}'" 2>&1
print_usage
exit 1
;;
esac
case "${ib_mode}" in
single)
if [ "${num_ibdevs}" -eq 0 ]; then
echo "WARNING: used '$0 --ib=single', but there are 0 IB devices available; skipping IB binding." 2>&1
else
readonly ibdev="${ibdevs[$(( local_rank * num_ibdevs / num_gpus ))]}"
export OMPI_MCA_btl_openib_if_include="${OMPI_MCA_btl_openib_if_include-$ibdev}"
export UCX_NET_DEVICES="${UCX_NET_DEVICES-$ibdev:1}"
fi
;;
off|'')
;;
*)
echo "ERROR: invalid ib mode '${ib_mode}'" 2>&1
print_usage
exit 1
;;
esac
################################################################################
# Exec
################################################################################
if [ "${#numactl_args[@]}" -gt 0 ] ; then
set -x
exec numactl "${numactl_args[@]}" -- "${@}"
else
exec "${@}"
fi
import sys
import subprocess
import os
import socket
from argparse import ArgumentParser, REMAINDER
import torch
def parse_args():
"""
Helper function parsing the command line options
@retval ArgumentParser
"""
    parser = ArgumentParser(description="PyTorch distributed training launch "
                                        "helper utility that will spawn up "
                                        "multiple distributed processes")
# Optional arguments for the launch helper
parser.add_argument("--nnodes", type=int, default=1,
help="The number of nodes to use for distributed "
"training")
parser.add_argument("--node_rank", type=int, default=0,
help="The rank of the node for multi-node distributed "
"training")
parser.add_argument("--nproc_per_node", type=int, default=1,
help="The number of processes to launch on each node, "
"for GPU training, this is recommended to be set "
"to the number of GPUs in your system so that "
"each process can be bound to a single GPU.")
parser.add_argument("--master_addr", default="127.0.0.1", type=str,
help="Master node (rank 0)'s address, should be either "
"the IP address or the hostname of node 0, for "
"single node multi-proc training, the "
"--master_addr can simply be 127.0.0.1")
parser.add_argument("--master_port", default=29500, type=int,
help="Master node (rank 0)'s free port that needs to "
"be used for communciation during distributed "
"training")
parser.add_argument('--no_hyperthreads', action='store_true',
help='Flag to disable binding to hyperthreads')
parser.add_argument('--no_membind', action='store_true',
help='Flag to disable memory binding')
# non-optional arguments for binding
parser.add_argument("--nsockets_per_node", type=int, required=True,
help="Number of CPU sockets on a node")
parser.add_argument("--ncores_per_socket", type=int, required=True,
help="Number of CPU cores per socket")
# positional
parser.add_argument("training_script", type=str,
help="The full path to the single GPU training "
"program/script to be launched in parallel, "
"followed by all the arguments for the "
"training script")
# rest from the training program
parser.add_argument('training_script_args', nargs=REMAINDER)
return parser.parse_args()
def main():
args = parse_args()
    # variables for numactl binding
NSOCKETS = args.nsockets_per_node
NGPUS_PER_SOCKET = (args.nproc_per_node // args.nsockets_per_node) + (1 if (args.nproc_per_node % args.nsockets_per_node) else 0)
NCORES_PER_GPU = args.ncores_per_socket // NGPUS_PER_SOCKET
# world size in terms of number of processes
dist_world_size = args.nproc_per_node * args.nnodes
# set PyTorch distributed related environmental variables
current_env = os.environ.copy()
current_env["MASTER_ADDR"] = args.master_addr
current_env["MASTER_PORT"] = str(args.master_port)
current_env["WORLD_SIZE"] = str(dist_world_size)
processes = []
for local_rank in range(0, args.nproc_per_node):
# each process's rank
dist_rank = args.nproc_per_node * args.node_rank + local_rank
current_env["RANK"] = str(dist_rank)
        # form numactl binding command
cpu_ranges = [local_rank * NCORES_PER_GPU,
(local_rank + 1) * NCORES_PER_GPU - 1,
local_rank * NCORES_PER_GPU + (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS),
(local_rank + 1) * NCORES_PER_GPU + (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS) - 1]
numactlargs = []
if args.no_hyperthreads:
numactlargs += [ "--physcpubind={}-{}".format(*cpu_ranges[0:2]) ]
else:
numactlargs += [ "--physcpubind={}-{},{}-{}".format(*cpu_ranges) ]
if not args.no_membind:
memnode = local_rank // NGPUS_PER_SOCKET
numactlargs += [ "--membind={}".format(memnode) ]
# spawn the processes
cmd = [ "/usr/bin/numactl" ] \
+ numactlargs \
+ [ sys.executable,
"-u",
args.training_script,
"--local_rank={}".format(local_rank)
] \
+ args.training_script_args
print("cmd: ",cmd)
process = subprocess.Popen(cmd, env=current_env)
processes.append(process)
for process in processes:
process.wait()
if __name__ == "__main__":
main()
## System run parms
export DGXNNODES=1
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
export WALLTIME=${WALLTIME:-"00:30:00"}
## DL params
export LR=${LR:-"2.0e-3"}
export TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-64}
export TEST_BATCH_SIZE=${TEST_BATCH_SIZE:-64}
export WARMUP_STEPS=${WARMUP_STEPS:-200}
export REMAIN_STEPS=${REMAIN_STEPS:-6453}
export DECAY_INTERVAL=${DECAY_INTERVAL:-809}
export TARGET=${TARGET:-24.0}
export MAX_SEQ_LEN=${MAX_SEQ_LEN:-75}
export NUMEPOCHS=${NUMEPOCHS:-15}
export MATH=${MATH:-fp32}
#export DIST_OPTS=${DIST_OPTS-"\
# --distributed-weight-update 2 \
# --dwu-num-blocks 1 \
# --dwu-num-chunks 2 \
# --dwu-num-rs-pg 2 \
# --dwu-num-ar-pg 2 \
# --dwu-num-ag-pg 0 \
# --dwu-grad-norm \
# "}
export DIST_OPTS=${DIST_OPTS-"\
--dwu-num-blocks 1 \
--dwu-num-chunks 2 \
--dwu-num-rs-pg 2 \
--dwu-num-ar-pg 2 \
--dwu-num-ag-pg 0 \
--dwu-grad-norm \
"}
export EXTRA_OPTS=${EXTRA_OPTS-"\
--fused-attention \
--fused-xentropy \
--no-log-all-ranks \
"}
## System config params
export DGXNGPU=4
export DGXSOCKETCORES=8
export DGXHT=1 # HT on = 2, HT off = 1
export DGXNSOCKET=4
#export DGXNGPU=1
#export DGXSOCKETCORES=8
#export DGXHT=1 # HT is on is 2, HT off is 1
#export DGXNSOCKET=1
## System run parms
export DGXNNODES=1
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
export WALLTIME=${WALLTIME:-"00:30:00"}
export LR=${LR:-"2.0e-3"}
export TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-256}
export TEST_BATCH_SIZE=${TEST_BATCH_SIZE:-128}
export WARMUP_STEPS=${WARMUP_STEPS:-200}
export REMAIN_STEPS=${REMAIN_STEPS:-6453}
export DECAY_INTERVAL=${DECAY_INTERVAL:-809}
export TARGET=${TARGET:-24.0}
export MAX_SEQ_LEN=${MAX_SEQ_LEN:-75}
export NUMEPOCHS=${NUMEPOCHS:-8}
export MATH=${MATH:-fp16}
export DIST_OPTS=${DIST_OPTS-"\
--distributed-weight-update 2 \
--dwu-num-blocks 1 \
--dwu-num-chunks 2 \
--dwu-num-rs-pg 2 \
--dwu-num-ar-pg 2 \
--dwu-num-ag-pg 0 \
--dwu-grad-norm \
"}
export EXTRA_OPTS=${EXTRA_OPTS-"\
--fused-attention \
--fused-xentropy \
--no-log-all-ranks \
"}
## System config params
export DGXNGPU=4
export DGXSOCKETCORES=8
export DGXHT=1 # HT on = 2, HT off = 1
export DGXNSOCKET=4
## System run parms
export DGXNNODES=2
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
export WALLTIME=${WALLTIME:-"00:30:00"}
## DL params
#export LR=${LR:-"2.0e-3"}
#export TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-64}
#export TEST_BATCH_SIZE=${TEST_BATCH_SIZE:-64}
#export WARMUP_STEPS=${WARMUP_STEPS:-200}
#export REMAIN_STEPS=${REMAIN_STEPS:-6453}
#export DECAY_INTERVAL=${DECAY_INTERVAL:-809}
#export TARGET=${TARGET:-24.0}
#export MAX_SEQ_LEN=${MAX_SEQ_LEN:-75}
#export NUMEPOCHS=${NUMEPOCHS:-15}
#export MATH=${MATH:-fp32}
#export DIST_OPTS=${DIST_OPTS-"\
# --distributed-weight-update 2 \
# --dwu-num-blocks 1 \
# --dwu-num-chunks 2 \
# --dwu-num-rs-pg 2 \
# --dwu-num-ar-pg 2 \
# --dwu-num-ag-pg 0 \
# --dwu-grad-norm \
# "}
#export EXTRA_OPTS=${EXTRA_OPTS-"\
# --fused-attention \
# --fused-xentropy \
# --no-log-all-ranks \
# "}
## System config params
export DGXNGPU=4
export DGXSOCKETCORES=8
export DGXHT=2 # HT on = 2, HT off = 1
export DGXNSOCKET=4
#export DGXNGPU=1
#export DGXSOCKETCORES=8
#export DGXHT=1 # HT is on is 2, HT off is 1
#export DGXNSOCKET=1
import collections
import os
import subprocess
import torch
from mlperf_logging.mllog import constants
from seq2seq.utils import configure_logger, log_event
def mlperf_submission_log(benchmark):
num_nodes = os.environ.get('SLURM_NNODES', 1)
if int(num_nodes) > 1:
torch.distributed.init_process_group(backend='nccl', init_method='env://')
configure_logger(benchmark)
log_event(
key=constants.SUBMISSION_BENCHMARK,
value=benchmark,
)
log_event(
key=constants.SUBMISSION_ORG,
value='NVIDIA')
log_event(
key=constants.SUBMISSION_DIVISION,
value='closed')
log_event(
key=constants.SUBMISSION_STATUS,
value='onprem')
log_event(
key=constants.SUBMISSION_PLATFORM,
value=f'{num_nodes}xSUBMISSION_PLATFORM_PLACEHOLDER')
import logging
import time
import os
import argparse
import torch
from torch.utils.data import DataLoader
from seq2seq.data.tokenizer import Tokenizer
import seq2seq.data.config as config
import seq2seq.utils as utils
from seq2seq.data.dataset import LazyParallelDataset
from seq2seq.data.dataset import PreprocessedDataset
def parse_args():
parser = argparse.ArgumentParser(
description='GNMT prepare data',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--dataset-dir', default='data/wmt16_de_en',
help='path to the directory with training/test data')
parser.add_argument('--preproc-data-dir', default='/tmp/preprocessed',
help='path to the directory with preprocessed \
training/test data')
parser.add_argument('--max-size', default=None, type=int,
help='use at most MAX_SIZE elements from training \
dataset (useful for benchmarking), by default \
uses entire dataset')
parser.add_argument('--math', default='fp32',
choices=['fp32', 'fp16'],
help='arithmetic type')
parser.add_argument('--max-length-train', default=50, type=int,
help='maximum sequence length for training \
(including special BOS and EOS tokens)')
parser.add_argument('--min-length-train', default=0, type=int,
help='minimum sequence length for training \
(including special BOS and EOS tokens)')
parser.add_argument('--rank', default=0, type=int,
help='global rank of the process, do not set!')
parser.add_argument('--local_rank', default=os.getenv('LOCAL_RANK', 0), type=int,
help='local rank of the process, do not set!')
args = parser.parse_args()
return args
def build_collate_fn(max_seq_len, parallel=True):
def collate_seq(seq):
lengths = torch.tensor([len(s) for s in seq])
batch_length = max_seq_len
shape = (len(seq), batch_length)
seq_tensor = torch.full(shape, config.PAD, dtype=torch.int64)
for i, s in enumerate(seq):
end_seq = lengths[i]
seq_tensor[i, :end_seq].copy_(s[:end_seq])
return (seq_tensor, lengths)
def parallel_collate(seqs):
src_seqs, tgt_seqs = zip(*seqs)
return tuple([collate_seq(s) for s in [src_seqs, tgt_seqs]])
return parallel_collate
def load_dataset(tokenizer, args):
train_data = LazyParallelDataset(
src_fname=os.path.join(args.dataset_dir, config.SRC_TRAIN_FNAME),
tgt_fname=os.path.join(args.dataset_dir, config.TGT_TRAIN_FNAME),
tokenizer=tokenizer,
min_len=args.min_length_train,
max_len=args.max_length_train,
sort=False,
max_size=args.max_size)
collate_fn = build_collate_fn(max_seq_len=args.max_length_train,
parallel=True)
loader = DataLoader(train_data,
batch_size=1024,
collate_fn=collate_fn,
num_workers=min(os.cpu_count(), 16),
timeout=120,
drop_last=False)
srcs = []
tgts = []
src_lengths = []
tgt_lengths = []
for (src, src_len), (tgt, tgt_len) in loader:
src_lengths.append(src_len)
tgt_lengths.append(tgt_len)
srcs.append(src)
tgts.append(tgt)
srcs = torch.cat(srcs)
tgts = torch.cat(tgts)
src_lengths = torch.cat(src_lengths)
tgt_lengths = torch.cat(tgt_lengths)
return srcs, tgts, src_lengths, tgt_lengths
def broadcast_dataset(world_size, rank, max_length_train, srcs, tgts,
src_lengths, tgt_lengths):
assert world_size > 1
# Broadcast preprocessed dataset length
if rank == 0:
sizes = torch.tensor(src_lengths.shape, device='cuda',
dtype=torch.int64)
else:
sizes = torch.zeros((1,), device='cuda', dtype=torch.int64)
torch.distributed.broadcast(sizes, 0)
nsamples = sizes.item()
    # Prepare tensor for receiving the preprocessed dataset
if rank == 0:
srcs_cuda, tgts_cuda, src_lengths_cuda, tgt_lengths_cuda = \
srcs.cuda(), tgts.cuda(), src_lengths.cuda(), tgt_lengths.cuda()
else:
srcs_cuda = torch.empty((nsamples, max_length_train),
device='cuda', dtype=torch.int64)
tgts_cuda = torch.empty((nsamples, max_length_train),
device='cuda', dtype=torch.int64)
src_lengths_cuda = torch.empty((nsamples,), device='cuda',
dtype=torch.int64)
tgt_lengths_cuda = torch.empty((nsamples,), device='cuda',
dtype=torch.int64)
# Broadcast preprocessed dataset
torch.distributed.broadcast(srcs_cuda, 0)
torch.distributed.broadcast(tgts_cuda, 0)
torch.distributed.broadcast(src_lengths_cuda, 0)
torch.distributed.broadcast(tgt_lengths_cuda, 0)
if rank > 0:
srcs, tgts, src_lengths, tgt_lengths = srcs_cuda.cpu(), \
tgts_cuda.cpu(), src_lengths_cuda.cpu(), tgt_lengths_cuda.cpu()
return srcs, tgts, src_lengths, tgt_lengths
def main():
args = parse_args()
use_cuda = True
device = utils.set_device(use_cuda, args.local_rank)
distributed = utils.init_distributed(use_cuda)
rank = utils.get_rank()
world_size = utils.get_world_size()
utils.setup_logging()
logging.info(f'Run arguments: {args}')
pad_vocab = utils.pad_vocabulary(args.math)
tokenizer = Tokenizer(os.path.join(args.dataset_dir, config.VOCAB_FNAME),
pad_vocab)
# Pre-process dataset only on master node
if rank == 0:
srcs, tgts, src_lengths, tgt_lengths = load_dataset(tokenizer, args)
else:
srcs, tgts, src_lengths, tgt_lengths = None, None, None, None
time.sleep(30)
# Broadcast preprocessed dataset to other ranks
if world_size > 1:
srcs, tgts, src_lengths, tgt_lengths = broadcast_dataset(
world_size, rank, args.max_length_train,
srcs, tgts, src_lengths, tgt_lengths)
preproc_train_data = PreprocessedDataset(
min_len=args.min_length_train,
max_len=args.max_length_train,
vocab_size=tokenizer.vocab_size,
)
os.makedirs(args.preproc_data_dir, exist_ok=True)
preproc_train_data.write_data(
os.path.join(args.preproc_data_dir, 'training.bin'),
(srcs, src_lengths),
(tgts, tgt_lengths),
)
if __name__ == "__main__":
main()
pytablewriter==0.64.0
sacrebleu==1.2.10
sacremoses==0.0.19
pynvml==8.0.4
#git+https://github.com/rsennrich/subword-nmt.git@48ba99e657591c329e0003f0c6e32e493fa959ef
#!/bin/bash
# for multi-node training
source `pwd`/config_DGX1_multi.sh
set -e
# start timing
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"
export NCCL_DEBUG=${NCCL_DEBUG:-"WARN"}
# run benchmark
set -x
DATASET_DIR='../wmt16_de_en/'
PREPROC_DATADIR='./preproc_data'
RESULTS_DIR='gnmt_wmt16'
## DL params
export LR=${LR:-"2.0e-3"}
export TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-64}
export TEST_BATCH_SIZE=${TEST_BATCH_SIZE:-64}
export WARMUP_STEPS=${WARMUP_STEPS:-200}
export REMAIN_STEPS=${REMAIN_STEPS:-6453}
export DECAY_INTERVAL=${DECAY_INTERVAL:-809}
export TARGET=${TARGET:-24.0}
export MAX_SEQ_LEN=${MAX_SEQ_LEN:-75}
export NUMEPOCHS=${NUMEPOCHS:-20}
export MATH=${MATH:-fp32}
export DIST_OPTS=${DIST_OPTS-"\
--distributed-weight-update 2 \
--dwu-num-blocks 1 \
--dwu-num-chunks 2 \
--dwu-num-rs-pg 2 \
--dwu-num-ar-pg 2 \
--dwu-num-ag-pg 0 \
--dwu-grad-norm \
"}
export EXTRA_OPTS=${EXTRA_OPTS-"\
--fused-attention \
--fused-xentropy \
--no-log-all-ranks \
"}
declare -a CMD
echo "running benchmark"
CMD_ARGS=("--save ${RESULTS_DIR}" "--dataset-dir ${DATASET_DIR}" "--preproc-data-dir ${PREPROC_DATADIR}/${MAX_SEQ_LEN}" "--target-bleu $TARGET" "--epochs "${NUMEPOCHS}"" "--math ${MATH}" "--max-length-train ${MAX_SEQ_LEN}" "--print-freq 10" "--train-batch-size $TRAIN_BATCH_SIZE" "--test-batch-size $TEST_BATCH_SIZE" "--optimizer FusedAdam" "--lr $LR" "--warmup-steps $WARMUP_STEPS" "--remain-steps $REMAIN_STEPS" "--decay-interval $DECAY_INTERVAL")
hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
#rm `pwd`/hostfile-dl -f
cat ${hostfile} > `pwd`/tmp
dist_url=`sed -n '1p' ./tmp`
#echo $dist_url
rank=0
num_lines=`cat ./tmp |wc -l`
for((i=0;i<$num_lines-1;i++))
do
((rank=$i+1))
nodename=$(cat ./tmp |sed -n "${rank}p")
ssh ${nodename} "cd `pwd` && module rm compiler/rocm/2.9 && source /public/home/aiss/Pytorch/env_torch1.5_miopen24.sh && python3 -m bind_launch --nnodes=$num_lines --node_rank=$i --master_addr=${dist_url} --master_port=4567 --nsockets_per_node ${DGXNSOCKET} --ncores_per_socket ${DGXSOCKETCORES} --nproc_per_node $SLURM_NTASKS_PER_NODE --no_hyperthreads `pwd`/train.py ${CMD_ARGS[@]}" &
done
((i=$num_lines-1))
nodename=$(cat ./tmp |sed -n "${num_lines}p")
ssh ${nodename} "cd `pwd` && module rm compiler/rocm/2.9 && source /public/home/aiss/Pytorch/env_torch1.5_miopen24.sh && python3 -m bind_launch --nnodes=$num_lines --node_rank=$i --master_addr=${dist_url} --master_port=4567 --nsockets_per_node ${DGXNSOCKET} --ncores_per_socket ${DGXSOCKETCORES} --nproc_per_node $SLURM_NTASKS_PER_NODE --no_hyperthreads `pwd`/train.py ${CMD_ARGS[@]}"
set +x
sleep 3
# end timing
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
echo "ENDING TIMING RUN AT $end_fmt"
# report result
result=$(( $end - $start ))
result_name="RNN_TRANSLATOR"
echo "RESULT,$result_name,,$result,nvidia,$start_fmt"
#!/bin/bash
# for single-node training
source `pwd`/config_DGX1.sh
set -e
# start timing
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"
export NCCL_DEBUG=${NCCL_DEBUG:-"WARN"}
# run benchmark
set -x
DATASET_DIR='../wmt16_de_en/'
PREPROC_DATADIR='./preproc_data'
RESULTS_DIR='gnmt_wmt16'
DIST_OPTS=${DIST_OPTS:-""}
EXTRA_OPTS=${EXTRA_OPTS:-""}
declare -a CMD
CMD=( 'python3' '-u' '-m' 'bind_launch' "--nsockets_per_node=${DGXNSOCKET}" \
"--ncores_per_socket=${DGXSOCKETCORES}" "--nproc_per_node=${DGXNGPU}" "--no_hyperthreads")
echo "running benchmark"
#for 1 node fp32 training
"${CMD[@]}" train.py \
--save ${RESULTS_DIR} \
--dataset-dir ${DATASET_DIR} \
--preproc-data-dir ${PREPROC_DATADIR}/${MAX_SEQ_LEN} \
--target-bleu $TARGET \
--epochs "${NUMEPOCHS}" \
--math ${MATH} \
--max-length-train ${MAX_SEQ_LEN} \
--print-freq 10 \
--train-batch-size $TRAIN_BATCH_SIZE \
--test-batch-size $TEST_BATCH_SIZE \
--optimizer Adam \
--lr $LR \
--warmup-steps $WARMUP_STEPS \
--remain-steps $REMAIN_STEPS \
--decay-interval $DECAY_INTERVAL \
$DIST_OPTS \
$EXTRA_OPTS ; ret_code=$?
set +x
sleep 3
if [[ $ret_code != 0 ]]; then exit $ret_code; fi
# end timing
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
echo "ENDING TIMING RUN AT $end_fmt"
# report result
result=$(( $end - $start ))
result_name="RNN_TRANSLATOR"
echo "RESULT,$result_name,,$result,nvidia,$start_fmt"
#!/bin/bash
source `pwd`/config_DGX1.sh
set -e
# start timing
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"
export NCCL_DEBUG=${NCCL_DEBUG:-"WARN"}
# run benchmark
set -x
DATASET_DIR='../wmt16_de_en/'
PREPROC_DATADIR='./preproc_data'
RESULTS_DIR='gnmt_wmt16'
DIST_OPTS=${DIST_OPTS:-""}
EXTRA_OPTS=${EXTRA_OPTS:-""}
declare -a CMD
CMD=( 'python3' '-u' '-m' 'bind_launch' "--nsockets_per_node=${DGXNSOCKET}" \
"--ncores_per_socket=${DGXSOCKETCORES}" "--nproc_per_node=${DGXNGPU}" "--no_hyperthreads")
echo "running benchmark"
# run training
#for 1 card fp32 training
HIP_VISIBLE_DEVICES=0 python3 train.py \
--save ${RESULTS_DIR} \
--dataset-dir ${DATASET_DIR} \
--preproc-data-dir ${PREPROC_DATADIR}/${MAX_SEQ_LEN} \
--target-bleu $TARGET \
--epochs "${NUMEPOCHS}" \
--math ${MATH} \
--max-length-train ${MAX_SEQ_LEN} \
--print-freq 10 \
--train-batch-size $TRAIN_BATCH_SIZE \
--test-batch-size $TEST_BATCH_SIZE \
--optimizer Adam \
--lr $LR \
--warmup-steps $WARMUP_STEPS \
--remain-steps $REMAIN_STEPS \
--decay-interval $DECAY_INTERVAL \
$EXTRA_OPTS ; ret_code=$?
set +x
sleep 3
if [[ $ret_code != 0 ]]; then exit $ret_code; fi
# end timing
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
echo "ENDING TIMING RUN AT $end_fmt"
# report result
result=$(( $end - $start ))
result_name="RNN_TRANSLATOR"
echo "RESULT,$result_name,,$result,nvidia,$start_fmt"
@@ -20,9 +20,7 @@
 import argparse
 from collections import Counter
-import sys
-import importlib
-importlib.reload(sys)
 def parse_args():
     parser = argparse.ArgumentParser(description='Clean dataset')
@@ -32,8 +30,7 @@ def parse_args():
 def save_output(fname, data):
-    #with open(fname, 'w') as f:
-    with open(fname, 'w', encoding='utf-8') as f:
+    with open(fname, 'w') as f:
         f.writelines(data)
@@ -74,8 +71,7 @@ def main():
     data1 = []
     data2 = []
-    #with open(args.file1) as f1, open(args.file2) as f2:
-    with open(args.file1, 'r', encoding='utf-8') as f1, open(args.file2, 'r', encoding='utf-8') as f2:
+    with open(args.file1) as f1, open(args.file2) as f2:
         for idx, lines in enumerate(zip(f1, f2)):
             line1, line2 = lines
             if idx % 100000 == 1: