Commit 01bc05b7 authored by Pan,Huiwen

update GNMT-v2

parent 20291e9d
# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3
FROM ${FROM_IMAGE_NAME}

ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8

RUN pip install --global-option="--cpp_ext" --global-option="--cuda_ext" git+https://github.com/NVIDIA/apex.git#egg=apex

WORKDIR /workspace/gnmt

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install --no-cache-dir git+https://github.com/NVIDIA/dllogger.git#egg=dllogger

ADD . /workspace/gnmt
MIT License

Copyright (c) 2017 Elad Hoffer
Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......
# GNMT v2 For PyTorch

This repository provides a script and recipe to train the GNMT v2 model to
achieve state of the art accuracy, and is tested and maintained by NVIDIA.

## Table Of Contents

<!-- TOC GFM -->

* [Model overview](#model-overview)
  * [Model architecture](#model-architecture)
  * [Default configuration](#default-configuration)
  * [Feature support matrix](#feature-support-matrix)
    * [Features](#features)
  * [Mixed precision training](#mixed-precision-training)
    * [Enabling mixed precision](#enabling-mixed-precision)
    * [Enabling TF32](#enabling-tf32)
* [Setup](#setup)
  * [Requirements](#requirements)
* [Quick Start Guide](#quick-start-guide)
* [Advanced](#advanced)
  * [Scripts and sample code](#scripts-and-sample-code)
  * [Parameters](#parameters)
  * [Command-line options](#command-line-options)
  * [Getting the data](#getting-the-data)
    * [Dataset guidelines](#dataset-guidelines)
  * [Training process](#training-process)
  * [Inference process](#inference-process)
* [Performance](#performance)
  * [Benchmarking](#benchmarking)
    * [Training performance benchmark](#training-performance-benchmark)
    * [Inference performance benchmark](#inference-performance-benchmark)
  * [Results](#results)
    * [Training accuracy results](#training-accuracy-results)
      * [Training accuracy: NVIDIA DGX A100 (8x A100 40GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-40gb)
      * [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb)
      * [Training accuracy: NVIDIA DGX-2H (16x V100 32GB)](#training-accuracy-nvidia-dgx-2h-16x-v100-32gb)
      * [Training stability test](#training-stability-test)
    * [Training throughput results](#training-throughput-results)
      * [Training throughput: NVIDIA DGX A100 (8x A100 40GB)](#training-throughput-nvidia-dgx-a100-8x-a100-40gb)
      * [Training throughput: NVIDIA DGX-1 (8x V100 16GB)](#training-throughput-nvidia-dgx-1-8x-v100-16gb)
      * [Training throughput: NVIDIA DGX-2H (16x V100 32GB)](#training-throughput-nvidia-dgx-2h-16x-v100-32gb)
    * [Inference accuracy results](#inference-accuracy-results)
      * [Inference accuracy: NVIDIA A100 40GB](#inference-accuracy-nvidia-a100-40gb)
      * [Inference accuracy: NVIDIA Tesla V100 16GB](#inference-accuracy-nvidia-tesla-v100-16gb)
      * [Inference accuracy: NVIDIA T4](#inference-accuracy-nvidia-t4)
    * [Inference throughput results](#inference-throughput-results)
      * [Inference throughput: NVIDIA A100 40GB](#inference-throughput-nvidia-a100-40gb)
      * [Inference throughput: NVIDIA T4](#inference-throughput-nvidia-t4)
    * [Inference latency results](#inference-latency-results)
      * [Inference latency: NVIDIA A100 40GB](#inference-latency-nvidia-a100-40gb)
      * [Inference latency: NVIDIA T4](#inference-latency-nvidia-t4)
* [Release notes](#release-notes)
  * [Changelog](#changelog)
  * [Known issues](#known-issues)

<!-- /TOC -->

## Model overview

The GNMT v2 model is similar to the one discussed in the [Google's Neural
Machine Translation System: Bridging the Gap between Human and Machine
Translation](https://arxiv.org/abs/1609.08144) paper.

The most important difference between the two models is in the attention
mechanism. In our model, the output from the first LSTM layer of the decoder
goes into the attention module, then the re-weighted context is concatenated
with inputs to all subsequent LSTM layers in the decoder at the current
time step.

The same attention mechanism is also implemented in the default GNMT-like
models from [TensorFlow Neural Machine Translation
Tutorial](https://github.com/tensorflow/nmt) and [NVIDIA OpenSeq2Seq
Toolkit](https://github.com/NVIDIA/OpenSeq2Seq).
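
To make the wiring concrete, here is a minimal sketch of the data flow
described above. It is illustrative only: `dot_attention` is a plain
dot-product stand-in for the model's normalized Bahdanau attention, and the
class names and shapes are hypothetical, not the actual modules in
`seq2seq/model`:

```
import torch
import torch.nn as nn

def dot_attention(query, keys):
    # plain dot-product attention, a stand-in for the model's
    # normalized Bahdanau attention
    scores = torch.bmm(query, keys.transpose(1, 2))   # (batch, tgt_len, src_len)
    weights = torch.softmax(scores, dim=-1)
    return torch.bmm(weights, keys)                   # (batch, tgt_len, hidden)

class GNMTv2DecoderSketch(nn.Module):
    """Illustrative wiring of gnmt_v2 attention, not the repository's decoder."""

    def __init__(self, hidden=1024, num_layers=4):
        super().__init__()
        self.first_lstm = nn.LSTM(hidden, hidden, batch_first=True)
        # upper layers consume their input concatenated with the attention context
        self.upper_lstms = nn.ModuleList(
            nn.LSTM(2 * hidden, hidden, batch_first=True)
            for _ in range(num_layers - 1)
        )

    def forward(self, x, encoder_out):
        h, _ = self.first_lstm(x)                 # 1) first decoder LSTM layer
        context = dot_attention(h, encoder_out)   # 2) its output queries attention
        for i, lstm in enumerate(self.upper_lstms):
            # 3) re-weighted context is concatenated with every upper layer's input
            out, _ = lstm(torch.cat((h, context), dim=2))
            h = out + h if i > 0 else out         # residuals start from the 3rd layer
        return h
```
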
### Model architecture

![ModelArchitecture](./img/diagram.png)

### Default configuration
The following features were implemented in this model:
* general:
  * encoder and decoder are using shared embeddings
  * data-parallel multi-GPU training
  * dynamic loss scaling with backoff for Tensor Cores (mixed precision)
    training
  * trained with label smoothing loss (smoothing factor 0.1)
* encoder:
  * 4-layer LSTM, hidden size 1024, first layer is bidirectional, the rest are
    unidirectional
  * with residual connections starting from 3rd layer
  * uses standard PyTorch nn.LSTM layer
  * dropout is applied on input to all LSTM layers, probability of dropout is
    set to 0.2
  * hidden state of LSTM layers is initialized with zeros
  * weights and bias of LSTM layers is initialized with uniform(-0.1,0.1)
    distribution
* decoder:
  * 4-layer unidirectional LSTM with hidden size 1024 and fully-connected
    classifier
  * with residual connections starting from 3rd layer
  * uses standard PyTorch nn.LSTM layer
  * dropout is applied on input to all LSTM layers, probability of dropout is
    set to 0.2
  * hidden state of LSTM layers is initialized with zeros
  * weights and bias of LSTM layers is initialized with uniform(-0.1,0.1)
    distribution
  * weights and bias of fully-connected classifier is initialized with
    uniform(-0.1,0.1) distribution
* attention:
  * normalized Bahdanau attention
  * output from first LSTM layer of decoder goes into attention, then
    re-weighted context is concatenated with the input to all subsequent LSTM
    layers of the decoder at the current timestep
  * linear transform of keys and queries is initialized with uniform(-0.1,0.1),
    normalization scalar is initialized with 1.0/sqrt(1024), normalization
    bias is initialized with zero
* inference:
  * beam search with default beam size of 5
  * with coverage penalty and length normalization, coverage penalty factor is
    set to 0.1, length normalization factor is set to 0.6 and length
    normalization constant is set to 5.0 (see the worked example after this
    list)
  * de-tokenized BLEU computed by
    [SacreBLEU](https://github.com/mjpost/sacrebleu)
  * [motivation](https://github.com/mjpost/sacrebleu#motivation) for choosing
    SacreBLEU
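
As an illustration of the length normalization above, the following is a small
sketch of the GNMT-style length penalty from the
[paper](https://arxiv.org/abs/1609.08144), plugged with the defaults listed
here (constant 5.0, factor 0.6). It is a simplified stand-in for the scoring
in `seq2seq/inference/beam_search.py` (the coverage penalty is omitted), not a
copy of it:

```
def length_penalty(length, norm_const=5.0, norm_factor=0.6):
    # GNMT-style length normalization:
    # lp(Y) = ((norm_const + |Y|) / (norm_const + 1)) ** norm_factor
    return ((norm_const + length) / (norm_const + 1.0)) ** norm_factor

# Candidate scores are divided by the penalty, so longer hypotheses are
# not unfairly dominated by shorter ones:
log_prob = -4.2                                # hypothetical sum of token log-probs
score = log_prob / length_penalty(length=12)   # ≈ -2.2484
```
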
When comparing the BLEU score, there are various tokenization approaches and
BLEU calculation methodologies; therefore, ensure you align similar metrics.
Code from this repository can be used to train a larger, 8-layer GNMT v2 model.
Our experiments show that a 4-layer model is significantly faster to train and
yields comparable accuracy on the public [WMT16
English-German](http://www.statmt.org/wmt16/translation-task.html) dataset. The
number of LSTM layers is controlled by the `--num-layers` parameter in the
`train.py` training script.
### Feature support matrix
The following features are supported by this model.
| **Feature** | **GNMT v2** |
|:------------|------------:|
|[Apex AMP](https://nvidia.github.io/apex/amp.html) | Yes |
|[Apex DistributedDataParallel](https://nvidia.github.io/apex/parallel.html#apex.parallel.DistributedDataParallel) | Yes |
#### Features
[Apex AMP](https://nvidia.github.io/apex/amp.html) - a tool that enables Tensor
Core-accelerated training. Refer to the [Enabling mixed
precision](#enabling-mixed-precision) section for more details.
[Apex
DistributedDataParallel](https://nvidia.github.io/apex/parallel.html#apex.parallel.DistributedDataParallel) -
a module wrapper that enables easy multiprocess distributed data parallel
training, similar to
[torch.nn.parallel.DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel).
`DistributedDataParallel` is optimized for use with
[NCCL](https://github.com/NVIDIA/nccl). It achieves high performance by
overlapping communication with computation during `backward()` and bucketing
smaller gradient transfers to reduce the total number of transfers required.
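
As a brief illustration of the feature above, a model can be wrapped in Apex
`DistributedDataParallel` in a few lines; a minimal sketch, assuming one
process per GPU with the process group environment set up by
`torch.distributed.launch` and a placeholder network:

```
import torch
import torch.nn as nn
from apex.parallel import DistributedDataParallel as DDP

# one process per GPU; torch.distributed.launch provides the rank and
# world-size environment variables consumed here
torch.distributed.init_process_group(backend='nccl')

model = nn.Linear(1024, 1024).cuda()   # placeholder network
model = DDP(model)                     # gradients are all-reduced across
                                       # ranks during backward()
```
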
### Mixed precision training
Mixed precision is the combined use of different numerical precisions in a
computational method.
[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant
computational speedup by performing operations in half-precision format, while
storing minimal information in single-precision to retain as much information
as possible in critical parts of the network. Since the introduction of [Tensor
Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with
both the Turing and Ampere architectures, significant training speedups are
experienced by switching to mixed precision -- up to 3x overall speedup on the
most arithmetically intense model architectures. Using mixed precision training
previously required two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Manually adding loss scaling to preserve small gradient values.
The ability to train deep learning networks with lower precision was introduced
in the Pascal architecture and first supported in [CUDA
8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep
Learning SDK.
For information about:
* How to train using mixed precision, see the [Mixed Precision
Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed
Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html)
documentation.
* Techniques used for mixed precision training, see the [Mixed-Precision
Training of Deep Neural
Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/)
blog.
* APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy
Mixed-Precision Training in
PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/)
blog.
#### Enabling mixed precision
Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision
(AMP) library from [APEX](https://github.com/NVIDIA/apex), which casts variables
to half-precision upon retrieval, while storing variables in single-precision
format. Furthermore, to preserve small gradient magnitudes in backpropagation,
a [loss
scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling)
step must be included when applying gradients. In PyTorch, loss scaling can be
easily applied by using the `scale_loss()` method provided by AMP. The scaling
value to be used can be
[dynamic](https://nvidia.github.io/apex/amp.html#apex.amp.initialize) or fixed.
For an in-depth walkthrough of AMP, check out the sample usage
[here](https://nvidia.github.io/apex/amp.html#).
[APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains
utility libraries, such as AMP, which require minimal network code changes to
leverage Tensor Cores performance.
The following steps were needed to enable mixed precision training in GNMT:
* Import AMP from APEX (file: `seq2seq/train/trainer.py`):
```
from apex import amp
```
* Initialize AMP and wrap the model and the optimizer (file:
`seq2seq/train/trainer.py`, class: `Seq2SeqTrainer`):
```
self.model, self.optimizer = amp.initialize(
self.model,
self.optimizer,
cast_model_outputs=torch.float16,
keep_batchnorm_fp32=False,
opt_level='O2')
```
* Apply `scale_loss` context manager (file: `seq2seq/train/fp_optimizers.py`,
class: `AMPOptimizer`):
```
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
```
* Apply gradient clipping on single precision master weights (file:
`seq2seq/train/fp_optimizers.py`, class: `AMPOptimizer`):
```
if self.grad_clip != float('inf'):
clip_grad_norm_(amp.master_params(optimizer), self.grad_clip)
```
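
Putting the steps above together, a single training iteration with AMP looks
roughly as follows. This is a condensed, hypothetical sketch of the pattern
shown above, with a placeholder network, batch and loss, not the repository's
actual trainer loop:

```
import torch
from apex import amp
from torch.nn.utils import clip_grad_norm_

model = torch.nn.Linear(1024, 1024).cuda()        # placeholder network
optimizer = torch.optim.Adam(model.parameters(), lr=2e-3)
grad_clip = 5.0                                   # matches the --grad-clip default

model, optimizer = amp.initialize(
    model, optimizer, opt_level='O2',
    cast_model_outputs=torch.float16, keep_batchnorm_fp32=False)

src = torch.randn(128, 1024).cuda()               # stand-in for a real batch
loss = model(src).float().pow(2).mean()           # stand-in for the real loss

optimizer.zero_grad()
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()                        # backward on the scaled loss
if grad_clip != float('inf'):
    clip_grad_norm_(amp.master_params(optimizer), grad_clip)
optimizer.step()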
#### Enabling TF32
TensorFloat-32 (TF32) is the new math mode in [NVIDIA
A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the
matrix math, also called tensor operations. TF32 running on Tensor Cores in A100
GPUs can provide up to 10x speedups compared to single-precision floating-point
math (FP32) on Volta GPUs.
TF32 Tensor Cores can speed up networks using FP32, typically with no loss of
accuracy. It is more robust than FP16 for models which require high dynamic
range for weights or activations.
For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates
AI Training, HPC up to
20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/)
blog post.
TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by
default.
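
TF32 behavior can also be inspected or toggled from Python. A minimal sketch,
assuming a PyTorch build that exposes the TF32 switches (PyTorch 1.7 and
later, and NGC containers with Ampere support); defaults vary by PyTorch
version:

```
import torch

torch.backends.cuda.matmul.allow_tf32 = True   # FP32 matmuls may run in TF32
torch.backends.cudnn.allow_tf32 = True         # cuDNN convolutions may use TF32

# set both flags to False to force strict FP32 arithmetic
```
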
## Setup
The following section lists the requirements in order to start training the
GNMT v2 model.
### Requirements
This repository contains `Dockerfile` which extends the PyTorch NGC container
and encapsulates some dependencies. Aside from these dependencies, ensure you
have the following components:
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
* [PyTorch 20.06-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch)
* GPU architecture:
* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
* [NVIDIA Turing](https://www.nvidia.com/en-us/geforce/turing/)
* [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
For more information about how to get started with NGC containers, see the
following sections from the NVIDIA GPU Cloud Documentation and the Deep
Learning DGX Documentation:
* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html),
* [Accessing And Pulling From The NGC container registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry),
* [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running).
For those unable to use the PyTorch NGC container, to set up the required
environment or create your own container, see the versioned [NVIDIA Container
Support
Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
## Quick Start Guide
To train your model using mixed or TF32 precision with Tensor Cores or using
FP32, perform the following steps using the default parameters of the GNMT v2
model on the WMT16 English-German dataset. For the specifics concerning
training and inference, see the [Advanced](#advanced) section.
**1. Clone the repository.**
```
git clone https://github.com/NVIDIA/DeepLearningExamples
cd DeepLearningExamples/PyTorch/Translation/GNMT
```
**2. Build the GNMT v2 Docker container.**
```
bash scripts/docker/build.sh
```
**3. Start an interactive session in the container to run training/inference.**
```
bash scripts/docker/interactive.sh
```
**4. Download and preprocess the dataset.**
Data will be downloaded to the `data` directory (on the host). The `data`
directory is mounted to the `/workspace/gnmt/data` location in the Docker
container.
```
bash scripts/wmt16_en_de.sh
```
**5. Start training.**
The training script saves only one checkpoint with the lowest value of the loss
function on the validation dataset. All results and logs are saved to the
`gnmt` directory (on the host) or to the `/workspace/gnmt/gnmt` directory
(in the container). By default, the `train.py` script will launch mixed
precision training with Tensor Cores. You can change this behavior by setting:
* the `--math fp32` flag to launch single precision training (for NVIDIA Volta
and NVIDIA Turing architectures) or
* the `--math tf32` flag to launch TF32 training with Tensor Cores (for NVIDIA
Ampere architecture)
for the `train.py` training script.
To launch mixed precision training on 1, 4 or 8 GPUs, run:
```
python3 -m torch.distributed.launch --nproc_per_node=<#GPUs> train.py --seed 2 --train-global-batch-size 1024
```
To launch mixed precision training on 16 GPUs, run:
```
python3 -m torch.distributed.launch --nproc_per_node=16 train.py --seed 2 --train-global-batch-size 2048
```
By default, the training script will launch training with batch size 128 per
GPU. If `--train-global-batch-size` is specified and is larger than 128 times
the number of GPUs available for the training, then the training script will
accumulate gradients over consecutive iterations and then perform the weight
update. For example, 1 GPU training with `--train-global-batch-size 1024` will
accumulate gradients over 8 iterations before doing the weight update with
accumulated gradients.
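
The number of accumulation steps follows the formula given in the
`--train-global-batch-size` help text; a quick check of the example above:

```
# train_iter_size = train_global_batch_size // (train_batch_size * world_size)
train_global_batch_size = 1024
train_batch_size = 128   # per-GPU batch size
world_size = 1           # number of GPUs

train_iter_size = train_global_batch_size // (train_batch_size * world_size)
print(train_iter_size)   # 8 -> gradients are accumulated over 8 iterations
```
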
**6. Start evaluation.**
The training process automatically runs evaluation and outputs the BLEU score
after each training epoch. Additionally, after the training is done, you can
manually run inference on the test dataset with the checkpoint saved during the
training.
To launch FP16 inference on the `newstest2014.en` test set, run:
```
python3 translate.py \
--input data/wmt16_de_en/newstest2014.en \
--reference data/wmt16_de_en/newstest2014.de \
--output /tmp/output \
--model gnmt/model_best.pth
```
The script will load the checkpoint specified by the `--model` option, then it
will launch inference on the file specified by the `--input` option, and
compute BLEU score against the reference translation specified by the
`--reference` option. Outputs will be stored to the location specified by the
`--output` option.
Additionally, one can pass the input text directly from the command-line:
```
python3 translate.py \
--input-text "The quick brown fox jumps over the lazy dog" \
--model gnmt/model_best.pth
```
Translated output will be printed to the console:
```
(...)
0: Translated output:
Der schnelle braune Fuchs springt über den faulen Hund
```
By default, the `translate.py` script will launch FP16 inference with Tensor
Cores. You can change this behavior by setting:
* the `--math fp32` flag to launch single precision inference (for NVIDIA Volta
and NVIDIA Turing architectures) or
* the `--math tf32` flag to launch TF32 inference with Tensor Cores (for NVIDIA
Ampere architecture)
for the `translate.py` inference script.
## Advanced
The following sections provide greater details of the dataset, running training
and inference, and the training results.
### Scripts and sample code
In the `root` directory, the most important files are:
* `train.py`: serves as the entry point to launch the training
* `translate.py`: serves as the entry point to launch inference
* `Dockerfile`: container with the basic set of dependencies to run GNMT v2
* `requirements.txt`: set of extra requirements for running GNMT v2
The `seq2seq/model` directory contains the implementation of GNMT v2 building
blocks:
* `attention.py`: implementation of normalized Bahdanau attention
* `encoder.py`: implementation of recurrent encoder
* `decoder.py`: implementation of recurrent decoder with attention
* `seq2seq_base.py`: base class for seq2seq models
* `gnmt.py`: implementation of GNMT v2 model
The `seq2seq/train` directory encapsulates the necessary tools to execute
training:
* `trainer.py`: implementation of training loop
* `smoothing.py`: implementation of cross-entropy with label smoothing (see
the sketch after this list)
* `lr_scheduler.py`: implementation of exponential learning rate warmup and
step decay
* `fp_optimizers.py`: implementation of optimizers for various floating point
precisions
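
For reference, the label smoothing loss (smoothing factor 0.1, as listed in
[Default configuration](#default-configuration)) can be written compactly. A
minimal sketch of the common formulation, not the exact code in
`smoothing.py`:

```
import torch
import torch.nn.functional as F

def smoothed_cross_entropy(logits, target, smoothing=0.1):
    # mix the one-hot target with a uniform distribution over the vocabulary
    log_probs = F.log_softmax(logits, dim=-1)
    nll = -log_probs.gather(dim=-1, index=target.unsqueeze(-1)).squeeze(-1)
    uniform = -log_probs.mean(dim=-1)   # cross-entropy against uniform targets
    return ((1.0 - smoothing) * nll + smoothing * uniform).mean()
```
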
The `seq2seq/inference` directory contains scripts required to run inference:
* `beam_search.py`: implementation of beam search with length normalization and
length penalty
* `translator.py`: implementation of auto-regressive inference
The `seq2seq/data` directory contains implementation of components needed for
data loading:
* `dataset.py`: implementation of text datasets
* `sampler.py`: implementation of batch samplers with bucketing by sequence
length (see the sketch after this list)
* `tokenizer.py`: implementation of tokenizer (maps integer vocabulary indices
to text)
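
The idea behind the bucketing sampler (selected with `--batching bucketing`,
`--num-buckets 5` by default) is to group sentences of similar length so that
batches contain fewer pad tokens. A simplified sketch of the technique,
assuming `lengths` holds per-sentence token counts; the real `sampler.py` is
more involved (shuffling across buckets, distributed sharding):

```
import random

def bucketed_batches(lengths, num_buckets=5, batch_size=128):
    # assign each sample to one of num_buckets buckets by sequence length
    max_len = max(lengths)
    buckets = [[] for _ in range(num_buckets)]
    for idx, length in enumerate(lengths):
        b = min(length * num_buckets // (max_len + 1), num_buckets - 1)
        buckets[b].append(idx)
    # draw batches from within a bucket, so lengths inside a batch are similar
    for bucket in buckets:
        random.shuffle(bucket)
        for i in range(0, len(bucket), batch_size):
            yield bucket[i:i + batch_size]
```
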
### Parameters
Training
The complete list of available parameters for the `train.py` training script
contains:
```
dataset setup:
--dataset-dir DATASET_DIR
path to the directory with training/test data
(default: data/wmt16_de_en)
--src-lang SRC_LANG source language (default: en)
--tgt-lang TGT_LANG target language (default: de)
--vocab VOCAB path to the vocabulary file (relative to DATASET_DIR
directory) (default: vocab.bpe.32000)
-bpe BPE_CODES, --bpe-codes BPE_CODES
path to the file with bpe codes (relative to
DATASET_DIR directory) (default: bpe.32000)
--train-src TRAIN_SRC
path to the training source data file (relative to
DATASET_DIR directory) (default:
train.tok.clean.bpe.32000.en)
--train-tgt TRAIN_TGT
path to the training target data file (relative to
DATASET_DIR directory) (default:
train.tok.clean.bpe.32000.de)
--val-src VAL_SRC path to the validation source data file (relative to
DATASET_DIR directory) (default:
newstest_dev.tok.clean.bpe.32000.en)
--val-tgt VAL_TGT path to the validation target data file (relative to
DATASET_DIR directory) (default:
newstest_dev.tok.clean.bpe.32000.de)
--test-src TEST_SRC path to the test source data file (relative to
DATASET_DIR directory) (default:
newstest2014.tok.bpe.32000.en)
--test-tgt TEST_TGT path to the test target data file (relative to
DATASET_DIR directory) (default: newstest2014.de)
--train-max-size TRAIN_MAX_SIZE
use at most TRAIN_MAX_SIZE elements from training
dataset (useful for benchmarking), by default uses
entire dataset (default: None)
results setup:
--save-dir SAVE_DIR path to directory with results, it will be
automatically created if it does not exist (default:
gnmt)
--print-freq PRINT_FREQ
print log every PRINT_FREQ batches (default: 10)
model setup:
--hidden-size HIDDEN_SIZE
hidden size of the model (default: 1024)
--num-layers NUM_LAYERS
number of RNN layers in encoder and in decoder
(default: 4)
--dropout DROPOUT dropout applied to input of RNN cells (default: 0.2)
--share-embedding use shared embeddings for encoder and decoder (use '--
no-share-embedding' to disable) (default: True)
--smoothing SMOOTHING
label smoothing, if equal to zero model will use
CrossEntropyLoss, if not zero model will be trained
with label smoothing loss (default: 0.1)
general setup:
--math {fp16,fp32,tf32,manual_fp16}
precision (default: fp16)
--seed SEED master seed for random number generators, if "seed" is
undefined then the master seed will be sampled from
random.SystemRandom() (default: None)
--prealloc-mode {off,once,always}
controls preallocation (default: always)
--dllog-file DLLOG_FILE
Name of the DLLogger output file (default:
train_log.json)
--eval run validation and test after every epoch (use '--no-
eval' to disable) (default: True)
--env print info about execution env (use '--no-env' to
disable) (default: True)
--cuda enables cuda (use '--no-cuda' to disable) (default:
True)
--cudnn enables cudnn (use '--no-cudnn' to disable) (default:
True)
--log-all-ranks enables logging from all distributed ranks, if
disabled then only logs from rank 0 are reported (use
'--no-log-all-ranks' to disable) (default: True)
training setup:
--train-batch-size TRAIN_BATCH_SIZE
training batch size per worker (default: 128)
--train-global-batch-size TRAIN_GLOBAL_BATCH_SIZE
global training batch size, this argument does not
have to be defined, if it is defined it will be used
to automatically compute train_iter_size using the
equation: train_iter_size = train_global_batch_size //
(train_batch_size * world_size) (default: None)
--train-iter-size N training iter size, training loop will accumulate
gradients over N iterations and execute optimizer
every N steps (default: 1)
--epochs EPOCHS max number of training epochs (default: 6)
--grad-clip GRAD_CLIP
enables gradient clipping and sets maximum norm of
gradients (default: 5.0)
--train-max-length TRAIN_MAX_LENGTH
maximum sequence length for training (including
special BOS and EOS tokens) (default: 50)
--train-min-length TRAIN_MIN_LENGTH
minimum sequence length for training (including
special BOS and EOS tokens) (default: 0)
--train-loader-workers TRAIN_LOADER_WORKERS
number of workers for training data loading (default:
2)
--batching {random,sharding,bucketing}
select batching algorithm (default: bucketing)
--shard-size SHARD_SIZE
shard size for "sharding" batching algorithm, in
multiples of global batch size (default: 80)
--num-buckets NUM_BUCKETS
number of buckets for "bucketing" batching algorithm
(default: 5)
optimizer setup:
--optimizer OPTIMIZER
training optimizer (default: Adam)
--lr LR learning rate (default: 0.002)
--optimizer-extra OPTIMIZER_EXTRA
extra options for the optimizer (default: {})
mixed precision loss scaling setup:
--init-scale INIT_SCALE
initial loss scale (default: 8192)
--upscale-interval UPSCALE_INTERVAL
loss upscaling interval (default: 128)
learning rate scheduler setup:
--warmup-steps WARMUP_STEPS
number of learning rate warmup iterations (default:
200)
--remain-steps REMAIN_STEPS
starting iteration for learning rate decay (default:
0.666)
--decay-interval DECAY_INTERVAL
interval between learning rate decay steps (default:
None)
--decay-steps DECAY_STEPS
max number of learning rate decay steps (default: 4)
--decay-factor DECAY_FACTOR
learning rate decay factor (default: 0.5)
validation setup:
--val-batch-size VAL_BATCH_SIZE
batch size for validation (default: 64)
--val-max-length VAL_MAX_LENGTH
maximum sequence length for validation (including
special BOS and EOS tokens) (default: 125)
--val-min-length VAL_MIN_LENGTH
minimum sequence length for validation (including
special BOS and EOS tokens) (default: 0)
--val-loader-workers VAL_LOADER_WORKERS
number of workers for validation data loading
(default: 0)
test setup:
--test-batch-size TEST_BATCH_SIZE
batch size for test (default: 128)
--test-max-length TEST_MAX_LENGTH
maximum sequence length for test (including special
BOS and EOS tokens) (default: 150)
--test-min-length TEST_MIN_LENGTH
minimum sequence length for test (including special
BOS and EOS tokens) (default: 0)
--beam-size BEAM_SIZE
beam size (default: 5)
--len-norm-factor LEN_NORM_FACTOR
length normalization factor (default: 0.6)
--cov-penalty-factor COV_PENALTY_FACTOR
coverage penalty factor (default: 0.1)
--len-norm-const LEN_NORM_CONST
length normalization constant (default: 5.0)
--intra-epoch-eval N evaluate within training epoch, this option will
enable extra N equally spaced evaluations executed
during each training epoch (default: 0)
--test-loader-workers TEST_LOADER_WORKERS
number of workers for test data loading (default: 0)
checkpointing setup:
--start-epoch START_EPOCH
manually set initial epoch counter (default: 0)
--resume PATH resumes training from checkpoint from PATH (default:
None)
--save-all saves checkpoint after every epoch (default: False)
--save-freq SAVE_FREQ
save checkpoint every SAVE_FREQ batches (default:
5000)
--keep-checkpoints KEEP_CHECKPOINTS
keep only last KEEP_CHECKPOINTS checkpoints, affects
only checkpoints controlled by --save-freq option
(default: 0)
benchmark setup:
--target-perf TARGET_PERF
target training performance (in tokens per second)
(default: None)
--target-bleu TARGET_BLEU
target accuracy (default: None)
```
Inference
The complete list of available parameters for the `translate.py` inference
script contains:
```
data setup:
-o OUTPUT, --output OUTPUT
full path to the output file if not specified, then
the output will be printed (default: None)
-r REFERENCE, --reference REFERENCE
full path to the file with reference translations (for
sacrebleu, raw text) (default: None)
-m MODEL, --model MODEL
full path to the model checkpoint file (default: None)
--synthetic use synthetic dataset (default: False)
--synthetic-batches SYNTHETIC_BATCHES
number of synthetic batches to generate (default: 64)
--synthetic-vocab SYNTHETIC_VOCAB
size of synthetic vocabulary (default: 32320)
--synthetic-len SYNTHETIC_LEN
sequence length of synthetic samples (default: 50)
-i INPUT, --input INPUT
full path to the input file (raw text) (default: None)
-t INPUT_TEXT [INPUT_TEXT ...], --input-text INPUT_TEXT [INPUT_TEXT ...]
raw input text (default: None)
--sort sorts dataset by sequence length (use '--no-sort' to
disable) (default: False)
inference setup:
--batch-size BATCH_SIZE [BATCH_SIZE ...]
batch size per GPU (default: [128])
--beam-size BEAM_SIZE [BEAM_SIZE ...]
beam size (default: [5])
--max-seq-len MAX_SEQ_LEN
maximum generated sequence length (default: 80)
--len-norm-factor LEN_NORM_FACTOR
length normalization factor (default: 0.6)
--cov-penalty-factor COV_PENALTY_FACTOR
coverage penalty factor (default: 0.1)
--len-norm-const LEN_NORM_CONST
length normalization constant (default: 5.0)
general setup:
--math {fp16,fp32,tf32} [{fp16,fp32,tf32} ...]
precision (default: ['fp16'])
--env print info about execution env (use '--no-env' to
disable) (default: False)
--bleu compares with reference translation and computes BLEU
(use '--no-bleu' to disable) (default: True)
--cuda enables cuda (use '--no-cuda' to disable) (default:
True)
--cudnn enables cudnn (use '--no-cudnn' to disable) (default:
True)
--batch-first uses (batch, seq, feature) data format for RNNs
(default: True)
--seq-first uses (seq, batch, feature) data format for RNNs
(default: True)
--save-dir SAVE_DIR path to directory with results, it will be
automatically created if it does not exist (default:
gnmt)
--dllog-file DLLOG_FILE
Name of the DLLogger output file (default:
eval_log.json)
--print-freq PRINT_FREQ, -p PRINT_FREQ
print log every PRINT_FREQ batches (default: 1)
benchmark setup:
--target-perf TARGET_PERF
target inference performance (in tokens per second)
(default: None)
--target-bleu TARGET_BLEU
target accuracy (default: None)
--repeat REPEAT [REPEAT ...]
loops over the dataset REPEAT times, flag accepts
multiple arguments, one for each specified batch size
(default: [1])
--warmup WARMUP warmup iterations for performance counters (default:
0)
--percentiles PERCENTILES [PERCENTILES ...]
Percentiles for confidence intervals for
throughput/latency benchmarks (default: (90, 95, 99))
--tables print accuracy, throughput and latency results in
tables (use '--no-tables' to disable) (default: False)
```
### Command-line options
To see the full list of available options and their descriptions, use the `-h`
or `--help` command line option. For example, for training:
```
python3 train.py --help
usage: train.py [-h] [--dataset-dir DATASET_DIR] [--src-lang SRC_LANG]
[--tgt-lang TGT_LANG] [--vocab VOCAB] [-bpe BPE_CODES]
[--train-src TRAIN_SRC] [--train-tgt TRAIN_TGT]
[--val-src VAL_SRC] [--val-tgt VAL_TGT] [--test-src TEST_SRC]
[--test-tgt TEST_TGT] [--save-dir SAVE_DIR]
[--print-freq PRINT_FREQ] [--hidden-size HIDDEN_SIZE]
[--num-layers NUM_LAYERS] [--dropout DROPOUT]
[--share-embedding] [--smoothing SMOOTHING]
[--math {fp16,fp32,tf32,manual_fp16}] [--seed SEED]
[--prealloc-mode {off,once,always}] [--dllog-file DLLOG_FILE]
[--eval] [--env] [--cuda] [--cudnn] [--log-all-ranks]
[--train-max-size TRAIN_MAX_SIZE]
[--train-batch-size TRAIN_BATCH_SIZE]
[--train-global-batch-size TRAIN_GLOBAL_BATCH_SIZE]
[--train-iter-size N] [--epochs EPOCHS]
[--grad-clip GRAD_CLIP] [--train-max-length TRAIN_MAX_LENGTH]
[--train-min-length TRAIN_MIN_LENGTH]
[--train-loader-workers TRAIN_LOADER_WORKERS]
[--batching {random,sharding,bucketing}]
[--shard-size SHARD_SIZE] [--num-buckets NUM_BUCKETS]
[--optimizer OPTIMIZER] [--lr LR]
[--optimizer-extra OPTIMIZER_EXTRA] [--init-scale INIT_SCALE]
[--upscale-interval UPSCALE_INTERVAL]
[--warmup-steps WARMUP_STEPS] [--remain-steps REMAIN_STEPS]
[--decay-interval DECAY_INTERVAL] [--decay-steps DECAY_STEPS]
[--decay-factor DECAY_FACTOR]
[--val-batch-size VAL_BATCH_SIZE]
[--val-max-length VAL_MAX_LENGTH]
[--val-min-length VAL_MIN_LENGTH]
[--val-loader-workers VAL_LOADER_WORKERS]
[--test-batch-size TEST_BATCH_SIZE]
[--test-max-length TEST_MAX_LENGTH]
[--test-min-length TEST_MIN_LENGTH] [--beam-size BEAM_SIZE]
[--len-norm-factor LEN_NORM_FACTOR]
[--cov-penalty-factor COV_PENALTY_FACTOR]
[--len-norm-const LEN_NORM_CONST] [--intra-epoch-eval N]
[--test-loader-workers TEST_LOADER_WORKERS]
[--start-epoch START_EPOCH] [--resume PATH] [--save-all]
[--save-freq SAVE_FREQ] [--keep-checkpoints KEEP_CHECKPOINTS]
[--target-perf TARGET_PERF] [--target-bleu TARGET_BLEU]
[--local_rank LOCAL_RANK]
```
For example, for inference:
```
python3 translate.py --help
usage: translate.py [-h] [-o OUTPUT] [-r REFERENCE] [-m MODEL] [--synthetic]
[--synthetic-batches SYNTHETIC_BATCHES]
[--synthetic-vocab SYNTHETIC_VOCAB]
[--synthetic-len SYNTHETIC_LEN]
[-i INPUT | -t INPUT_TEXT [INPUT_TEXT ...]] [--sort]
[--batch-size BATCH_SIZE [BATCH_SIZE ...]]
[--beam-size BEAM_SIZE [BEAM_SIZE ...]]
[--max-seq-len MAX_SEQ_LEN]
[--len-norm-factor LEN_NORM_FACTOR]
[--cov-penalty-factor COV_PENALTY_FACTOR]
[--len-norm-const LEN_NORM_CONST]
[--math {fp16,fp32,tf32} [{fp16,fp32,tf32} ...]] [--env]
[--bleu] [--cuda] [--cudnn] [--batch-first | --seq-first]
[--save-dir SAVE_DIR] [--dllog-file DLLOG_FILE]
[--print-freq PRINT_FREQ] [--target-perf TARGET_PERF]
[--target-bleu TARGET_BLEU] [--repeat REPEAT [REPEAT ...]]
[--warmup WARMUP]
[--percentiles PERCENTILES [PERCENTILES ...]] [--tables]
[--local_rank LOCAL_RANK]
```
### Getting the data
The GNMT v2 model was trained on the [WMT16
English-German](http://www.statmt.org/wmt16/translation-task.html) dataset.
The concatenation of the newstest2015 and newstest2016 test sets is used as
the validation dataset, and newstest2014 is used as the test dataset.
This repository contains the `scripts/wmt16_en_de.sh` download script which
automatically downloads and preprocesses the training, validation and test
datasets. By default, data is downloaded to the `data` directory.
Our download script is very similar to the `wmt16_en_de.sh` script from the
[tensorflow/nmt](https://github.com/tensorflow/nmt/blob/master/nmt/scripts/wmt16_en_de.sh)
repository; ours contains an extra preprocessing step, which discards all
sentence pairs that can't be decoded with the *latin-1* encoding.
The `scripts/wmt16_en_de.sh` script uses the
[subword-nmt](https://github.com/rsennrich/subword-nmt) package to segment text
into subword units (Byte Pair Encodings -
[BPE](https://en.wikipedia.org/wiki/Byte_pair_encoding)). By default, the
script builds the shared vocabulary of 32,000 tokens.
In order to test with other datasets, the script needs to be customized
accordingly.
#### Dataset guidelines
The process of downloading and preprocessing the data can be found in the
`scripts/wmt16_en_de.sh` script.
Initially, data is downloaded from [www.statmt.org](http://www.statmt.org).
Then the `europarl-v7`, `commoncrawl` and `news-commentary` corpora are
concatenated to form the training dataset; similarly, `newstest2015` and
`newstest2016` are concatenated to form the validation dataset. Raw data is
preprocessed with [Moses](https://github.com/moses-smt/mosesdecoder), first by
launching the [Moses
tokenizer](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl)
(the tokenizer breaks up text into individual words), then by launching
[clean-corpus-n.perl](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/training/clean-corpus-n.perl),
which removes invalid sentences and does initial filtering by sequence length.

The second stage of preprocessing is done by launching the
`scripts/filter_dataset.py` script, which discards all sentence pairs that
can't be decoded with the *latin-1* encoding.
The third stage of preprocessing uses the
[subword-nmt](https://github.com/rsennrich/subword-nmt) package. First, it
builds a shared [byte pair
encoding](https://en.wikipedia.org/wiki/Byte_pair_encoding) vocabulary with
32,000 merge operations (command `subword-nmt learn-bpe`); then it applies the
generated vocabulary to the training, validation and test corpora (command
`subword-nmt apply-bpe`).
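
The two subword-nmt commands can also be driven from Python. A small sketch,
assuming the `subword-nmt` package is installed; the file names are
illustrative, not the exact paths used by `scripts/wmt16_en_de.sh` (which
learns a single vocabulary shared by the English and German sides):

```
from subword_nmt.learn_bpe import learn_bpe
from subword_nmt.apply_bpe import BPE

# learn 32,000 BPE merge operations from a tokenized training corpus
with open('train.tok.clean.en', encoding='utf-8') as infile, \
        open('bpe.32000', 'w', encoding='utf-8') as codes:
    learn_bpe(infile, codes, num_symbols=32000)

# apply the learned codes to segment text into subword units
with open('bpe.32000', encoding='utf-8') as codes:
    bpe = BPE(codes)
print(bpe.process_line('municipalities'))   # e.g. 'municip@@ alities'
```
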
### Training process
The default training configuration can be launched by running the `train.py`
training script. By default, the training script saves only one checkpoint with
the lowest value of the loss function on the validation dataset. An evaluation
is then performed after each training epoch. Results are stored in the
`gnmt` directory.
The training script launches data-parallel training with batch size 128 per GPU
on all available GPUs. We have tested training on up to 16 GPUs on a single
node.
After each training epoch, the script runs an evaluation on the validation
dataset and outputs a BLEU score on the test dataset (newstest2014). BLEU is
computed by the [SacreBLEU](https://github.com/mjpost/sacreBLEU) package. Logs
from the training and evaluation are saved to the `gnmt` directory.
The summary after each training epoch is printed in the following format:
```
0: Summary: Epoch: 3 Training Loss: 3.1336 Validation Loss: 2.9587 Test BLEU: 23.18
0: Performance: Epoch: 3 Training: 418772 Tok/s Validation: 1445331 Tok/s
```
The training loss is averaged over an entire training epoch, the validation
loss is averaged over the validation dataset and the BLEU score is computed on
the test dataset. Performance is reported in total tokens per second. The
result is averaged over an entire training epoch and summed over all GPUs
participating in the training.
By default, the `train.py` script will launch mixed precision training with
Tensor Cores. You can change this behavior by setting:
* the `--math fp32` flag to launch single precision training (for NVIDIA Volta
and NVIDIA Turing architectures) or
* the `--math tf32` flag to launch TF32 training with Tensor Cores (for NVIDIA
Ampere architecture)
for the `train.py` training script.
To view all available options for training, run `python3 train.py --help`.
### Inference process
Inference can be run by launching the `translate.py` inference script; it
requires a pre-trained model checkpoint and tokenized input.
The inference script, `translate.py`, supports batched inference. By default,
it launches beam search with beam size of 5, coverage penalty term and length
normalization term. Greedy decoding can be enabled by setting the beam size to
1.
To view all available options for inference, run `python3 translate.py --help`.
## Performance
The performance measurements in this document were conducted at the time of
publication and may not reflect the performance achieved from NVIDIA’s latest
software release. For the most up-to-date performance measurements, go to
[NVIDIA Data Center Deep Learning Product
Performance](https://developer.nvidia.com/deep-learning-performance-training-inference).
### Benchmarking
The following section shows how to run benchmarks measuring the model
performance in training and inference modes.
#### Training performance benchmark
Training is launched on batches of text data, and different batches have
different sequence lengths (the number of tokens in the longest sequence).
Sequence length and batch efficiency (the ratio of non-pad tokens to the total
number of tokens) affect training performance; therefore, it's recommended to
run the training on a large chunk of the training dataset to get a stable and
reliable average training performance. Ideally, at least one full epoch of
training should be launched to get a good estimate of training performance.
The following commands will launch one epoch of training:
To launch mixed precision training on 1, 4 or 8 GPUs, run:
```
python3 -m torch.distributed.launch --nproc_per_node=<#GPUs> train.py --seed 2 --train-global-batch-size 1024 --epochs 1 --math fp16
```
To launch mixed precision training on 16 GPUs, run:
```
python3 -m torch.distributed.launch --nproc_per_node=16 train.py --seed 2 --train-global-batch-size 2048 --epochs 1 --math fp16
```
Change `--math fp16` to `--math fp32` to launch single precision training (for
NVIDIA Volta and NVIDIA Turing architectures) or to `--math tf32` to launch
TF32 training with Tensor Cores (for NVIDIA Ampere architecture).
After the training is completed, the `train.py` script prints a summary to
standard output. Performance results are printed in the following format:
```
(...)
0: Performance: Epoch: 0 Training: 418926 Tok/s Validation: 1430828 Tok/s
(...)
```
`Training: 418926 Tok/s` represents training throughput averaged over an entire
training epoch and summed over all GPUs participating in the training.
#### Inference performance benchmark
The inference performance and accuracy benchmarks require a checkpoint from a
fully trained model.
Command to launch the inference accuracy benchmark on NVIDIA Volta or on NVIDIA
Turing architectures:
```
python3 translate.py \
--model gnmt/model_best.pth \
--input data/wmt16_de_en/newstest2014.en \
--reference data/wmt16_de_en/newstest2014.de \
--output /tmp/output \
--math fp16 fp32 \
--batch-size 128 \
--beam-size 1 2 5 \
--tables
```
Command to launch the inference accuracy benchmark on NVIDIA Ampere architecture:
```
python3 translate.py \
--model gnmt/model_best.pth \
--input data/wmt16_de_en/newstest2014.en \
--reference data/wmt16_de_en/newstest2014.de \
--output /tmp/output \
--math fp16 tf32 \
--batch-size 128 \
--beam-size 1 2 5 \
--tables
```
Command to launch the inference throughput and latency benchmarks on NVIDIA
Volta or NVIDIA Turing architectures:
```
python3 translate.py \
--model gnmt/model_best.pth \
--input data/wmt16_de_en/newstest2014.en \
--reference data/wmt16_de_en/newstest2014.de \
--output /tmp/output \
--math fp16 fp32 \
--batch-size 1 2 4 8 32 128 512 \
--repeat 1 1 1 1 2 8 16 \
--beam-size 1 2 5 \
--warmup 5 \
--tables
```
Command to launch the inference throughput and latency benchmarks on NVIDIA
Ampere architecture:
```
python3 translate.py \
--model gnmt/model_best.pth \
--input data/wmt16_de_en/newstest2014.en \
--reference data/wmt16_de_en/newstest2014.de \
--output /tmp/output \
--math fp16 tf32 \
--batch-size 1 2 4 8 32 128 512 \
--repeat 1 1 1 1 2 8 16 \
--beam-size 1 2 5 \
--warmup 5 \
--tables
```
### Results
The following sections provide details on how we achieved our performance and
accuracy in training and inference.
#### Training accuracy results
##### Training accuracy: NVIDIA DGX A100 (8x A100 40GB)
Our results were obtained by running the `train.py` script with the default
batch size = 128 per GPU in the pytorch-20.06-py3 NGC container on NVIDIA DGX
A100 with 8x A100 40GB GPUs.
Command to launch the training:
```
python3 -m torch.distributed.launch --nproc_per_node=<#GPUs> train.py --seed 2 --train-global-batch-size 1024 --math fp16
```
Change `--math fp16` to `--math tf32` to launch TF32 training with Tensor Cores.
| **GPUs** | **Batch Size / GPU** | **Accuracy - TF32 (BLEU)** | **Accuracy - Mixed precision (BLEU)** | **Time to Train - TF32 (minutes)** | **Time to Train - Mixed precision (minutes)** | **Time to Train Speedup (TF32 to Mixed precision)** |
| --- | --- | ----- | ----- | ----- | ------ | ---- |
| 8 | 128 | 24.46 | 24.60 | 34.7 | 22.7 | 1.53 |
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
outlined above.
##### Training accuracy: NVIDIA DGX-1 (8x V100 16GB)
Our results were obtained by running the `train.py` script with the default
batch size = 128 per GPU in the pytorch-20.06-py3 NGC container on NVIDIA DGX-1
with 8x V100 16GB GPUs.
Command to launch the training:
```
python3 -m torch.distributed.launch --nproc_per_node=<#GPUs> train.py --seed 2 --train-global-batch-size 1024 --math fp16
```
Change `--math fp16` to `--math fp32` to launch single precision training.
| **GPUs** | **Batch Size / GPU** | **Accuracy - FP32 (BLEU)** | **Accuracy - Mixed precision (BLEU)** | **Time to Train - FP32 (minutes)** | **Time to Train - Mixed precision (minutes)** | **Time to Train Speedup (FP32 to Mixed precision)** |
| --- | --- | ----- | ----- | ----- | ------ | ---- |
| 1 | 128 | 24.41 | 24.42 | 810.0 | 224.0 | 3.62 |
| 4 | 128 | 24.40 | 24.33 | 218.2 | 69.5 | 3.14 |
| 8 | 128 | 24.45 | 24.38 | 112.0 | 38.6 | 2.90 |
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
outlined above.
##### Training accuracy: NVIDIA DGX-2H (16x V100 32GB)
Our results were obtained by running the `train.py` script with the default
batch size = 128 per GPU in the pytorch-20.06-py3 NGC container on NVIDIA DGX-2H
with 16x V100 32GB GPUs.
To launch mixed precision training on 16 GPUs, run:
```
python3 -m torch.distributed.launch --nproc_per_node=16 train.py --seed 2 --train-global-batch-size 2048 --math fp16
```
Change `--math fp16` to `--math fp32` to launch single precision training.
| **GPUs** | **Batch Size / GPU** | **Accuracy - FP32 (BLEU)** | **Accuracy - Mixed precision (BLEU)** | **Time to Train - FP32 (minutes)** | **Time to Train - Mixed precision (minutes)** | **Time to Train Speedup (FP32 to Mixed precision)** |
| --- | --- | ----- | ----- | ------ | ----- | ---- |
| 16 | 128 | 24.41 | 24.38 | 52.1 | 19.4 | 2.69 |
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
outlined above.
![TrainingLoss](./img/training_loss.png)
##### Training stability test
The GNMT v2 model was trained for 6 epochs, starting from 32 different initial
random seeds. After each training epoch, the model was evaluated on the test
dataset and the BLEU score was recorded. The training was performed in the
pytorch-20.06-py3 Docker container on NVIDIA DGX A100 with 8x A100 40GB GPUs.
The following table summarizes the results of the stability test: it shows the
distribution of BLEU scores after each training epoch across the 32 initial
random seeds.
| **Epoch** | **Average** | **Standard deviation** | **Minimum** | **Maximum** | **Median** |
| --- | ------ | ----- | ------ | ------ | ------ |
| 1 | 19.959 | 0.238 | 19.410 | 20.390 | 19.970 |
| 2 | 21.772 | 0.293 | 20.960 | 22.280 | 21.820 |
| 3 | 22.435 | 0.264 | 21.740 | 22.870 | 22.465 |
| 4 | 23.167 | 0.166 | 22.870 | 23.620 | 23.195 |
| 5 | 24.233 | 0.149 | 23.820 | 24.530 | 24.235 |
| 6 | 24.416 | 0.131 | 24.140 | 24.660 | 24.390 |
#### Training throughput results
##### Training throughput: NVIDIA DGX A100 (8x A100 40GB)
Our results were obtained by running the `train.py` training script in the
pytorch-20.06-py3 NGC container on NVIDIA DGX A100 with 8x A100 40GB GPUs.
Throughput performance numbers (in tokens per second) were averaged over an
entire training epoch.
| **GPUs** | **Batch size / GPU** | **Throughput - TF32 (tok/s)** | **Throughput - Mixed precision (tok/s)** | **Throughput speedup (TF32 to Mixed precision)** | **Strong Scaling - TF32** | **Strong Scaling - Mixed precision** |
| --- | --- | ------ | ------ | ----- | ----- | ----- |
| 1 | 128 | 83214 | 140909 | 1.693 | 1.000 | 1.000 |
| 4 | 128 | 278576 | 463144 | 1.663 | 3.348 | 3.287 |
| 8 | 128 | 519952 | 822024 | 1.581 | 6.248 | 5.834 |
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
outlined above.
##### Training throughput: NVIDIA DGX-1 (8x V100 16GB)
Our results were obtained by running the `train.py` training script in the
pytorch-20.06-py3 NGC container on NVIDIA DGX-1 with 8x V100 16GB GPUs.
Throughput performance numbers (in tokens per second) were averaged over an
entire training epoch.
| **GPUs** | **Batch size / GPU** | **Throughput - FP32 (tok/s)** | **Throughput - Mixed precision (tok/s)** | **Throughput speedup (FP32 to Mixed precision)** | **Strong Scaling - FP32** | **Strong Scaling - Mixed precision** |
| --- | --- | ------ | ------ | ----- | ----- | ----- |
| 1 | 128 | 21860 | 76438 | 3.497 | 1.000 | 1.000 |
| 4 | 128 | 80224 | 249168 | 3.106 | 3.670 | 3.260 |
| 8 | 128 | 154168 | 447832 | 2.905 | 7.053 | 5.859 |
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
outlined above.
##### Training throughput: NVIDIA DGX-2H (16x V100 32GB)
Our results were obtained by running the `train.py` training script in the
pytorch-20.06-py3 NGC container on NVIDIA DGX-2H with 16x V100 32GB GPUs.
Throughput performance numbers (in tokens per second) were averaged over an
entire training epoch.
| **GPUs** | **Batch size / GPU** | **Throughput - FP32 (tok/s)** | **Throughput - Mixed precision (tok/s)** | **Throughput speedup (FP32 to Mixed precision)** | **Strong Scaling - FP32** | **Strong Scaling - Mixed precision** |
| --- | --- | ------ | ------ | ----- | ------ | ------ |
| 1 | 128 | 25583 | 87829 | 3.433 | 1.000 | 1.000 |
| 4 | 128 | 91400 | 290640 | 3.180 | 3.573 | 3.309 |
| 8 | 128 | 176616 | 522008 | 2.956 | 6.904 | 5.943 |
| 16 | 128 | 351792 | 1010880 | 2.874 | 13.751 | 11.510 |
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
outlined above.
#### Inference accuracy results
##### Inference accuracy: NVIDIA A100 40GB
Our results were obtained by running the `translate.py` script in the
pytorch-20.06-py3 NGC Docker container with NVIDIA A100 40GB GPU. Full
command to launch the inference accuracy benchmark was provided in the
[Inference performance benchmark](#inference-performance-benchmark) section.
| **Batch Size** | **Beam Size** | **Accuracy - TF32 (BLEU)** | **Accuracy - FP16 (BLEU)** |
| -------------: | ------------: | -------------------------: | -------------------------: |
| 128 | 1 | 23.07 | 23.07 |
| 128 | 2 | 23.81 | 23.81 |
| 128 | 5 | 24.41 | 24.43 |
##### Inference accuracy: NVIDIA Tesla V100 16GB
Our results were obtained by running the `translate.py` script in the
pytorch-20.06-py3 NGC Docker container with NVIDIA Tesla V100 16GB GPU. Full
command to launch the inference accuracy benchmark was provided in the
[Inference performance benchmark](#inference-performance-benchmark) section.
| **Batch Size** | **Beam Size** | **Accuracy - FP32 (BLEU)** | **Accuracy - FP16 (BLEU)** |
| -------------: | ------------: | -------------------------: | -------------------------: |
| 128 | 1 | 23.07 | 23.07 |
| 128 | 2 | 23.81 | 23.79 |
| 128 | 5 | 24.40 | 24.43 |
##### Inference accuracy: NVIDIA T4
Our results were obtained by running the `translate.py` script in the
pytorch-20.06-py3 NGC Docker container with NVIDIA Tesla T4. Full command to
launch the inference accuracy benchmark was provided in the [Inference
performance benchmark](#inference-performance-benchmark) section.
| **Batch Size** | **Beam Size** | **Accuracy - FP32 (BLEU)** | **Accuracy - FP16 (BLEU)** |
| -------------: | ------------: | -------------------------: | -------------------------: |
| 128 | 1 | 23.07 | 23.08 |
| 128 | 2 | 23.81 | 23.80 |
| 128 | 5 | 24.40 | 24.39 |
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
outlined above.
#### Inference throughput results
Tables presented in this section show the average inference throughput (columns
**Avg (tok/s)**) and inference throughput for various confidence intervals
(columns **N% (tok/s)**, where `N` denotes the confidence interval). Inference
throughput is measured in tokens per second. Speedups reported in FP16
subsections are relative to FP32 (for NVIDIA Volta and NVIDIA Turing) and
relative to TF32 (for NVIDIA Ampere) numbers for the corresponding configuration.
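As a point of reference, the percentile and speedup columns can be derived from
raw per-iteration measurements along these lines (a minimal sketch with made-up
sample values; the actual benchmark harness in `translate.py` may aggregate
differently):
```
import numpy as np

def summarize(tok_per_sec):
    # The N% columns report throughput sustained in N% of iterations,
    # i.e. the (100 - N)th percentile of the per-iteration samples.
    s = np.asarray(tok_per_sec, dtype=np.float64)
    return {'avg': s.mean(),
            '90%': np.percentile(s, 10),
            '95%': np.percentile(s, 5),
            '99%': np.percentile(s, 1)}

# Hypothetical per-iteration samples (tokens/s) for one configuration:
baseline = summarize([9200.0, 9450.0, 9100.0, 9300.0, 8900.0])   # FP32/TF32
fp16 = summarize([12600.0, 13100.0, 12400.0, 12800.0, 12100.0])
speedup = {k: fp16[k] / baseline[k] for k in baseline}
print(speedup)
```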
##### Inference throughput: NVIDIA A100 40GB
Our results were obtained by running the `translate.py` script in the
pytorch-20.06-py3 NGC Docker container with NVIDIA A100 40GB.
Full command to launch the inference throughput benchmark was provided in the
[Inference performance benchmark](#inference-performance-benchmark) section.
**FP16**
|**Batch Size**|**Beam Size**|**Avg (tok/s)**|**Speedup**|**90% (tok/s)**|**Speedup**|**95% (tok/s)**|**Speedup**|**99% (tok/s)**|**Speedup**|
|-------------:|------------:|--------------:|----------:|--------------:|----------:|--------------:|----------:|--------------:|----------:|
| 1| 1| 1291.6| 1.031| 1195.7| 1.029| 1165.8| 1.029| 1104.7| 1.030|
| 1| 2| 882.7| 1.019| 803.4| 1.015| 769.2| 1.015| 696.7| 1.017|
| 1| 5| 848.3| 1.042| 753.0| 1.037| 715.0| 1.043| 636.4| 1.033|
| 2| 1| 2060.5| 1.034| 1700.8| 1.032| 1621.8| 1.032| 1487.4| 1.022|
| 2| 2| 1445.7| 1.026| 1197.6| 1.024| 1132.5| 1.023| 1043.7| 1.033|
| 2| 5| 1402.3| 1.063| 1152.4| 1.056| 1100.5| 1.053| 992.9| 1.053|
| 4| 1| 3465.6| 1.046| 2838.3| 1.040| 2672.7| 1.043| 2392.8| 1.043|
| 4| 2| 2425.4| 1.041| 2002.5| 1.028| 1898.3| 1.033| 1690.2| 1.028|
| 4| 5| 2364.4| 1.075| 1930.0| 1.067| 1822.0| 1.065| 1626.1| 1.058|
| 8| 1| 6151.1| 1.099| 5078.0| 1.087| 4786.5| 1.096| 4206.9| 1.090|
| 8| 2| 4241.9| 1.075| 3494.1| 1.066| 3293.6| 1.066| 2970.9| 1.064|
| 8| 5| 4117.7| 1.118| 3430.9| 1.103| 3224.5| 1.104| 2833.5| 1.110|
| 32| 1| 18830.4| 1.147| 16210.0| 1.152| 15563.9| 1.138| 13973.2| 1.135|
| 32| 2| 12698.2| 1.133| 10812.3| 1.114| 10256.1| 1.145| 9330.2| 1.101|
| 32| 5| 11802.6| 1.355| 9998.8| 1.318| 9671.6| 1.329| 9058.4| 1.335|
| 128| 1| 53394.5| 1.350| 48867.6| 1.342| 46898.5| 1.414| 40670.6| 1.305|
| 128| 2| 34876.4| 1.483| 31687.4| 1.491| 30025.4| 1.505| 27677.1| 1.421|
| 128| 5| 28201.3| 1.986| 25660.5| 1.997| 24306.0| 1.967| 23326.2| 2.007|
| 512| 1| 119675.3| 1.904| 112400.5| 1.971| 109694.8| 1.927| 108781.3| 1.919|
| 512| 2| 74514.7| 2.126| 69578.9| 2.209| 69348.1| 2.210| 69253.7| 2.212|
| 512| 5| 47003.2| 2.760| 43348.2| 2.893| 43080.3| 2.884| 42878.4| 2.881|
##### Inference throughput: NVIDIA T4
Our results were obtained by running the `translate.py` script in the
pytorch-20.06-py3 NGC Docker container with NVIDIA T4.
Full command to launch the inference throughput benchmark was provided in the
[Inference performance benchmark](#inference-performance-benchmark) section.
**FP16**
|**Batch Size**|**Beam Size**|**Avg (tok/s)**|**Speedup**|**90% (tok/s)**|**Speedup**|**95% (tok/s)**|**Speedup**|**99% (tok/s)**|**Speedup**|
|-------------:|------------:|--------------:|----------:|--------------:|----------:|--------------:|----------:|--------------:|----------:|
| 1| 1| 1133.8| 1.266| 1059.1| 1.253| 1036.6| 1.251| 989.5| 1.242|
| 1| 2| 793.9| 1.169| 728.3| 1.165| 698.1| 1.163| 637.1| 1.157|
| 1| 5| 766.8| 1.343| 685.6| 1.335| 649.3| 1.335| 584.1| 1.318|
| 2| 1| 1759.8| 1.233| 1461.6| 1.239| 1402.3| 1.242| 1302.1| 1.242|
| 2| 2| 1313.3| 1.186| 1088.7| 1.185| 1031.6| 1.180| 953.2| 1.178|
| 2| 5| 1257.2| 1.301| 1034.1| 1.316| 990.3| 1.313| 886.3| 1.265|
| 4| 1| 2974.0| 1.261| 2440.3| 1.255| 2294.6| 1.257| 2087.7| 1.261|
| 4| 2| 2204.7| 1.320| 1826.3| 1.283| 1718.9| 1.260| 1548.4| 1.260|
| 4| 5| 2106.1| 1.340| 1727.8| 1.345| 1625.7| 1.353| 1467.7| 1.346|
| 8| 1| 5076.6| 1.423| 4207.9| 1.367| 3904.4| 1.360| 3475.3| 1.355|
| 8| 2| 3761.7| 1.311| 3108.1| 1.285| 2931.6| 1.300| 2628.7| 1.300|
| 8| 5| 3578.2| 1.660| 2998.2| 1.614| 2812.1| 1.609| 2447.6| 1.523|
| 32| 1| 14637.8| 1.636| 12702.5| 1.644| 12070.3| 1.634| 11036.9| 1.647|
| 32| 2| 10627.3| 1.818| 9198.3| 1.818| 8431.6| 1.725| 8000.0| 1.773|
| 32| 5| 8205.7| 2.598| 7117.6| 2.476| 6825.2| 2.497| 6293.2| 2.437|
| 128| 1| 33800.5| 2.755| 30824.5| 2.816| 27685.2| 2.661| 26580.9| 2.694|
| 128| 2| 20829.4| 2.795| 18665.2| 2.778| 17372.1| 2.639| 16820.5| 2.821|
| 128| 5| 11753.9| 3.309| 10658.1| 3.273| 10308.7| 3.205| 9630.7| 3.328|
| 512| 1| 44474.6| 3.327| 40108.1| 3.394| 39816.6| 3.378| 39708.0| 3.381|
| 512| 2| 26057.9| 3.295| 23197.3| 3.294| 23019.8| 3.284| 22951.4| 3.284|
| 512| 5| 12161.5| 3.428| 10777.5| 3.418| 10733.1| 3.414| 10710.5| 3.420|
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
outlined above.
#### Inference latency results
Tables presented in this section show the average inference latency (columns **Avg
(ms)**) and inference latency for various confidence intervals (columns **N%
(ms)**, where `N` denotes the confidence interval). Inference latency is
measured in milliseconds. Speedups reported in FP16 subsections are relative to
FP32 (for NVIDIA Volta and NVIDIA Turing) and relative to TF32 (for NVIDIA
Ampere) numbers for the corresponding configuration.
##### Inference latency: NVIDIA A100 40GB
Our results were obtained by running the `translate.py` script in the
pytorch-20.06-py3 NGC Docker container with NVIDIA A100 40GB.
Full command to launch the inference latency benchmark was provided in the
[Inference performance benchmark](#inference-performance-benchmark) section.
**FP16**
|**Batch Size**|**Beam Size**|**Avg (ms)**|**Speedup**|**90% (ms)**|**Speedup**|**95% (ms)**|**Speedup**|**99% (ms)**|**Speedup**|
|-------------:|------------:|-----------:|----------:|-----------:|----------:|-----------:|----------:|-----------:|----------:|
| 1| 1| 44.69| 1.032| 74.04| 1.035| 84.61| 1.034| 99.14| 1.042|
| 1| 2| 64.76| 1.020| 105.18| 1.018| 118.92| 1.019| 139.42| 1.023|
| 1| 5| 67.06| 1.043| 107.56| 1.049| 121.82| 1.054| 143.85| 1.054|
| 2| 1| 56.57| 1.034| 85.59| 1.037| 92.55| 1.038| 107.59| 1.046|
| 2| 2| 80.22| 1.027| 119.22| 1.027| 128.43| 1.030| 150.06| 1.028|
| 2| 5| 82.54| 1.063| 121.37| 1.067| 132.35| 1.069| 156.34| 1.059|
| 4| 1| 67.29| 1.047| 92.69| 1.048| 100.08| 1.056| 112.63| 1.064|
| 4| 2| 95.86| 1.041| 129.83| 1.040| 139.48| 1.044| 162.34| 1.045|
| 4| 5| 98.34| 1.075| 133.83| 1.076| 142.70| 1.068| 168.30| 1.075|
| 8| 1| 75.60| 1.099| 97.87| 1.103| 104.13| 1.099| 117.40| 1.102|
| 8| 2| 109.38| 1.074| 137.71| 1.079| 147.69| 1.069| 168.79| 1.065|
| 8| 5| 112.71| 1.116| 143.50| 1.104| 153.17| 1.118| 172.60| 1.113|
| 32| 1| 98.40| 1.146| 117.02| 1.153| 123.42| 1.150| 129.01| 1.128|
| 32| 2| 145.87| 1.133| 171.71| 1.159| 184.01| 1.127| 188.64| 1.141|
| 32| 5| 156.82| 1.357| 189.10| 1.374| 194.95| 1.392| 196.65| 1.419|
| 128| 1| 137.97| 1.350| 150.04| 1.348| 151.52| 1.349| 154.52| 1.434|
| 128| 2| 211.58| 1.484| 232.96| 1.490| 237.46| 1.505| 239.86| 1.567|
| 128| 5| 261.44| 1.990| 288.54| 2.017| 291.63| 2.052| 298.73| 2.136|
| 512| 1| 245.93| 1.906| 262.51| 1.998| 264.24| 1.999| 265.23| 2.000|
| 512| 2| 395.61| 2.129| 428.54| 2.219| 431.58| 2.224| 433.86| 2.227|
| 512| 5| 627.21| 2.767| 691.72| 2.878| 696.01| 2.895| 702.13| 2.887|
##### Inference latency: NVIDIA T4
Our results were obtained by running the `translate.py` script in the
pytorch-20.06-py3 NGC Docker container with NVIDIA T4.
Full command to launch the inference latency benchmark was provided in the
[Inference performance benchmark](#inference-performance-benchmark) section.
**FP16**
|**Batch Size**|**Beam Size**|**Avg (ms)**|**Speedup**|**90% (ms)**|**Speedup**|**95% (ms)**|**Speedup**|**99% (ms)**|**Speedup**|
|-------------:|------------:|-----------:|----------:|-----------:|----------:|-----------:|----------:|-----------:|----------:|
| 1| 1| 51.08| 1.261| 84.82| 1.254| 97.45| 1.251| 114.6| 1.257|
| 1| 2| 72.05| 1.168| 117.41| 1.165| 132.33| 1.170| 155.8| 1.174|
| 1| 5| 74.20| 1.345| 119.45| 1.352| 135.07| 1.354| 160.3| 1.354|
| 2| 1| 66.31| 1.232| 100.90| 1.232| 108.52| 1.235| 126.9| 1.238|
| 2| 2| 88.35| 1.185| 131.47| 1.188| 141.46| 1.185| 164.7| 1.191|
| 2| 5| 92.12| 1.305| 136.30| 1.310| 148.66| 1.309| 174.8| 1.320|
| 4| 1| 78.54| 1.260| 108.53| 1.256| 117.19| 1.259| 133.7| 1.259|
| 4| 2| 105.54| 1.315| 142.74| 1.317| 154.36| 1.307| 178.7| 1.303|
| 4| 5| 110.43| 1.351| 150.62| 1.388| 161.61| 1.397| 191.2| 1.427|
| 8| 1| 91.65| 1.418| 117.92| 1.421| 126.60| 1.405| 144.0| 1.411|
| 8| 2| 123.39| 1.315| 156.00| 1.337| 167.34| 1.347| 193.4| 1.340|
| 8| 5| 129.69| 1.666| 165.01| 1.705| 178.18| 1.723| 200.3| 1.765|
| 32| 1| 126.53| 1.641| 153.23| 1.689| 159.58| 1.692| 167.0| 1.700|
| 32| 2| 174.37| 1.822| 209.04| 1.899| 219.59| 1.877| 228.6| 1.878|
| 32| 5| 226.15| 2.598| 277.38| 2.636| 290.27| 2.648| 299.4| 2.664|
| 128| 1| 218.29| 2.755| 238.94| 2.826| 243.18| 2.843| 267.1| 2.828|
| 128| 2| 354.83| 2.796| 396.63| 2.832| 410.53| 2.803| 433.2| 2.866|
| 128| 5| 628.32| 3.311| 699.57| 3.353| 723.98| 3.323| 771.0| 3.337|
| 512| 1| 663.07| 3.330| 748.62| 3.388| 753.20| 3.388| 758.0| 3.378|
| 512| 2| 1134.04| 3.295| 1297.85| 3.283| 1302.25| 3.304| 1306.9| 3.308|
| 512| 5| 2428.82| 3.428| 2771.72| 3.415| 2801.32| 3.427| 2817.6| 3.422|
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide)
outlined above.
## Release notes
### Changelog
* July 2020
* Added support for NVIDIA DGX A100
* Default container updated to NGC PyTorch 20.06-py3
* June 2019
* Default container updated to NGC PyTorch 19.05-py3
* Mixed precision training implemented using APEX AMP
* Added inference throughput and latency results on NVIDIA T4 and NVIDIA
Tesla V100 16GB
* Added option to run inference on user-provided raw input text from command
line
* February 2019
* Different batching algorithm (bucketing with 5 equal-width buckets)
* Additional dropouts before first LSTM layer in encoder and in decoder
* Weight initialization changed to uniform (-0.1,0.1)
* Switched order of dropout and concatenation with attention in decoder
* Default container updated to NGC PyTorch 19.01-py3
* December 2018
* Added exponential warm-up and step learning rate decay
* Multi-GPU (distributed) inference and validation
* Default container updated to NGC PyTorch 18.11-py3
* General performance improvements
* August 2018
* Initial release
### Known issues
There are no known issues in this release.
# 1. Problem
This problem uses a recurrent neural network to perform language translation.
## Requirements
* [nvidia-docker](https://github.com/NVIDIA/nvidia-docker)
* [PyTorch 20.06-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch)
* Slurm with [Pyxis](https://github.com/NVIDIA/pyxis) and [Enroot](https://github.com/NVIDIA/enroot)
# 2. Directions
## Steps to download and verify data
Download the data using the following command:
```
cd ..
bash download_dataset.sh
cd -
```
Verify data with:
```
cd ..
bash verify_dataset.sh
cd -
```
## Steps to launch training
### NVIDIA DGX A100 (single node)
Launch configuration and system-specific hyperparameters for the NVIDIA DGX A100
single node submission are in the `config_DGXA100.sh` script.
Steps required to launch single node training on NVIDIA DGX A100:
1. Build the container and push to a docker registry:
```
docker build --pull -t <docker/registry>/mlperf-nvidia:rnn_translator .
docker push <docker/registry>/mlperf-nvidia:rnn_translator
```
2. Launch the training:
```
source config_DGXA100.sh
CONT="<docker/registry>/mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
#### Alternative launch with nvidia-docker
When generating results for the official v0.7 submission with one node, the
benchmark was launched onto a cluster managed by a SLURM scheduler. The
instructions in [NVIDIA DGX A100 (single node)](#nvidia-dgx-a100-single-node) explain
how that is done.
However, to make it easier to run this benchmark on a wider set of machine
environments, we are providing here an alternate set of launch instructions
that can be run using nvidia-docker. Note that performance or functionality may
vary from the tested SLURM instructions.
```
docker build --pull -t mlperf-nvidia:rnn_translator .
source config_DGXA100.sh
CONT="mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> ./run_with_docker.sh
```
### NVIDIA DGX-2H (single node)
Launch configuration and system-specific hyperparameters for the NVIDIA DGX-2H
single node submission are in the `config_DGX2.sh` script.
Steps required to launch single node training on NVIDIA DGX-2H:
1. Build the container and push to a docker registry:
```
docker build --pull -t <docker/registry>/mlperf-nvidia:rnn_translator .
docker push <docker/registry>/mlperf-nvidia:rnn_translator
```
2. Launch the training:
```
source config_DGX2.sh
CONT="<docker/registry>/mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
#### Alternative launch with nvidia-docker
When generating results for the official v0.7 submission with one node, the
benchmark was launched onto a cluster managed by a SLURM scheduler. The
instructions in [NVIDIA DGX-2H (single node)](#nvidia-dgx-2h-single-node) explain
how that is done.
However, to make it easier to run this benchmark on a wider set of machine
environments, we are providing here an alternate set of launch instructions
that can be run using nvidia-docker. Note that performance or functionality may
vary from the tested SLURM instructions.
```
docker build --pull -t mlperf-nvidia:rnn_translator .
source config_DGX2.sh
CONT="mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> ./run_with_docker.sh
```
### NVIDIA DGX-1 (single node)
Launch configuration and system-specific hyperparameters for the NVIDIA DGX-1
single node submission are in the `config_DGX1.sh` script.
Steps required to launch single node training on NVIDIA DGX-1:
1. Build the container and push to a docker registry:
```
docker build --pull -t <docker/registry>/mlperf-nvidia:rnn_translator .
docker push <docker/registry>/mlperf-nvidia:rnn_translator
```
2. Launch the training:
```
source config_DGX1.sh
CONT="<docker/registry>/mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
#### Alternative launch with nvidia-docker
When generating results for the official v0.7 submission with one node, the
benchmark was launched onto a cluster managed by a SLURM scheduler. The
instructions in [NVIDIA DGX-1 (single node)](#nvidia-dgx-1-single-node) explain
how that is done.
However, to make it easier to run this benchmark on a wider set of machine
environments, we are providing here an alternate set of launch instructions
that can be run using nvidia-docker. Note that performance or functionality may
vary from the tested SLURM instructions.
```
docker build --pull -t mlperf-nvidia:rnn_translator .
source config_DGX1.sh
CONT="mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> ./run_with_docker.sh
```
### NVIDIA DGX A100 (multi node)
Launch configuration and system-specific hyperparameters for the NVIDIA DGX A100
multi node submission are in the following scripts:
* for the 2-node NVIDIA DGX A100 submission: `config_DGXA100_multi_2x8x192_dist.sh`
* for the 32-node NVIDIA DGX A100 submission: `config_DGXA100_multi_32x8x32_dist.sh`
* for the 128-node NVIDIA DGX A100 submission: `config_DGXA100_multi_128x8x16_dist.sh`
Steps required to launch multi node training on NVIDIA DGX A100:
1. Build the docker container and push to a docker registry
```
docker build --pull -t <docker/registry>/mlperf-nvidia:rnn_translator .
docker push <docker/registry>/mlperf-nvidia:rnn_translator
```
2. Launch the training
2-node NVIDIA DGX A100 training:
```
source config_DGXA100_multi_2x8x192_dist.sh
CONT="<docker/registry>/mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME --ntasks-per-node $DGXNGPU run.sub
```
32-node NVIDIA DGX A100 training:
```
source config_DGXA100_multi_32x8x32_dist.sh
CONT="<docker/registry>/mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME --ntasks-per-node $DGXNGPU run.sub
```
128-node NVIDIA DGX A100 training:
```
source config_DGXA100_multi_128x8x16_dist.sh
CONT="<docker/registry>/mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME --ntasks-per-node $DGXNGPU run.sub
```
### NVIDIA DGX-2H (multi node)
Launch configuration and system-specific hyperparameters for the NVIDIA DGX-2H
multi node submission are in the following scripts:
* for the 16-node NVIDIA DGX-2H submission: `config_DGX2_multi_16x16x32.sh`
* for the 64-node NVIDIA DGX-2H submission: `config_DGX2_multi_64x16x16.sh`
Steps required to launch multi node training on NVIDIA DGX-2H:
1. Build the docker container and push to a docker registry
```
docker build --pull -t <docker/registry>/mlperf-nvidia:rnn_translator .
docker push <docker/registry>/mlperf-nvidia:rnn_translator
```
2. Launch the training
16-node NVIDIA DGX-2H training:
```
source config_DGX2_multi_16x16x32.sh
CONT="<docker/registry>/mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME --ntasks-per-node $DGXNGPU run.sub
```
64-node NVIDIA DGX-2H training:
```
source config_DGX2_multi_64x16x16.sh
CONT="<docker/registry>/mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME --ntasks-per-node $DGXNGPU run.sub
```
# 3. Dataset/Environment
### Publication/Attribution
We use [WMT16 English-German](http://www.statmt.org/wmt16/translation-task.html)
for training.
### Data preprocessing
The script uses the [subword-nmt](https://github.com/rsennrich/subword-nmt)
package to segment text into subword units (BPE); by default it builds a shared
vocabulary of 32,000 tokens.
Preprocessing removes all pairs of sentences that can't be decoded with the
latin-1 encoding.
### Vocabulary
Vocabulary is generated by the following lines from the `download_dataset.sh`
script:
```
# Create vocabulary file for BPE
cat "${OUTPUT_DIR}/train.tok.clean.bpe.${merge_ops}.en" "${OUTPUT_DIR}/train.tok.clean.bpe.${merge_ops}.de" | \
${OUTPUT_DIR}/subword-nmt/get_vocab.py | cut -f1 -d ' ' > "${OUTPUT_DIR}/vocab.bpe.${merge_ops}"
```
The vocabulary is stored in the `rnn_translator/data/vocab.bpe.32000` plain
text file, one token per line (tokens are separated with a newline character).
The vocabulary file doesn't contain special tokens such as BOS
(begin-of-sentence) or EOS (end-of-sentence).
Here are the first 10 lines from the `rnn_translator/data/vocab.bpe.32000` file:
```
,
.
the
in
of
and
die
der
to
und
```
### Text datasets
The `download_dataset.sh` script automatically creates training, validation and
test datasets. Datasets are stored as plain text files. Sentences are separated
with a newline character, and tokens within each sentence are separated with a
single space character.
Training data:
* source language (English): `rnn_translator/data/train.tok.clean.bpe.32000.en`
* target language (German): `rnn_translator/data/train.tok.clean.bpe.32000.de`
Validation data:
* source language (English): `rnn_translator/data/newstest_dev.tok.clean.bpe.32000.en`
* target language (German): `rnn_translator/data/newstest_dev.tok.clean.bpe.32000.de`
Test data:
* source language (English): `rnn_translator/data/newstest2014.tok.bpe.32000.en`
* target language (German): `rnn_translator/data/newstest2014.de`
* note that the `newstest2014.de` file isn't tokenized; BLEU evaluation is
  performed by the sacrebleu package, which expects raw plain text
  (tokenization is performed internally by sacrebleu)
Here are the first 5 lines from the `rnn_translator/data/train.tok.clean.bpe.32000.en` file:
```
Res@@ um@@ ption of the session
I declare resumed the session of the European Parliament ad@@ jour@@ ned on Friday 17 December 1999 , and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant fes@@ tive period .
Although , as you will have seen , the d@@ read@@ ed &apos; millenn@@ ium bug &apos; failed to materi@@ alise , still the people in a number of countries suffered a series of natural disasters that truly were d@@ read@@ ful .
You have requested a debate on this subject in the course of the next few days , during this part-session .
In the meantime , I should like to observe a minute &apos; s silence , as a number of Members have requested , on behalf of all the victims concerned , particularly those of the terrible stor@@ ms , in the various countries of the European Union .
```
And here are the first 5 lines from the `rnn_translator/data/train.tok.clean.bpe.32000.de` file:
```
Wiederaufnahme der Sitzungsperiode
Ich erkläre die am Freitag , dem 17. Dezember unterbro@@ ch@@ ene Sitzungsperiode des Europäischen Parlaments für wieder@@ aufgenommen , wünsche Ihnen nochmals alles Gute zum Jahres@@ wechsel und hoffe , daß Sie schöne Ferien hatten .
Wie Sie feststellen konnten , ist der ge@@ für@@ ch@@ tete &quot; Mill@@ en@@ i@@ um-@@ Bu@@ g &quot; nicht eingetreten . Doch sind Bürger einiger unserer Mitgliedstaaten Opfer von schrecklichen Naturkatastrophen geworden .
Im Parlament besteht der Wunsch nach einer Aussprache im Verlauf dieser Sitzungsperiode in den nächsten Tagen .
Heute möchte ich Sie bitten - das ist auch der Wunsch einiger Kolleginnen und Kollegen - , allen Opfern der St@@ ür@@ me , insbesondere in den verschiedenen Ländern der Europäischen Union , in einer Schwei@@ ge@@ minute zu ge@@ denken .
```
### Training and test data separation
Training uses the WMT16 English-German dataset; validation is on the
concatenation of newstest2015 and newstest2016; BLEU evaluation is done on
newstest2014.
### Data filtering
Training is executed only on pairs of sentences which satisfy the following equation:
```
min_len <= src sentence sequence length <= max_len AND
min_len <= tgt sentence sequence length <= max_len
```
`min_len` is set to 0, `max_len` is set to 75. Source and target sequence
lengths include special BOS (begin-of-sentence) and EOS (end-of-sentence)
tokens.
Filtering is implemented in `pytorch/seq2seq/data/dataset.py`, class
`LazyParallelDataset`.
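A minimal sketch of the filtering predicate described above (the real check
lives in `LazyParallelDataset`; sequence lengths here already include the
special BOS and EOS tokens):
```
def keep_pair(src_len, tgt_len, min_len=0, max_len=75):
    # Keep a sentence pair only if both sides fall within [min_len, max_len];
    # lengths count the special BOS and EOS tokens.
    return min_len <= src_len <= max_len and min_len <= tgt_len <= max_len

print(keep_pair(10, 12))   # True
print(keep_pair(10, 80))   # False: target side is too long
```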
### Training data order
The training script buckets data by sequence length. The bucketing algorithm
uses 5 equal-width buckets (`num_buckets = 5`). Pairs of training sentences are
assigned to buckets by the value of
`max(src_sentence_len // bucket_width, tgt_sentence_len // bucket_width)`, where
`bucket_width = (max_len + num_buckets - 1) // num_buckets`.
Before each training epoch, batches are randomly sampled from the buckets (the
last incomplete batch is dropped for each bucket), then all batches are
reshuffled.
Bucketing is implemented in `pytorch/seq2seq/data/sampler.py`, class
`BucketingSampler`.
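The bucket assignment amounts to the following (a sketch of the formula above,
not the exact `BucketingSampler` code):
```
max_len = 75
num_buckets = 5
bucket_width = (max_len + num_buckets - 1) // num_buckets   # 15

def bucket_id(src_sentence_len, tgt_sentence_len):
    # A pair lands in the bucket of its longer side.
    return max(src_sentence_len // bucket_width,
               tgt_sentence_len // bucket_width)

print(bucket_id(12, 40))   # 2
```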
# 4. Model
### Publication/Attribution
The implemented model is similar to the one from [Google's Neural Machine
Translation System: Bridging the Gap between Human and Machine
Translation](https://arxiv.org/abs/1609.08144) paper.
The most important difference is in the attention mechanism. This repository
implements `gnmt_v2` attention: the output from the first LSTM layer of the
decoder goes into the attention module, then the re-weighted context is
concatenated with the inputs to all subsequent LSTM layers in the decoder at
the current timestep.
The same attention mechanism is also implemented in default
GNMT-like models from [tensorflow/nmt](https://github.com/tensorflow/nmt) and
[NVIDIA/OpenSeq2Seq](https://github.com/NVIDIA/OpenSeq2Seq).
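Schematically, a single decoder timestep with `gnmt_v2` attention looks like
the sketch below. It uses a plain dot-product attention as a stand-in for the
normalized Bahdanau attention and omits residual connections and dropout, so it
illustrates only the data flow, not the repository's decoder:
```
import torch

hidden, batch, src_len = 1024, 2, 7        # illustrative sizes
lstm1 = torch.nn.LSTMCell(hidden, hidden)
lstm2 = torch.nn.LSTMCell(2 * hidden, hidden)   # input is concat(h1, context)

x = torch.randn(batch, hidden)             # embedded decoder input, one step
encoder_out = torch.randn(batch, src_len, hidden)

h1, c1 = lstm1(x)                          # first decoder LSTM layer
# stand-in attention: score encoder outputs against h1 and re-weight them
scores = torch.bmm(encoder_out, h1.unsqueeze(2)).softmax(dim=1)
context = (scores * encoder_out).sum(dim=1)
# the context is concatenated with the input of every subsequent LSTM layer
h2, c2 = lstm2(torch.cat([h1, context], dim=1))
```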
### Structure
* general:
  * encoder and decoder use shared embeddings
  * data-parallel multi-gpu training
  * trained with label smoothing loss (smoothing factor 0.1)
* encoder:
  * 4-layer LSTM, hidden size 1024, first layer is bidirectional, the
    remaining layers are unidirectional
  * with residual connections starting from the 3rd LSTM layer
  * uses the standard PyTorch nn.LSTM layer
  * dropout is applied on input to all LSTM layers, probability of dropout is
    set to 0.2
  * hidden state of LSTM layers is initialized with zeros
  * weights and biases of LSTM layers are initialized with the
    uniform(-0.1, 0.1) distribution
* decoder:
  * 4-layer unidirectional LSTM with hidden size 1024 and a fully-connected
    classifier
  * with residual connections starting from the 3rd LSTM layer
  * uses the standard PyTorch nn.LSTM layer
  * dropout is applied on input to all LSTM layers, probability of dropout is
    set to 0.2
  * hidden state of LSTM layers is initialized with zeros
  * weights and biases of LSTM layers are initialized with the
    uniform(-0.1, 0.1) distribution
  * weights and biases of the fully-connected classifier are initialized with
    the uniform(-0.1, 0.1) distribution
* attention:
  * normalized Bahdanau attention
  * model uses the `gnmt_v2` attention mechanism
  * output from the first LSTM layer of the decoder goes into attention, then
    the re-weighted context is concatenated with the input to all subsequent
    LSTM layers in the decoder at the current timestep
  * linear transform of keys and queries is initialized with uniform(-0.1, 0.1),
    the normalization scalar is initialized with 1.0 / sqrt(1024), and the
    normalization bias is initialized with zero
* inference:
  * beam search with beam size of 5
  * with coverage penalty and length normalization (see the sketch after this
    list); the coverage penalty factor is set to 0.1, the length normalization
    factor to 0.6, and the length normalization constant to 5.0
  * BLEU computed by [sacrebleu](https://pypi.org/project/sacrebleu/)
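As referenced in the list above, here is a sketch of the candidate scoring rule
with these settings, following the formulas from the GNMT paper; the
repository's `SequenceGenerator` may differ in details:
```
import math

def length_penalty(length, norm_const=5.0, norm_factor=0.6):
    # GNMT-style length normalization: ((C + |Y|) / (C + 1)) ** alpha
    return ((norm_const + length) / (norm_const + 1.0)) ** norm_factor

def coverage_penalty(attn_sums, cov_factor=0.1):
    # attn_sums[j] is the total attention mass received by source token j
    return cov_factor * sum(math.log(min(p, 1.0)) for p in attn_sums)

def candidate_score(log_prob, length, attn_sums):
    return log_prob / length_penalty(length) + coverage_penalty(attn_sums)

print(candidate_score(-4.2, 12, [0.9, 1.1, 0.7, 1.3]))
```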
Implementation:
* base Seq2Seq model: `pytorch/seq2seq/models/seq2seq_base.py`, class `Seq2Seq`
* GNMT model: `pytorch/seq2seq/models/gnmt.py`, class `GNMT`
* encoder: `pytorch/seq2seq/models/encoder.py`, class `ResidualRecurrentEncoder`
* decoder: `pytorch/seq2seq/models/decoder.py`, class `ResidualRecurrentDecoder`
* attention: `pytorch/seq2seq/models/attention.py`, class `BahdanauAttention`
* inference (including BLEU evaluation and detokenization): `pytorch/seq2seq/inference/inference.py`, class `Translator`
* beam search: `pytorch/seq2seq/inference/beam_search.py`, class `SequenceGenerator`
### Loss function
Cross-entropy loss with label smoothing (smoothing factor = 0.1); padding is
not considered part of the loss.
Loss function is implemented in `pytorch/seq2seq/train/smoothing.py`, class
`LabelSmoothing`.
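A compact sketch of label-smoothed cross entropy with padding excluded
(illustrative only, not the exact `LabelSmoothing` class):
```
import torch
import torch.nn.functional as F

def smoothed_loss(logits, target, padding_idx=0, smoothing=0.1):
    # Mix the one-hot NLL with a uniform distribution over the vocabulary.
    log_probs = F.log_softmax(logits, dim=-1)
    nll = -log_probs.gather(-1, target.unsqueeze(-1)).squeeze(-1)
    uniform = -log_probs.mean(dim=-1)
    loss = (1.0 - smoothing) * nll + smoothing * uniform
    non_pad = target.ne(padding_idx)       # padding is not part of the loss
    return loss[non_pad].sum()

logits = torch.randn(6, 32000)             # (tokens, vocab)
target = torch.randint(0, 32000, (6,))
print(smoothed_loss(logits, target))
```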
### Optimizer
Adam optimizer with learning rate 1e-3, beta1 = 0.9, beta2 = 0.999, epsilon =
1e-8, and no weight decay.
The network is trained with gradient clipping; the max L2 norm of gradients is
set to 5.0.
Optimizer is implemented in `pytorch/seq2seq/train/fp_optimizers.py`, class
`Fp32Optimizer`.
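In PyTorch terms, that setup corresponds roughly to the following (a sketch
with a stand-in model; the repository wraps this in `Fp32Optimizer` for
precision handling):
```
import torch

model = torch.nn.Linear(8, 8)              # stand-in model for illustration
opt = torch.optim.Adam(model.parameters(), lr=1e-3,
                       betas=(0.9, 0.999), eps=1e-8, weight_decay=0)

loss = model(torch.randn(4, 8)).pow(2).mean()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)  # L2 clip at 5.0
opt.step()
opt.zero_grad()
```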
### Learning rate schedule
The model is trained with exponential learning rate warmup for 200 steps,
followed by step learning rate decay. Decay starts after 2/3 of the training
steps and is applied a total of 4 times at regularly spaced intervals; the
decay factor is 0.5.
Learning rate scheduler is implemented in
`pytorch/seq2seq/train/lr_scheduler.py`, class `WarmupMultiStepLR`.
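A sketch of the schedule as described, using the `REMAIN_STEPS` and
`DECAY_INTERVAL` values from the config scripts in this repository; the actual
`WarmupMultiStepLR` class may differ in details:
```
import math

def learning_rate(step, base_lr=1e-3, warmup_steps=200,
                  remain_steps=6453, decay_interval=809,
                  decay_factor=0.5, max_decay_steps=4):
    if step < warmup_steps:
        # exponential warmup from ~0.01 * base_lr up to base_lr
        warmup_factor = math.exp(math.log(0.01) / warmup_steps)
        return base_lr * warmup_factor ** (warmup_steps - step)
    if step >= remain_steps:
        # step decay: halve the rate at regularly spaced intervals, 4 times max
        num_decays = min((step - remain_steps) // decay_interval + 1,
                         max_decay_steps)
        return base_lr * decay_factor ** num_decays
    return base_lr
```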
# 5. Quality
### Quality metric
Uncased BLEU score on newstest2014 en-de dataset.
BLEU scores are reported by the [sacrebleu](https://pypi.org/project/sacrebleu/)
package (version 1.2.10). Sacrebleu is executed with the following flags:
`--score-only -lc --tokenize intl`.
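The same scoring can be reproduced from Python (assuming sacrebleu 1.2.10's
`corpus_bleu` API; the hypothesis and reference below are illustrative):
```
import sacrebleu

hypotheses = ['The cat sat on the mat .']
references = ['The cat sat on the mat .']
# Mirrors `sacrebleu --score-only -lc --tokenize intl`
bleu = sacrebleu.corpus_bleu(hypotheses, [references],
                             lowercase=True, tokenize='intl')
print(round(bleu.score, 2))
```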
### Quality target
Uncased BLEU score of 24.00.
### Evaluation frequency
Evaluation of BLEU score is done after every epoch.
### Evaluation thoroughness
Evaluation uses all of `newstest2014.en` (3003 sentences).
#! /bin/bash
set -euo pipefail
print_usage() {
cat << EOF
${0} [options] [--] COMMAND [ARG...]
Control binding policy for each task. Assumes one rank will be launched for each GPU.
Options:
--cpu=MODE
* exclusive -- bind each rank to an exclusive set of cores near its GPU
* exclusive,nosmt -- bind each rank to an exclusive set of cores near its GPU, without hyperthreading
* node -- bind each rank to all cores in the NUMA node nearest its GPU [default]
* *.sh -- bind each rank using the bash associative array bind_cpu_cores or bind_cpu_nodes from a file
* off -- don't bind
--mem=MODE
* node -- bind each rank to the nearest NUMA node [default]
* *.sh -- bind each rank using the bash associative array bind_mem from a file
* off -- don't bind
--ib=MODE
* single -- bind each rank to a single IB device near its GPU
* off -- don't bind [default]
--cluster=CLUSTER
Select which cluster is being used. May be required if system params cannot be detected.
EOF
}
################################################################################
# Argument parsing
################################################################################
cpu_mode='node'
mem_mode='node'
ib_mode='off'
cluster=''
while [ $# -gt 0 ]; do
case "$1" in
-h|--help) print_usage ; exit 0 ;;
--cpu=*) cpu_mode="${1/*=/}"; shift ;;
--cpu) cpu_mode="$2"; shift 2 ;;
--mem=*) mem_mode="${1/*=/}"; shift ;;
--mem) mem_mode="$2"; shift 2 ;;
--ib=*) ib_mode="${1/*=/}"; shift ;;
--ib) ib_mode="$2"; shift 2 ;;
--cluster=*) cluster="${1/*=/}"; shift ;;
--cluster) cluster="$2"; shift 2 ;;
--) shift; break ;;
*) break ;;
esac
done
if [ $# -lt 1 ]; then
    echo 'ERROR: no command given' >&2
print_usage
exit 1
fi
################################################################################
# Get system params
################################################################################
# LOCAL_RANK is set with an enroot hook for Pytorch containers
# SLURM_LOCALID is set by Slurm
# OMPI_COMM_WORLD_LOCAL_RANK is set by mpirun
readonly local_rank="${LOCAL_RANK:=${SLURM_LOCALID:=${OMPI_COMM_WORLD_LOCAL_RANK:-}}}"
if [ -z "${local_rank}" ]; then
echo 'ERROR: cannot read LOCAL_RANK from env' >&2
exit 1
fi
num_gpus=$(nvidia-smi -i 0 --query-gpu=count --format=csv,noheader,nounits)
if [ "${local_rank}" -ge "${num_gpus}" ]; then
echo "ERROR: local rank is ${local_rank}, but there are only ${num_gpus} gpus available" >&2
exit 1
fi
get_lscpu_value() {
awk -F: "(\$1 == \"${1}\"){gsub(/ /, \"\", \$2); print \$2; found=1} END{exit found!=1}"
}
lscpu_out=$(lscpu)
num_sockets=$(get_lscpu_value 'Socket(s)' <<< "${lscpu_out}")
num_nodes=$(get_lscpu_value 'NUMA node(s)' <<< "${lscpu_out}")
cores_per_socket=$(get_lscpu_value 'Core(s) per socket' <<< "${lscpu_out}")
echo "num_sockets = ${num_sockets} num_nodes=${num_nodes} cores_per_socket=${cores_per_socket}"
readonly cores_per_node=$(( (num_sockets * cores_per_socket) / num_nodes ))
if [ ${num_gpus} -gt 1 ]; then
readonly gpus_per_node=$(( num_gpus / num_nodes ))
else
readonly gpus_per_node=1
fi
readonly cores_per_gpu=$(( cores_per_node / gpus_per_node ))
readonly local_node=$(( local_rank / gpus_per_node ))
declare -a ibdevs=()
case "${cluster}" in
circe)
# Need to specialize for circe because IB detection is hard
ibdevs=(mlx5_1 mlx5_2 mlx5_3 mlx5_4 mlx5_7 mlx5_8 mlx5_9 mlx5_10)
;;
selene)
# Need to specialize for selene because IB detection is hard
ibdevs=(mlx5_0 mlx5_1 mlx5_2 mlx5_3 mlx5_6 mlx5_7 mlx5_8 mlx5_9)
;;
'')
if ibstat_out="$(ibstat -l 2>/dev/null | sort -V)" ; then
mapfile -t ibdevs <<< "${ibstat_out}"
fi
;;
*)
echo "ERROR: Unknown cluster '${cluster}'" >&2
exit 1
;;
esac
readonly num_ibdevs="${#ibdevs[@]}"
################################################################################
# Setup for exec
################################################################################
declare -a numactl_args=()
case "${cpu_mode}" in
exclusive)
numactl_args+=( "$(printf -- "--physcpubind=%u-%u,%u-%u" \
$(( local_rank * cores_per_gpu )) \
$(( (local_rank + 1) * cores_per_gpu - 1 )) \
$(( local_rank * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) )) \
$(( (local_rank + 1) * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) - 1 )) \
)" )
;;
exclusive,nosmt)
numactl_args+=( "$(printf -- "--physcpubind=%u-%u" \
$(( local_rank * cores_per_gpu )) \
$(( (local_rank + 1) * cores_per_gpu - 1 )) \
)" )
;;
node)
numactl_args+=( "--cpunodebind=${local_node}" )
;;
*.sh)
source "${cpu_mode}"
if [ -n "${bind_cpu_cores:-}" ]; then
numactl_args+=( "--physcpubind=${bind_cpu_cores[${local_rank}]}" )
elif [ -n "${bind_cpu_nodes:-}" ]; then
numactl_args+=( "--cpunodebind=${bind_cpu_nodes[${local_rank}]}" )
else
echo "ERROR: invalid CPU affinity file ${cpu_mode}." >&2
exit 1
fi
;;
off|'')
;;
*)
echo "ERROR: invalid cpu mode '${cpu_mode}'" 2>&1
print_usage
exit 1
;;
esac
case "${mem_mode}" in
node)
numactl_args+=( "--membind=${local_node}" )
;;
*.sh)
source "${mem_mode}"
if [ -z "${bind_mem:-}" ]; then
echo "ERROR: invalid memory affinity file ${mem_mode}." >&2
exit 1
fi
numactl_args+=( "--membind=${bind_mem[${local_rank}]}" )
;;
off|'')
;;
*)
echo "ERROR: invalid mem mode '${mem_mode}'" 2>&1
print_usage
exit 1
;;
esac
case "${ib_mode}" in
single)
if [ "${num_ibdevs}" -eq 0 ]; then
echo "WARNING: used '$0 --ib=single', but there are 0 IB devices available; skipping IB binding." 2>&1
else
readonly ibdev="${ibdevs[$(( local_rank * num_ibdevs / num_gpus ))]}"
export OMPI_MCA_btl_openib_if_include="${OMPI_MCA_btl_openib_if_include-$ibdev}"
export UCX_NET_DEVICES="${UCX_NET_DEVICES-$ibdev:1}"
fi
;;
off|'')
;;
*)
echo "ERROR: invalid ib mode '${ib_mode}'" 2>&1
print_usage
exit 1
;;
esac
################################################################################
# Exec
################################################################################
if [ "${#numactl_args[@]}" -gt 0 ] ; then
set -x
exec numactl "${numactl_args[@]}" -- "${@}"
else
exec "${@}"
fi
import sys
import subprocess
import os
import socket
from argparse import ArgumentParser, REMAINDER
import torch
def parse_args():
"""
Helper function parsing the command line options
@retval ArgumentParser
"""
    parser = ArgumentParser(description="PyTorch distributed training launch "
                                        "helper utility that will spawn up "
                                        "multiple distributed processes")
# Optional arguments for the launch helper
parser.add_argument("--nnodes", type=int, default=1,
help="The number of nodes to use for distributed "
"training")
parser.add_argument("--node_rank", type=int, default=0,
help="The rank of the node for multi-node distributed "
"training")
parser.add_argument("--nproc_per_node", type=int, default=1,
help="The number of processes to launch on each node, "
"for GPU training, this is recommended to be set "
"to the number of GPUs in your system so that "
"each process can be bound to a single GPU.")
parser.add_argument("--master_addr", default="127.0.0.1", type=str,
help="Master node (rank 0)'s address, should be either "
"the IP address or the hostname of node 0, for "
"single node multi-proc training, the "
"--master_addr can simply be 127.0.0.1")
parser.add_argument("--master_port", default=29500, type=int,
help="Master node (rank 0)'s free port that needs to "
"be used for communciation during distributed "
"training")
parser.add_argument('--no_hyperthreads', action='store_true',
help='Flag to disable binding to hyperthreads')
parser.add_argument('--no_membind', action='store_true',
help='Flag to disable memory binding')
# non-optional arguments for binding
parser.add_argument("--nsockets_per_node", type=int, required=True,
help="Number of CPU sockets on a node")
parser.add_argument("--ncores_per_socket", type=int, required=True,
help="Number of CPU cores per socket")
# positional
parser.add_argument("training_script", type=str,
help="The full path to the single GPU training "
"program/script to be launched in parallel, "
"followed by all the arguments for the "
"training script")
# rest from the training program
parser.add_argument('training_script_args', nargs=REMAINDER)
return parser.parse_args()
def main():
args = parse_args()
    # variables for numactl binding
NSOCKETS = args.nsockets_per_node
NGPUS_PER_SOCKET = (args.nproc_per_node // args.nsockets_per_node) + (1 if (args.nproc_per_node % args.nsockets_per_node) else 0)
NCORES_PER_GPU = args.ncores_per_socket // NGPUS_PER_SOCKET
# world size in terms of number of processes
dist_world_size = args.nproc_per_node * args.nnodes
# set PyTorch distributed related environmental variables
current_env = os.environ.copy()
current_env["MASTER_ADDR"] = args.master_addr
current_env["MASTER_PORT"] = str(args.master_port)
current_env["WORLD_SIZE"] = str(dist_world_size)
processes = []
for local_rank in range(0, args.nproc_per_node):
# each process's rank
dist_rank = args.nproc_per_node * args.node_rank + local_rank
current_env["RANK"] = str(dist_rank)
        # form numactl binding command
cpu_ranges = [local_rank * NCORES_PER_GPU,
(local_rank + 1) * NCORES_PER_GPU - 1,
local_rank * NCORES_PER_GPU + (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS),
(local_rank + 1) * NCORES_PER_GPU + (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS) - 1]
numactlargs = []
if args.no_hyperthreads:
numactlargs += [ "--physcpubind={}-{}".format(*cpu_ranges[0:2]) ]
else:
numactlargs += [ "--physcpubind={}-{},{}-{}".format(*cpu_ranges) ]
if not args.no_membind:
memnode = local_rank // NGPUS_PER_SOCKET
numactlargs += [ "--membind={}".format(memnode) ]
# spawn the processes
cmd = [ "/usr/bin/numactl" ] \
+ numactlargs \
+ [ sys.executable,
"-u",
args.training_script,
"--local_rank={}".format(local_rank)
] \
+ args.training_script_args
print("cmd: ",cmd)
process = subprocess.Popen(cmd, env=current_env)
processes.append(process)
for process in processes:
process.wait()
if __name__ == "__main__":
main()
## System run parms
export DGXNNODES=1
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
export WALLTIME=${WALLTIME:-"00:30:00"}
## DL params
export LR=${LR:-"2.0e-3"}
export TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-64}
export TEST_BATCH_SIZE=${TEST_BATCH_SIZE:-64}
export WARMUP_STEPS=${WARMUP_STEPS:-200}
export REMAIN_STEPS=${REMAIN_STEPS:-6453}
export DECAY_INTERVAL=${DECAY_INTERVAL:-809}
export TARGET=${TARGET:-24.0}
export MAX_SEQ_LEN=${MAX_SEQ_LEN:-75}
export NUMEPOCHS=${NUMEPOCHS:-15}
export MATH=${MATH:-fp32}
#export DIST_OPTS=${DIST_OPTS-"\
# --distributed-weight-update 2 \
# --dwu-num-blocks 1 \
# --dwu-num-chunks 2 \
# --dwu-num-rs-pg 2 \
# --dwu-num-ar-pg 2 \
# --dwu-num-ag-pg 0 \
# --dwu-grad-norm \
# "}
export DIST_OPTS=${DIST_OPTS-"\
--dwu-num-blocks 1 \
--dwu-num-chunks 2 \
--dwu-num-rs-pg 2 \
--dwu-num-ar-pg 2 \
--dwu-num-ag-pg 0 \
--dwu-grad-norm \
"}
export EXTRA_OPTS=${EXTRA_OPTS-"\
--fused-attention \
--fused-xentropy \
--no-log-all-ranks \
"}
## System config params
export DGXNGPU=4
export DGXSOCKETCORES=8
export DGXHT=1 # HT on = 2, HT off = 1
export DGXNSOCKET=4
#export DGXNGPU=1
#export DGXSOCKETCORES=8
#export DGXHT=1 # HT is on is 2, HT off is 1
#export DGXNSOCKET=1
## System run parms
export DGXNNODES=1
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
export WALLTIME=${WALLTIME:-"00:30:00"}
export LR=${LR:-"2.0e-3"}
export TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-256}
export TEST_BATCH_SIZE=${TEST_BATCH_SIZE:-128}
export WARMUP_STEPS=${WARMUP_STEPS:-200}
export REMAIN_STEPS=${REMAIN_STEPS:-6453}
export DECAY_INTERVAL=${DECAY_INTERVAL:-809}
export TARGET=${TARGET:-24.0}
export MAX_SEQ_LEN=${MAX_SEQ_LEN:-75}
export NUMEPOCHS=${NUMEPOCHS:-8}
export MATH=${MATH:-fp16}
export DIST_OPTS=${DIST_OPTS-"\
--distributed-weight-update 2 \
--dwu-num-blocks 1 \
--dwu-num-chunks 2 \
--dwu-num-rs-pg 2 \
--dwu-num-ar-pg 2 \
--dwu-num-ag-pg 0 \
--dwu-grad-norm \
"}
export EXTRA_OPTS=${EXTRA_OPTS-"\
--fused-attention \
--fused-xentropy \
--no-log-all-ranks \
"}
## System config params
export DGXNGPU=4
export DGXSOCKETCORES=8
export DGXHT=1 # HT on = 2, HT off = 1
export DGXNSOCKET=4
## System run parms
export DGXNNODES=2
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
export WALLTIME=${WALLTIME:-"00:30:00"}
## DL params
#export LR=${LR:-"2.0e-3"}
#export TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-64}
#export TEST_BATCH_SIZE=${TEST_BATCH_SIZE:-64}
#export WARMUP_STEPS=${WARMUP_STEPS:-200}
#export REMAIN_STEPS=${REMAIN_STEPS:-6453}
#export DECAY_INTERVAL=${DECAY_INTERVAL:-809}
#export TARGET=${TARGET:-24.0}
#export MAX_SEQ_LEN=${MAX_SEQ_LEN:-75}
#export NUMEPOCHS=${NUMEPOCHS:-15}
#export MATH=${MATH:-fp32}
#export DIST_OPTS=${DIST_OPTS-"\
# --distributed-weight-update 2 \
# --dwu-num-blocks 1 \
# --dwu-num-chunks 2 \
# --dwu-num-rs-pg 2 \
# --dwu-num-ar-pg 2 \
# --dwu-num-ag-pg 0 \
# --dwu-grad-norm \
# "}
#export EXTRA_OPTS=${EXTRA_OPTS-"\
# --fused-attention \
# --fused-xentropy \
# --no-log-all-ranks \
# "}
## System config params
export DGXNGPU=4
export DGXSOCKETCORES=8
export DGXHT=2 # HT on = 2, HT off = 1
export DGXNSOCKET=4
#export DGXNGPU=1
#export DGXSOCKETCORES=8
#export DGXHT=1 # HT is on is 2, HT off is 1
#export DGXNSOCKET=1
import collections
import os
import subprocess
import torch
from mlperf_logging.mllog import constants
from seq2seq.utils import configure_logger, log_event
def mlperf_submission_log(benchmark):
num_nodes = os.environ.get('SLURM_NNODES', 1)
if int(num_nodes) > 1:
torch.distributed.init_process_group(backend='nccl', init_method='env://')
configure_logger(benchmark)
log_event(
key=constants.SUBMISSION_BENCHMARK,
value=benchmark,
)
log_event(
key=constants.SUBMISSION_ORG,
value='NVIDIA')
log_event(
key=constants.SUBMISSION_DIVISION,
value='closed')
log_event(
key=constants.SUBMISSION_STATUS,
value='onprem')
log_event(
key=constants.SUBMISSION_PLATFORM,
value=f'{num_nodes}xSUBMISSION_PLATFORM_PLACEHOLDER')
import logging
import time
import os
import argparse
import torch
from torch.utils.data import DataLoader
from seq2seq.data.tokenizer import Tokenizer
import seq2seq.data.config as config
import seq2seq.utils as utils
from seq2seq.data.dataset import LazyParallelDataset
from seq2seq.data.dataset import PreprocessedDataset
def parse_args():
parser = argparse.ArgumentParser(
description='GNMT prepare data',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--dataset-dir', default='data/wmt16_de_en',
help='path to the directory with training/test data')
parser.add_argument('--preproc-data-dir', default='/tmp/preprocessed',
help='path to the directory with preprocessed \
training/test data')
parser.add_argument('--max-size', default=None, type=int,
help='use at most MAX_SIZE elements from training \
dataset (useful for benchmarking), by default \
uses entire dataset')
parser.add_argument('--math', default='fp32',
choices=['fp32', 'fp16'],
help='arithmetic type')
parser.add_argument('--max-length-train', default=50, type=int,
help='maximum sequence length for training \
(including special BOS and EOS tokens)')
parser.add_argument('--min-length-train', default=0, type=int,
help='minimum sequence length for training \
(including special BOS and EOS tokens)')
parser.add_argument('--rank', default=0, type=int,
help='global rank of the process, do not set!')
parser.add_argument('--local_rank', default=os.getenv('LOCAL_RANK', 0), type=int,
help='local rank of the process, do not set!')
args = parser.parse_args()
return args
def build_collate_fn(max_seq_len, parallel=True):
def collate_seq(seq):
lengths = torch.tensor([len(s) for s in seq])
batch_length = max_seq_len
shape = (len(seq), batch_length)
seq_tensor = torch.full(shape, config.PAD, dtype=torch.int64)
for i, s in enumerate(seq):
end_seq = lengths[i]
seq_tensor[i, :end_seq].copy_(s[:end_seq])
return (seq_tensor, lengths)
def parallel_collate(seqs):
src_seqs, tgt_seqs = zip(*seqs)
return tuple([collate_seq(s) for s in [src_seqs, tgt_seqs]])
return parallel_collate
def load_dataset(tokenizer, args):
train_data = LazyParallelDataset(
src_fname=os.path.join(args.dataset_dir, config.SRC_TRAIN_FNAME),
tgt_fname=os.path.join(args.dataset_dir, config.TGT_TRAIN_FNAME),
tokenizer=tokenizer,
min_len=args.min_length_train,
max_len=args.max_length_train,
sort=False,
max_size=args.max_size)
collate_fn = build_collate_fn(max_seq_len=args.max_length_train,
parallel=True)
loader = DataLoader(train_data,
batch_size=1024,
collate_fn=collate_fn,
num_workers=min(os.cpu_count(), 16),
timeout=120,
drop_last=False)
srcs = []
tgts = []
src_lengths = []
tgt_lengths = []
for (src, src_len), (tgt, tgt_len) in loader:
src_lengths.append(src_len)
tgt_lengths.append(tgt_len)
srcs.append(src)
tgts.append(tgt)
srcs = torch.cat(srcs)
tgts = torch.cat(tgts)
src_lengths = torch.cat(src_lengths)
tgt_lengths = torch.cat(tgt_lengths)
return srcs, tgts, src_lengths, tgt_lengths
def broadcast_dataset(world_size, rank, max_length_train, srcs, tgts,
src_lengths, tgt_lengths):
assert world_size > 1
# Broadcast preprocessed dataset length
if rank == 0:
sizes = torch.tensor(src_lengths.shape, device='cuda',
dtype=torch.int64)
else:
sizes = torch.zeros((1,), device='cuda', dtype=torch.int64)
torch.distributed.broadcast(sizes, 0)
nsamples = sizes.item()
    # Prepare tensor for receiving the preprocessed dataset
if rank == 0:
srcs_cuda, tgts_cuda, src_lengths_cuda, tgt_lengths_cuda = \
srcs.cuda(), tgts.cuda(), src_lengths.cuda(), tgt_lengths.cuda()
else:
srcs_cuda = torch.empty((nsamples, max_length_train),
device='cuda', dtype=torch.int64)
tgts_cuda = torch.empty((nsamples, max_length_train),
device='cuda', dtype=torch.int64)
src_lengths_cuda = torch.empty((nsamples,), device='cuda',
dtype=torch.int64)
tgt_lengths_cuda = torch.empty((nsamples,), device='cuda',
dtype=torch.int64)
# Broadcast preprocessed dataset
torch.distributed.broadcast(srcs_cuda, 0)
torch.distributed.broadcast(tgts_cuda, 0)
torch.distributed.broadcast(src_lengths_cuda, 0)
torch.distributed.broadcast(tgt_lengths_cuda, 0)
if rank > 0:
srcs, tgts, src_lengths, tgt_lengths = srcs_cuda.cpu(), \
tgts_cuda.cpu(), src_lengths_cuda.cpu(), tgt_lengths_cuda.cpu()
return srcs, tgts, src_lengths, tgt_lengths
def main():
args = parse_args()
use_cuda = True
device = utils.set_device(use_cuda, args.local_rank)
distributed = utils.init_distributed(use_cuda)
rank = utils.get_rank()
world_size = utils.get_world_size()
utils.setup_logging()
logging.info(f'Run arguments: {args}')
pad_vocab = utils.pad_vocabulary(args.math)
tokenizer = Tokenizer(os.path.join(args.dataset_dir, config.VOCAB_FNAME),
pad_vocab)
# Pre-process dataset only on master node
if rank == 0:
srcs, tgts, src_lengths, tgt_lengths = load_dataset(tokenizer, args)
else:
srcs, tgts, src_lengths, tgt_lengths = None, None, None, None
time.sleep(30)
# Broadcast preprocessed dataset to other ranks
if world_size > 1:
srcs, tgts, src_lengths, tgt_lengths = broadcast_dataset(
world_size, rank, args.max_length_train,
srcs, tgts, src_lengths, tgt_lengths)
preproc_train_data = PreprocessedDataset(
min_len=args.min_length_train,
max_len=args.max_length_train,
vocab_size=tokenizer.vocab_size,
)
os.makedirs(args.preproc_data_dir, exist_ok=True)
preproc_train_data.write_data(
os.path.join(args.preproc_data_dir, 'training.bin'),
(srcs, src_lengths),
(tgts, tgt_lengths),
)
if __name__ == "__main__":
main()
pytablewriter==0.64.0
sacrebleu==1.2.10
sacremoses==0.0.19
pynvml==8.0.4
#git+https://github.com/rsennrich/subword-nmt.git@48ba99e657591c329e0003f0c6e32e493fa959ef
#!/bin/bash
# for multi-node training
source `pwd`/config_DGX1_multi.sh
set -e
# start timing
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"
export NCCL_DEBUG=${NCCL_DEBUG:-"WARN"}
# run benchmark
set -x
DATASET_DIR='../wmt16_de_en/'
PREPROC_DATADIR='./preproc_data'
RESULTS_DIR='gnmt_wmt16'
## DL params
export LR=${LR:-"2.0e-3"}
export TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-64}
export TEST_BATCH_SIZE=${TEST_BATCH_SIZE:-64}
export WARMUP_STEPS=${WARMUP_STEPS:-200}
export REMAIN_STEPS=${REMAIN_STEPS:-6453}
export DECAY_INTERVAL=${DECAY_INTERVAL:-809}
export TARGET=${TARGET:-24.0}
export MAX_SEQ_LEN=${MAX_SEQ_LEN:-75}
export NUMEPOCHS=${NUMEPOCHS:-20}
export MATH=${MATH:-fp32}
export DIST_OPTS=${DIST_OPTS-"\
--distributed-weight-update 2 \
--dwu-num-blocks 1 \
--dwu-num-chunks 2 \
--dwu-num-rs-pg 2 \
--dwu-num-ar-pg 2 \
--dwu-num-ag-pg 0 \
--dwu-grad-norm \
"}
export EXTRA_OPTS=${EXTRA_OPTS-"\
--fused-attention \
--fused-xentropy \
--no-log-all-ranks \
"}
declare -a CMD
echo "running benchmark"
CMD_ARGS=("--save ${RESULTS_DIR}" "--dataset-dir ${DATASET_DIR}" "--preproc-data-dir ${PREPROC_DATADIR}/${MAX_SEQ_LEN}" "--target-bleu $TARGET" "--epochs "${NUMEPOCHS}"" "--math ${MATH}" "--max-length-train ${MAX_SEQ_LEN}" "--print-freq 10" "--train-batch-size $TRAIN_BATCH_SIZE" "--test-batch-size $TEST_BATCH_SIZE" "--optimizer FusedAdam" "--lr $LR" "--warmup-steps $WARMUP_STEPS" "--remain-steps $REMAIN_STEPS" "--decay-interval $DECAY_INTERVAL")
hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
#rm `pwd`/hostfile-dl -f
cat ${hostfile} > `pwd`/tmp
dist_url=`sed -n '1p' ./tmp`
#echo $dist_url
rank=0
num_lines=`cat ./tmp |wc -l`
for((i=0;i<$num_lines-1;i++))
do
((rank=$i+1))
nodename=$(cat ./tmp |sed -n "${rank}p")
ssh ${nodename} "cd `pwd` && module rm compiler/rocm/2.9 && source /public/home/aiss/Pytorch/env_torch1.5_miopen24.sh && python3 -m bind_launch --nnodes=$num_lines --node_rank=$i --master_addr=${dist_url} --master_port=4567 --nsockets_per_node ${DGXNSOCKET} --ncores_per_socket ${DGXSOCKETCORES} --nproc_per_node $SLURM_NTASKS_PER_NODE --no_hyperthreads `pwd`/train.py ${CMD_ARGS[@]}" &
done
((i=$num_lines-1))
nodename=$(cat ./tmp |sed -n "${num_lines}p")
ssh ${nodename} "cd `pwd` && module rm compiler/rocm/2.9 && source /public/home/aiss/Pytorch/env_torch1.5_miopen24.sh && python3 -m bind_launch --nnodes=$num_lines --node_rank=$i --master_addr=${dist_url} --master_port=4567 --nsockets_per_node ${DGXNSOCKET} --ncores_per_socket ${DGXSOCKETCORES} --nproc_per_node $SLURM_NTASKS_PER_NODE --no_hyperthreads `pwd`/train.py ${CMD_ARGS[@]}"
set +x
sleep 3
# end timing
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
echo "ENDING TIMING RUN AT $end_fmt"
# report result
result=$(( $end - $start ))
result_name="RNN_TRANSLATOR"
echo "RESULT,$result_name,,$result,nvidia,$start_fmt"
#!/bin/bash
# for single-node training
source `pwd`/config_DGX1.sh
set -e
# start timing
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"
export NCCL_DEBUG=${NCCL_DEBUG:-"WARN"}
# run benchmark
set -x
DATASET_DIR='../wmt16_de_en/'
PREPROC_DATADIR='./preproc_data'
RESULTS_DIR='gnmt_wmt16'
DIST_OPTS=${DIST_OPTS:-""}
EXTRA_OPTS=${EXTRA_OPTS:-""}
declare -a CMD
CMD=( 'python3' '-u' '-m' 'bind_launch' "--nsockets_per_node=${DGXNSOCKET}" \
"--ncores_per_socket=${DGXSOCKETCORES}" "--nproc_per_node=${DGXNGPU}" "--no_hyperthreads")
echo "running benchmark"
#for 1 node fp32 training
"${CMD[@]}" train.py \
--save ${RESULTS_DIR} \
--dataset-dir ${DATASET_DIR} \
--preproc-data-dir ${PREPROC_DATADIR}/${MAX_SEQ_LEN} \
--target-bleu $TARGET \
--epochs "${NUMEPOCHS}" \
--math ${MATH} \
--max-length-train ${MAX_SEQ_LEN} \
--print-freq 10 \
--train-batch-size $TRAIN_BATCH_SIZE \
--test-batch-size $TEST_BATCH_SIZE \
--optimizer Adam \
--lr $LR \
--warmup-steps $WARMUP_STEPS \
--remain-steps $REMAIN_STEPS \
--decay-interval $DECAY_INTERVAL \
$DIST_OPTS \
$EXTRA_OPTS ; ret_code=$?
set +x
sleep 3
if [[ $ret_code != 0 ]]; then exit $ret_code; fi
# end timing
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
echo "ENDING TIMING RUN AT $end_fmt"
# report result
result=$(( $end - $start ))
result_name="RNN_TRANSLATOR"
echo "RESULT,$result_name,,$result,nvidia,$start_fmt"
#!/bin/bash
source `pwd`/config_DGX1.sh
set -e
# start timing
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"
export NCCL_DEBUG=${NCCL_DEBUG:-"WARN"}
# run benchmark
set -x
DATASET_DIR='../wmt16_de_en/'
PREPROC_DATADIR='./preproc_data'
RESULTS_DIR='gnmt_wmt16'
DIST_OPTS=${DIST_OPTS:-""}
EXTRA_OPTS=${EXTRA_OPTS:-""}
declare -a CMD
CMD=( 'python3' '-u' '-m' 'bind_launch' "--nsockets_per_node=${DGXNSOCKET}" \
"--ncores_per_socket=${DGXSOCKETCORES}" "--nproc_per_node=${DGXNGPU}" "--no_hyperthreads")
echo "running benchmark"
# run training
#for 1 card fp32 training
HIP_VISIBLE_DEVICES=0 python3 train.py \
--save ${RESULTS_DIR} \
--dataset-dir ${DATASET_DIR} \
--preproc-data-dir ${PREPROC_DATADIR}/${MAX_SEQ_LEN} \
--target-bleu $TARGET \
--epochs "${NUMEPOCHS}" \
--math ${MATH} \
--max-length-train ${MAX_SEQ_LEN} \
--print-freq 10 \
--train-batch-size $TRAIN_BATCH_SIZE \
--test-batch-size $TEST_BATCH_SIZE \
--optimizer Adam \
--lr $LR \
--warmup-steps $WARMUP_STEPS \
--remain-steps $REMAIN_STEPS \
--decay-interval $DECAY_INTERVAL \
$EXTRA_OPTS ; ret_code=$?
set +x
sleep 3
if [[ $ret_code != 0 ]]; then exit $ret_code; fi
# end timing
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
echo "ENDING TIMING RUN AT $end_fmt"
# report result
result=$(( $end - $start ))
result_name="RNN_TRANSLATOR"
echo "RESULT,$result_name,,$result,nvidia,$start_fmt"
@@ -20,9 +20,7 @@
 import argparse
 from collections import Counter
-import sys
-import importlib
-importlib.reload(sys)
 def parse_args():
     parser = argparse.ArgumentParser(description='Clean dataset')
@@ -32,8 +30,7 @@ def parse_args():
 def save_output(fname, data):
-    #with open(fname, 'w') as f:
-    with open(fname, 'w', encoding='utf-8') as f:
+    with open(fname, 'w') as f:
         f.writelines(data)
@@ -74,8 +71,7 @@ def main():
     data1 = []
     data2 = []
-    #with open(args.file1) as f1, open(args.file2) as f2:
-    with open(args.file1, 'r', encoding='utf-8') as f1, open(args.file2, 'r', encoding='utf-8') as f2:
+    with open(args.file1) as f1, open(args.file2) as f2:
         for idx, lines in enumerate(zip(f1, f2)):
             line1, line2 = lines
             if idx % 100000 == 1: