Commit 01bc05b7 authored by Pan,Huiwen's avatar Pan,Huiwen
Browse files

update GNMT-v2

parent 20291e9d
# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. # Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Permission is hereby granted, free of charge, to any person obtaining a copy
# you may not use this file except in compliance with the License. # of this software and associated documentation files (the "Software"), to deal
# You may obtain a copy of the License at # in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# #
# http://www.apache.org/licenses/LICENSE-2.0 # The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# #
# Unless required by applicable law or agreed to in writing, software # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# distributed under the License is distributed on an "AS IS" BASIS, # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# See the License for the specific language governing permissions and # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# limitations under the License. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3 ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3
FROM ${FROM_IMAGE_NAME} FROM ${FROM_IMAGE_NAME}
# Install dependencies for system configuration logger ENV LANG C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \ ENV LC_ALL C.UTF-8
infiniband-diags \
pciutils && \
rm -rf /var/lib/apt/lists/*
# Install Python dependencies
WORKDIR /workspace/rnn_translator
COPY requirements.txt . RUN pip install --global-option="--cpp_ext" --global-option="--cuda_ext" git+https://github.com/NVIDIA/apex.git#egg=apex
RUN pip install --no-cache-dir https://github.com/mlperf/logging/archive/9ea0afa.zip \
&& pip install --no-cache-dir -r requirements.txt
# Copy & build extensions WORKDIR /workspace/gnmt
COPY seq2seq/csrc seq2seq/csrc
COPY setup.py .
RUN pip install .
# Copy GNMT code COPY requirements.txt .
COPY . . RUN pip install --no-cache-dir -r requirements.txt
RUN pip install --no-cache-dir git+https://github.com/NVIDIA/dllogger.git#egg=dllogger
# Configure environment variables ADD . /workspace/gnmt
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
MIT License
Copyright (c) 2017 Elad Hoffer Copyright (c) 2017 Elad Hoffer
Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal of this software and associated documentation files (the "Software"), to deal
......
This diff is collapsed.
# 1. Problem
This problem uses a recurrent neural network to do language translation.
## Requirements
* [nvidia-docker](https://github.com/NVIDIA/nvidia-docker)
* [PyTorch 20.06-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch)
* Slurm with [Pyxis](https://github.com/NVIDIA/pyxis) and [Enroot](https://github.com/NVIDIA/enroot)
# 2. Directions
## Steps to download and verify data
Download the data using the following command:
```
cd ..
bash download_dataset.sh
cd -
```
Verify data with:
```
cd ..
bash verify_dataset.sh
cd -
```
## Steps to launch training
### NVIDIA DGX A100 (single node)
Launch configuration and system-specific hyperparameters for the NVIDIA DGX A100
single node submission are in the `config_DGXA100.sh` script.
Steps required to launch single node training on NVIDIA DGX A100:
1. Build the container and push to a docker registry:
```
docker build --pull -t <docker/registry>/mlperf-nvidia:rnn_translator .
docker push <docker/registry>/mlperf-nvidia:rnn_translator
```
2. Launch the training:
```
source config_DGXA100.sh
CONT="<docker/registry>/mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
#### Alternative launch with nvidia-docker
When generating results for the official v0.7 submission with one node, the
benchmark was launched onto a cluster managed by a SLURM scheduler. The
instructions in [NVIDIA DGX A100 (single node)](#nvidia-dgx-a100-single-node) explain
how that is done.
However, to make it easier to run this benchmark on a wider set of machine
environments, we are providing here an alternate set of launch instructions
that can be run using nvidia-docker. Note that performance or functionality may
vary from the tested SLURM instructions.
```
docker build --pull -t mlperf-nvidia:rnn_translator .
source config_DGXA100.sh
CONT="mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> ./run_with_docker.sh
```
### NVIDIA DGX-2H (single node)
Launch configuration and system-specific hyperparameters for the NVIDIA DGX-2H
single node submission are in the `config_DGX2.sh` script.
Steps required to launch single node training on NVIDIA DGX-2H:
1. Build the container and push to a docker registry:
```
docker build --pull -t <docker/registry>/mlperf-nvidia:rnn_translator .
docker push <docker/registry>/mlperf-nvidia:rnn_translator
```
2. Launch the training:
```
source config_DGX2.sh
CONT="<docker/registry>/mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
#### Alternative launch with nvidia-docker
When generating results for the official v0.7 submission with one node, the
benchmark was launched onto a cluster managed by a SLURM scheduler. The
instructions in [NVIDIA DGX-2H (single node)](#nvidia-dgx-2h-single-node) explain
how that is done.
However, to make it easier to run this benchmark on a wider set of machine
environments, we are providing here an alternate set of launch instructions
that can be run using nvidia-docker. Note that performance or functionality may
vary from the tested SLURM instructions.
```
docker build --pull -t mlperf-nvidia:rnn_translator .
source config_DGX2.sh
CONT="mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> ./run_with_docker.sh
```
### NVIDIA DGX-1 (single node)
Launch configuration and system-specific hyperparameters for the NVIDIA DGX-1
single node submission are in the `config_DGX1.sh` script.
Steps required to launch single node training on NVIDIA DGX-1:
1. Build the container and push to a docker registry:
```
docker build --pull -t <docker/registry>/mlperf-nvidia:rnn_translator .
docker push <docker/registry>/mlperf-nvidia:rnn_translator
```
2. Launch the training:
```
source config_DGX1.sh
CONT="<docker/registry>/mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
#### Alternative launch with nvidia-docker
When generating results for the official v0.7 submission with one node, the
benchmark was launched onto a cluster managed by a SLURM scheduler. The
instructions in [NVIDIA DGX-1 (single node)](#nvidia-dgx-1-single-node) explain
how that is done.
However, to make it easier to run this benchmark on a wider set of machine
environments, we are providing here an alternate set of launch instructions
that can be run using nvidia-docker. Note that performance or functionality may
vary from the tested SLURM instructions.
```
docker build --pull -t mlperf-nvidia:rnn_translator .
source config_DGX1.sh
CONT="mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> ./run_with_docker.sh
```
### NVIDIA DGX A100 (multi node)
Launch configuration and system-specific hyperparameters for the NVIDIA DGX A100
multi node submission are in the following scripts:
* for the 2-node NVIDIA DGX A100 submission: `config_DGXA100_multi_2x8x192_dist.sh`
* for the 32-node NVIDIA DGX A100 submission: `config_DGXA100_multi_32x8x32_dist.sh`
* for the 128-node NVIDIA DGX A100 submission: `config_DGXA100_multi_128x8x16_dist.sh`
Steps required to launch multi node training on NVIDIA DGX A100:
1. Build the docker container and push to a docker registry
```
docker build --pull -t <docker/registry>/mlperf-nvidia:rnn_translator .
docker push <docker/registry>/mlperf-nvidia:rnn_translator
```
2. Launch the training
2-node NVIDIA DGX A100 training:
```
source config_DGXA100_multi_2x8x192_dist.sh
CONT="<docker/registry>/mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME --ntasks-per-node $DGXNGPU run.sub
```
32-node NVIDIA DGX A100 training:
```
source config_DGXA100_multi_32x8x32_dist.sh
CONT="<docker/registry>/mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME --ntasks-per-node $DGXNGPU run.sub
```
128-node NVIDIA DGX A100 training:
```
source config_DGXA100_multi_128x8x16_dist.sh
CONT="<docker/registry>/mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME --ntasks-per-node $DGXNGPU run.sub
```
### NVIDIA DGX-2H (multi node)
Launch configuration and system-specific hyperparameters for the NVIDIA DGX-2H
multi node submission are in the following scripts:
* for the 16-node NVIDIA DGX-2H submission: `config_DGX2_multi_16x16x32.sh`
* for the 64-node NVIDIA DGX-2H submission: `config_DGX2_multi_64x16x16.sh`
Steps required to launch multi node training on NVIDIA DGX-2H:
1. Build the docker container and push to a docker registry
```
docker build --pull -t <docker/registry>/mlperf-nvidia:rnn_translator .
docker push <docker/registry>/mlperf-nvidia:rnn_translator
```
2. Launch the training
16-node NVIDIA DGX-2H training:
```
source config_DGX2_multi_16x16x32.sh
CONT="<docker/registry>/mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME --ntasks-per-node $DGXNGPU run.sub
```
64-node NVIDIA DGX-2H training:
```
source config_DGX2_multi_64x16x16.sh
CONT="<docker/registry>/mlperf-nvidia:rnn_translator" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME --ntasks-per-node $DGXNGPU run.sub
```
# 3. Dataset/Environment
### Publication/Attribution
We use [WMT16 English-German](http://www.statmt.org/wmt16/translation-task.html)
for training.
### Data preprocessing
Script uses [subword-nmt](https://github.com/rsennrich/subword-nmt) package to
segment text into subword units (BPE), by default it builds shared vocabulary of
32,000 tokens.
Preprocessing removes all pairs of sentences that can't be decoded by latin-1
encoder.
### Vocabulary
Vocabulary is generated by the following lines from the `download_dataset.sh`
script:
```
# Create vocabulary file for BPE
cat "${OUTPUT_DIR}/train.tok.clean.bpe.${merge_ops}.en" "${OUTPUT_DIR}/train.tok.clean.bpe.${merge_ops}.de" | \
${OUTPUT_DIR}/subword-nmt/get_vocab.py | cut -f1 -d ' ' > "${OUTPUT_DIR}/vocab.bpe.${merge_ops}"
```
Vocabulary is stored to the `rnn_translator/data/vocab.bpe.32000` plain text
file. Tokens are separated with a newline character, one token per line. The
vocabulary file doesn't contain special tokens like for example BOS
(begin-of-string) or EOS (end-of-string) tokens.
Here are first 10 lines from the `rnn_translator/data/vocab.bpe.32000` file:
```
,
.
the
in
of
and
die
der
to
und
```
### Text datasets
The `download_dataset.sh` script automatically creates training, validation and
test datasets. Datasets are stored as plain text files. Sentences are separated
with a newline character, and tokens within each sentence are separated with a
single space character.
Training data:
* source language (English): `rnn_translator/data/train.tok.clean.bpe.32000.en`
* target language (German): `rnn_translator/data/train.tok.clean.bpe.32000.de`
Validation data:
* source language (English): `rnn_translator/data/newstest_dev.tok.clean.bpe.32000.en`
* target language (German): `rnn_translator/data/newstest_dev.tok.clean.bpe.32000.de`
Test data:
* source language (English): `rnn_translator/data/newstest2014.tok.bpe.32000.en`
* target language (German): `rnn_translator/data/newstest2014.de`
* notice that the `newstest2014.de` file isn't tokenized, BLEU evaluation is
performed by the sacrebleu package and sacrebleu expects plain text raw data
(tokenization is performed internally by sacrebleu)
Here are first 5 lines from the `rnn_translator/data/train.tok.clean.bpe.32000.en` file:
```
Res@@ um@@ ption of the session
I declare resumed the session of the European Parliament ad@@ jour@@ ned on Friday 17 December 1999 , and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant fes@@ tive period .
Although , as you will have seen , the d@@ read@@ ed &apos; millenn@@ ium bug &apos; failed to materi@@ alise , still the people in a number of countries suffered a series of natural disasters that truly were d@@ read@@ ful .
You have requested a debate on this subject in the course of the next few days , during this part-session .
In the meantime , I should like to observe a minute &apos; s silence , as a number of Members have requested , on behalf of all the victims concerned , particularly those of the terrible stor@@ ms , in the various countries of the European Union .
```
And here are first 5 lines from the `rnn_translator/data/train.tok.clean.bpe.32000.de` file:
```
Wiederaufnahme der Sitzungsperiode
Ich erkläre die am Freitag , dem 17. Dezember unterbro@@ ch@@ ene Sitzungsperiode des Europäischen Parlaments für wieder@@ aufgenommen , wünsche Ihnen nochmals alles Gute zum Jahres@@ wechsel und hoffe , daß Sie schöne Ferien hatten .
Wie Sie feststellen konnten , ist der ge@@ für@@ ch@@ tete &quot; Mill@@ en@@ i@@ um-@@ Bu@@ g &quot; nicht eingetreten . Doch sind Bürger einiger unserer Mitgliedstaaten Opfer von schrecklichen Naturkatastrophen geworden .
Im Parlament besteht der Wunsch nach einer Aussprache im Verlauf dieser Sitzungsperiode in den nächsten Tagen .
Heute möchte ich Sie bitten - das ist auch der Wunsch einiger Kolleginnen und Kollegen - , allen Opfern der St@@ ür@@ me , insbesondere in den verschiedenen Ländern der Europäischen Union , in einer Schwei@@ ge@@ minute zu ge@@ denken .
```
### Training and test data separation
Training uses WMT16 English-German dataset, validation is on concatenation of
newstest2015 and newstest2016, BLEU evaluation is done on newstest2014.
### Data filtering
Training is executed only on pairs of sentences which satisfy the following equation:
```
min_len <= src sentence sequence length <= max_len AND
min_len <= tgt sentence sequence length <= max_len
```
`min_len` is set to 0, `max_len` is set to 75. Source and target sequence
lengths include special BOS (begin-of-sentence) and EOS (end-of-sentence)
tokens.
Filtering is implemented in `pytorch/seq2seq/data/dataset.py`, class
`LazyParallelDataset`.
### Training data order
Training script does bucketing by sequence length. Bucketing algorithm uses 5
equal-width buckets (`num_buckets = 5`). Pairs of training sentences are
assigned to buckets by the value of
`max(src_sentence_len // bucket_width, tgt_sentence_len // bucket_width)`, where
`bucket_width = (max_len + num_buckets - 1) // num_buckets`.
Before each training epoch batches are randomly sampled from the buckets (last
incomplete batches are dropped for each bucket), then all batches are
reshuffled.
Bucketing is implemented in `pytorch/seq2seq/data/sampler.py`, class
`BucketingSampler`.
# 4. Model
### Publication/Attribution
Implemented model is similar to the one from [Google's Neural Machine
Translation System: Bridging the Gap between Human and Machine
Translation](https://arxiv.org/abs/1609.08144) paper.
Most important difference is in the attention mechanism. This repository
implements `gnmt_v2` attention: output from first LSTM layer of decoder goes
into attention, then re-weighted context is concatenated with inputs to all
subsequent LSTM layers in decoder at current timestep.
The same attention mechanism is also implemented in default
GNMT-like models from [tensorflow/nmt](https://github.com/tensorflow/nmt) and
[NVIDIA/OpenSeq2Seq](https://github.com/NVIDIA/OpenSeq2Seq).
### Structure
* general:
* encoder and decoder are using shared embeddings
* data-parallel multi-gpu training
* trained with label smoothing loss (smoothing factor 0.1)
* encoder:
* 4-layer LSTM, hidden size 1024, first layer is bidirectional, the rest of
layers are unidirectional
* with residual connections starting from 3rd LSTM layer
* uses standard pytorch nn.LSTM layer
* dropout is applied on input to all LSTM layers, probability of dropout is
set to 0.2
* hidden state of LSTM layers is initialized with zeros
* weights and bias of LSTM layers is initialized with uniform(-0.1, 0.1)
distribution
* decoder:
* 4-layer unidirectional LSTM with hidden size 1024 and fully-connected
classifier
* with residual connections starting from 3rd LSTM layer
* uses standard pytorch nn.LSTM layer
* dropout is applied on input to all LSTM layers, probability of dropout is
set to 0.2
* hidden state of LSTM layers is initialized with zeros
* weights and bias of LSTM layers is initialized with uniform(-0.1, 0.1)
distribution
* weights and bias of fully-connected classifier is initialized with
uniform(-0.1, 0.1) distribution
* attention:
* normalized Bahdanau attention
* model uses `gnmt_v2` attention mechanism
* output from first LSTM layer of decoder goes into attention,
then re-weighted context is concatenated with the input to all subsequent
LSTM layers in decoder at the current timestep
* linear transform of keys and queries is initialized with uniform(-0.1, 0.1),
normalization scalar is initialized with 1.0 / sqrt(1024),
normalization bias is initialized with zero
* inference:
* beam search with beam size of 5
* with coverage penalty and length normalization, coverage penalty factor is
set to 0.1, length normalization factor is set to 0.6 and length
normalization constant is set to 5.0
* BLEU computed by [sacrebleu](https://pypi.org/project/sacrebleu/)
Implementation:
* base Seq2Seq model: `pytorch/seq2seq/models/seq2seq_base.py`, class `Seq2Seq`
* GNMT model: `pytorch/seq2seq/models/gnmt.py`, class `GNMT`
* encoder: `pytorch/seq2seq/models/encoder.py`, class `ResidualRecurrentEncoder`
* decoder: `pytorch/seq2seq/models/decoder.py`, class `ResidualRecurrentDecoder`
* attention: `pytorch/seq2seq/models/attention.py`, class `BahdanauAttention`
* inference (including BLEU evaluation and detokenization): `pytorch/seq2seq/inference/inference.py`, class `Translator`
* beam search: `pytorch/seq2seq/inference/beam_search.py`, class `SequenceGenerator`
### Loss function
Cross entropy loss with label smoothing (smoothing factor = 0.1), padding is not
considered part of the loss.
Loss function is implemented in `pytorch/seq2seq/train/smoothing.py`, class
`LabelSmoothing`.
### Optimizer
Adam optimizer with learning rate 1e-3, beta1 = 0.9, beta2 = 0.999, epsilon =
1e-8 and no weight decay.
Network is trained with gradient clipping, max L2 norm of gradients is set to 5.0.
Optimizer is implemented in `pytorch/seq2seq/train/fp_optimizers.py`, class
`Fp32Optimizer`.
### Learning rate schedule
Model is trained with exponential learning rate warmup for 200 steps and with
step learning rate decay. Decay is started after 2/3 of training steps, decays
for a total of 4 times, at regularly spaced intervals, decay factor is 0.5.
Learning rate scheduler is implemented in
`pytorch/seq2seq/train/lr_scheduler.py`, class `WarmupMultiStepLR`.
# 5. Quality
### Quality metric
Uncased BLEU score on newstest2014 en-de dataset.
BLEU scores reported by [sacrebleu](https://pypi.org/project/sacrebleu/)
package (version 1.2.10). Sacrebleu is executed with the following flags:
`--score-only -lc --tokenize intl`.
### Quality target
Uncased BLEU score of 24.00.
### Evaluation frequency
Evaluation of BLEU score is done after every epoch.
### Evaluation thoroughness
Evaluation uses all of `newstest2014.en` (3003 sentences).
#! /bin/bash
set -euo pipefail
# Print usage/help text for this binding launcher to stdout.
# ${0} inside the heredoc expands to the script's invocation name;
# the rest of the heredoc is emitted verbatim.
print_usage() {
cat << EOF
${0} [options] [--] COMMAND [ARG...]
Control binding policy for each task. Assumes one rank will be launched for each GPU.
Options:
--cpu=MODE
* exclusive -- bind each rank to an exclusive set of cores near its GPU
* exclusive,nosmt -- bind each rank to an exclusive set of cores near its GPU, without hyperthreading
* node -- bind each rank to all cores in the NUMA node nearest its GPU [default]
* *.sh -- bind each rank using the bash associative array bind_cpu_cores or bind_cpu_nodes from a file
* off -- don't bind
--mem=MODE
* node -- bind each rank to the nearest NUMA node [default]
* *.sh -- bind each rank using the bash associative array bind_mem from a file
* off -- don't bind
--ib=MODE
* single -- bind each rank to a single IB device near its GPU
* off -- don't bind [default]
--cluster=CLUSTER
Select which cluster is being used. May be required if system params cannot be detected.
EOF
}
################################################################################
# Argument parsing
################################################################################
# Defaults match the behavior documented in print_usage.
cpu_mode='node'
mem_mode='node'
ib_mode='off'
cluster=''
# Accept both --flag=value and --flag value forms; "--" ends option parsing.
while [ $# -gt 0 ]; do
    case "$1" in
        -h|--help) print_usage ; exit 0 ;;
        --cpu=*) cpu_mode="${1/*=/}"; shift ;;
        --cpu) cpu_mode="$2"; shift 2 ;;
        --mem=*) mem_mode="${1/*=/}"; shift ;;
        --mem) mem_mode="$2"; shift 2 ;;
        --ib=*) ib_mode="${1/*=/}"; shift ;;
        --ib) ib_mode="$2"; shift 2 ;;
        --cluster=*) cluster="${1/*=/}"; shift ;;
        --cluster) cluster="$2"; shift 2 ;;
        --) shift; break ;;
        *) break ;;
    esac
done
if [ $# -lt 1 ]; then
    # Fix: original used '2>&1', which left the message on stdout;
    # '>&2' routes the error to stderr as intended.
    echo 'ERROR: no command given' >&2
    print_usage
    exit 1
fi
################################################################################
# Get system params
################################################################################
# Determine this process's local rank from whichever launcher set it.
# LOCAL_RANK is set with an enroot hook for Pytorch containers
# SLURM_LOCALID is set by Slurm
# OMPI_COMM_WORLD_LOCAL_RANK is set by mpirun
# NOTE: the ':=' expansions also assign LOCAL_RANK / SLURM_LOCALID in this
# shell's environment when they were previously unset.
readonly local_rank="${LOCAL_RANK:=${SLURM_LOCALID:=${OMPI_COMM_WORLD_LOCAL_RANK:-}}}"
if [ -z "${local_rank}" ]; then
echo 'ERROR: cannot read LOCAL_RANK from env' >&2
exit 1
fi
# Query GPU 0 for the total GPU count visible on this host.
num_gpus=$(nvidia-smi -i 0 --query-gpu=count --format=csv,noheader,nounits)
if [ "${local_rank}" -ge "${num_gpus}" ]; then
echo "ERROR: local rank is ${local_rank}, but there are only ${num_gpus} gpus available" >&2
exit 1
fi
# Extract the value for a given "Key: value" field from lscpu output on stdin;
# exits non-zero if the key is not found.
get_lscpu_value() {
awk -F: "(\$1 == \"${1}\"){gsub(/ /, \"\", \$2); print \$2; found=1} END{exit found!=1}"
}
lscpu_out=$(lscpu)
num_sockets=$(get_lscpu_value 'Socket(s)' <<< "${lscpu_out}")
num_nodes=$(get_lscpu_value 'NUMA node(s)' <<< "${lscpu_out}")
cores_per_socket=$(get_lscpu_value 'Core(s) per socket' <<< "${lscpu_out}")
echo "num_sockets = ${num_sockets} num_nodes=${num_nodes} cores_per_socket=${cores_per_socket}"
# Derived topology: physical cores per NUMA node, GPUs per node, and the
# core budget / home NUMA node for this rank.
readonly cores_per_node=$(( (num_sockets * cores_per_socket) / num_nodes ))
if [ ${num_gpus} -gt 1 ]; then
readonly gpus_per_node=$(( num_gpus / num_nodes ))
else
readonly gpus_per_node=1
fi
readonly cores_per_gpu=$(( cores_per_node / gpus_per_node ))
readonly local_node=$(( local_rank / gpus_per_node ))
# Build the list of InfiniBand devices to bind against, either from a
# hardcoded per-cluster table or by auto-detection with ibstat.
declare -a ibdevs=()
case "${cluster}" in
circe)
# Need to specialize for circe because IB detection is hard
ibdevs=(mlx5_1 mlx5_2 mlx5_3 mlx5_4 mlx5_7 mlx5_8 mlx5_9 mlx5_10)
;;
selene)
# Need to specialize for selene because IB detection is hard
ibdevs=(mlx5_0 mlx5_1 mlx5_2 mlx5_3 mlx5_6 mlx5_7 mlx5_8 mlx5_9)
;;
'')
# No cluster given: auto-detect; best-effort (empty list if ibstat fails).
if ibstat_out="$(ibstat -l 2>/dev/null | sort -V)" ; then
mapfile -t ibdevs <<< "${ibstat_out}"
fi
;;
*)
echo "ERROR: Unknown cluster '${cluster}'" >&2
exit 1
;;
esac
readonly num_ibdevs="${#ibdevs[@]}"
################################################################################
# Setup for exec
################################################################################
# Translate the selected CPU binding mode into numactl arguments.
declare -a numactl_args=()
case "${cpu_mode}" in
    exclusive)
        # Bind to an exclusive core range plus a second range offset by the
        # total number of physical cores (presumably the SMT siblings of the
        # first range — TODO confirm against system core numbering).
        numactl_args+=( "$(printf -- "--physcpubind=%u-%u,%u-%u" \
            $(( local_rank * cores_per_gpu )) \
            $(( (local_rank + 1) * cores_per_gpu - 1 )) \
            $(( local_rank * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) )) \
            $(( (local_rank + 1) * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) - 1 )) \
        )" )
        ;;
    exclusive,nosmt)
        # Bind to an exclusive range of physical cores only.
        numactl_args+=( "$(printf -- "--physcpubind=%u-%u" \
            $(( local_rank * cores_per_gpu )) \
            $(( (local_rank + 1) * cores_per_gpu - 1 )) \
        )" )
        ;;
    node)
        # Bind to every core of the NUMA node computed for this rank.
        numactl_args+=( "--cpunodebind=${local_node}" )
        ;;
    *.sh)
        # Load a user-provided affinity file defining bind_cpu_cores
        # or bind_cpu_nodes, indexed by local rank.
        source "${cpu_mode}"
        if [ -n "${bind_cpu_cores:-}" ]; then
            numactl_args+=( "--physcpubind=${bind_cpu_cores[${local_rank}]}" )
        elif [ -n "${bind_cpu_nodes:-}" ]; then
            numactl_args+=( "--cpunodebind=${bind_cpu_nodes[${local_rank}]}" )
        else
            echo "ERROR: invalid CPU affinity file ${cpu_mode}." >&2
            exit 1
        fi
        ;;
    off|'')
        ;;
    *)
        # Fix: original used '2>&1' (stdout); errors belong on stderr.
        echo "ERROR: invalid cpu mode '${cpu_mode}'" >&2
        print_usage
        exit 1
        ;;
esac
# Translate the selected memory binding mode into numactl arguments.
case "${mem_mode}" in
    node)
        # Restrict allocations to this rank's nearest NUMA node.
        numactl_args+=( "--membind=${local_node}" )
        ;;
    *.sh)
        # Load a user-provided affinity file defining bind_mem,
        # indexed by local rank.
        source "${mem_mode}"
        if [ -z "${bind_mem:-}" ]; then
            echo "ERROR: invalid memory affinity file ${mem_mode}." >&2
            exit 1
        fi
        numactl_args+=( "--membind=${bind_mem[${local_rank}]}" )
        ;;
    off|'')
        ;;
    *)
        # Fix: original used '2>&1' (stdout); errors belong on stderr.
        echo "ERROR: invalid mem mode '${mem_mode}'" >&2
        print_usage
        exit 1
        ;;
esac
# Select an InfiniBand device for this rank and export the transport
# environment variables (only if the user has not already set them).
case "${ib_mode}" in
    single)
        if [ "${num_ibdevs}" -eq 0 ]; then
            # Fix: original used '2>&1' (stdout); warnings belong on stderr.
            echo "WARNING: used '$0 --ib=single', but there are 0 IB devices available; skipping IB binding." >&2
        else
            # Map local rank onto the device list proportionally.
            readonly ibdev="${ibdevs[$(( local_rank * num_ibdevs / num_gpus ))]}"
            # '${VAR-default}' keeps any pre-existing user setting.
            export OMPI_MCA_btl_openib_if_include="${OMPI_MCA_btl_openib_if_include-$ibdev}"
            export UCX_NET_DEVICES="${UCX_NET_DEVICES-$ibdev:1}"
        fi
        ;;
    off|'')
        ;;
    *)
        # Fix: original used '2>&1' (stdout); errors belong on stderr.
        echo "ERROR: invalid ib mode '${ib_mode}'" >&2
        print_usage
        exit 1
        ;;
esac
################################################################################
# Exec
################################################################################
# Replace this shell with the user command, under numactl when any
# binding arguments were accumulated above.
if [ "${#numactl_args[@]}" -eq 0 ] ; then
    exec "${@}"
else
    set -x
    exec numactl "${numactl_args[@]}" -- "${@}"
fi
import sys
import subprocess
import os
import socket
from argparse import ArgumentParser, REMAINDER
import torch
def parse_args():
    """
    Helper function parsing the command line options.

    @retval argparse.Namespace with launch topology (nnodes, node_rank,
            nproc_per_node), rendezvous settings (master_addr, master_port),
            binding options (no_hyperthreads, no_membind, nsockets_per_node,
            ncores_per_socket), and the training script plus its arguments.
    """
    parser = ArgumentParser(description="PyTorch distributed training launch "
                                        "helper utility that will spawn up "
                                        "multiple distributed processes")

    # Optional arguments for the launch helper
    parser.add_argument("--nnodes", type=int, default=1,
                        help="The number of nodes to use for distributed "
                             "training")
    parser.add_argument("--node_rank", type=int, default=0,
                        help="The rank of the node for multi-node distributed "
                             "training")
    parser.add_argument("--nproc_per_node", type=int, default=1,
                        help="The number of processes to launch on each node, "
                             "for GPU training, this is recommended to be set "
                             "to the number of GPUs in your system so that "
                             "each process can be bound to a single GPU.")
    parser.add_argument("--master_addr", default="127.0.0.1", type=str,
                        help="Master node (rank 0)'s address, should be either "
                             "the IP address or the hostname of node 0, for "
                             "single node multi-proc training, the "
                             "--master_addr can simply be 127.0.0.1")
    parser.add_argument("--master_port", default=29500, type=int,
                        help="Master node (rank 0)'s free port that needs to "
                             "be used for communication during distributed "
                             "training")
    parser.add_argument('--no_hyperthreads', action='store_true',
                        help='Flag to disable binding to hyperthreads')
    parser.add_argument('--no_membind', action='store_true',
                        help='Flag to disable memory binding')

    # non-optional arguments for binding
    parser.add_argument("--nsockets_per_node", type=int, required=True,
                        help="Number of CPU sockets on a node")
    parser.add_argument("--ncores_per_socket", type=int, required=True,
                        help="Number of CPU cores per socket")

    # positional
    parser.add_argument("training_script", type=str,
                        help="The full path to the single GPU training "
                             "program/script to be launched in parallel, "
                             "followed by all the arguments for the "
                             "training script")

    # rest from the training program: everything after the script path is
    # passed through verbatim (argparse.REMAINDER).
    parser.add_argument('training_script_args', nargs=REMAINDER)
    return parser.parse_args()
def main():
    """
    Launch one training process per local rank, each bound with numactl.

    Computes a per-rank physical-core range (and optionally a memory node)
    from the socket/core topology flags, sets the PyTorch distributed
    rendezvous environment variables (MASTER_ADDR, MASTER_PORT, WORLD_SIZE,
    RANK), spawns the training script once per local rank, and waits for
    all children to finish.

    Raises:
        subprocess.CalledProcessError: if any child exits with a non-zero
            status. (The original ignored child exit codes, so the launcher
            reported success even when training failed.)
    """
    args = parse_args()

    # variables for numactl binding
    NSOCKETS = args.nsockets_per_node
    # ceil(nproc_per_node / nsockets_per_node): GPUs sharing one socket
    NGPUS_PER_SOCKET = (args.nproc_per_node // args.nsockets_per_node) + (1 if (args.nproc_per_node % args.nsockets_per_node) else 0)
    NCORES_PER_GPU = args.ncores_per_socket // NGPUS_PER_SOCKET

    # world size in terms of number of processes
    dist_world_size = args.nproc_per_node * args.nnodes

    # set PyTorch distributed related environmental variables
    current_env = os.environ.copy()
    current_env["MASTER_ADDR"] = args.master_addr
    current_env["MASTER_PORT"] = str(args.master_port)
    current_env["WORLD_SIZE"] = str(dist_world_size)

    processes = []

    for local_rank in range(0, args.nproc_per_node):
        # each process's global rank
        dist_rank = args.nproc_per_node * args.node_rank + local_rank
        current_env["RANK"] = str(dist_rank)

        # form numactl binding ranges: [start, end] of physical cores, then
        # a second range offset by the total physical-core count (presumably
        # the SMT siblings — TODO confirm against system core numbering)
        cpu_ranges = [local_rank * NCORES_PER_GPU,
                      (local_rank + 1) * NCORES_PER_GPU - 1,
                      local_rank * NCORES_PER_GPU + (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS),
                      (local_rank + 1) * NCORES_PER_GPU + (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS) - 1]

        numactlargs = []
        if args.no_hyperthreads:
            # bind only the first (physical) core range
            numactlargs += ["--physcpubind={}-{}".format(*cpu_ranges[0:2])]
        else:
            # bind both core ranges
            numactlargs += ["--physcpubind={}-{},{}-{}".format(*cpu_ranges)]

        if not args.no_membind:
            # bind memory to the socket hosting this rank's GPU
            memnode = local_rank // NGPUS_PER_SOCKET
            numactlargs += ["--membind={}".format(memnode)]

        # spawn the processes
        cmd = ["/usr/bin/numactl"] \
            + numactlargs \
            + [sys.executable,
               "-u",
               args.training_script,
               "--local_rank={}".format(local_rank)
               ] \
            + args.training_script_args
        print("cmd: ", cmd)
        process = subprocess.Popen(cmd, env=current_env)
        processes.append(process)

    # Fix: propagate child failures instead of always exiting successfully.
    for process in processes:
        process.wait()
        if process.returncode != 0:
            raise subprocess.CalledProcessError(returncode=process.returncode,
                                                cmd=process.args)
## System run parms
export DGXNNODES=1
# Derive the system name from this config file's own filename:
# config_<NAME>.sh -> <NAME>.
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
export WALLTIME=${WALLTIME:-"00:30:00"}
## DL params
# All values below can be overridden from the caller's environment via
# the ${VAR:-default} / ${VAR-default} expansions.
export LR=${LR:-"2.0e-3"}
export TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-64}
export TEST_BATCH_SIZE=${TEST_BATCH_SIZE:-64}
export WARMUP_STEPS=${WARMUP_STEPS:-200}
export REMAIN_STEPS=${REMAIN_STEPS:-6453}
export DECAY_INTERVAL=${DECAY_INTERVAL:-809}
export TARGET=${TARGET:-24.0}
export MAX_SEQ_LEN=${MAX_SEQ_LEN:-75}
export NUMEPOCHS=${NUMEPOCHS:-15}
export MATH=${MATH:-fp32}
# Disabled variant kept for reference: same options but with
# '--distributed-weight-update 2' enabled.
#export DIST_OPTS=${DIST_OPTS-"\
# --distributed-weight-update 2 \
# --dwu-num-blocks 1 \
# --dwu-num-chunks 2 \
# --dwu-num-rs-pg 2 \
# --dwu-num-ar-pg 2 \
# --dwu-num-ag-pg 0 \
# --dwu-grad-norm \
# "}
# Active distributed-weight-update options (without the
# '--distributed-weight-update 2' flag of the commented variant above).
export DIST_OPTS=${DIST_OPTS-"\
--dwu-num-blocks 1 \
--dwu-num-chunks 2 \
--dwu-num-rs-pg 2 \
--dwu-num-ar-pg 2 \
--dwu-num-ag-pg 0 \
--dwu-grad-norm \
"}
export EXTRA_OPTS=${EXTRA_OPTS-"\
--fused-attention \
--fused-xentropy \
--no-log-all-ranks \
"}
## System config params
export DGXNGPU=4
export DGXSOCKETCORES=8
export DGXHT=1 # HT is on is 2, HT off is 1
export DGXNSOCKET=4
# Disabled single-GPU/single-socket variant kept for reference.
#export DGXNGPU=1
#export DGXSOCKETCORES=8
#export DGXHT=1 # HT is on is 2, HT off is 1
#export DGXNSOCKET=1
## System run parms
export DGXNNODES=1
# Derive the system name from this config file's own filename:
# config_<NAME>.sh -> <NAME>.
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
export WALLTIME=${WALLTIME:-"00:30:00"}
# DL params; all values can be overridden from the caller's environment
# via the ${VAR:-default} / ${VAR-default} expansions.
export LR=${LR:-"2.0e-3"}
export TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-256}
export TEST_BATCH_SIZE=${TEST_BATCH_SIZE:-128}
export WARMUP_STEPS=${WARMUP_STEPS:-200}
export REMAIN_STEPS=${REMAIN_STEPS:-6453}
export DECAY_INTERVAL=${DECAY_INTERVAL:-809}
export TARGET=${TARGET:-24.0}
export MAX_SEQ_LEN=${MAX_SEQ_LEN:-75}
export NUMEPOCHS=${NUMEPOCHS:-8}
export MATH=${MATH:-fp16}
# Distributed-weight-update options, including the
# '--distributed-weight-update 2' mode.
export DIST_OPTS=${DIST_OPTS-"\
--distributed-weight-update 2 \
--dwu-num-blocks 1 \
--dwu-num-chunks 2 \
--dwu-num-rs-pg 2 \
--dwu-num-ar-pg 2 \
--dwu-num-ag-pg 0 \
--dwu-grad-norm \
"}
export EXTRA_OPTS=${EXTRA_OPTS-"\
--fused-attention \
--fused-xentropy \
--no-log-all-ranks \
"}
## System config params
export DGXNGPU=4
export DGXSOCKETCORES=8
export DGXHT=1 # HT is on is 2, HT off is 1
export DGXNSOCKET=4
## System run parms
# Multi-node config (2 nodes, 4 GPUs each, hyper-threading on). The DL
# hyper-parameters are deliberately commented out here: the multi-node
# run script exports its own DL params after sourcing this file, so this
# config only supplies node count and host topology.
export DGXNNODES=2
# Derive the system name from this file's name: config_<SYSTEM>.sh -> <SYSTEM>
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
export WALLTIME=${WALLTIME:-"00:30:00"}
## DL params
# Kept for reference only -- see the run script for the active values.
#export LR=${LR:-"2.0e-3"}
#export TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-64}
#export TEST_BATCH_SIZE=${TEST_BATCH_SIZE:-64}
#export WARMUP_STEPS=${WARMUP_STEPS:-200}
#export REMAIN_STEPS=${REMAIN_STEPS:-6453}
#export DECAY_INTERVAL=${DECAY_INTERVAL:-809}
#export TARGET=${TARGET:-24.0}
#export MAX_SEQ_LEN=${MAX_SEQ_LEN:-75}
#export NUMEPOCHS=${NUMEPOCHS:-15}
#export MATH=${MATH:-fp32}
#export DIST_OPTS=${DIST_OPTS-"\
#   --distributed-weight-update 2 \
#   --dwu-num-blocks 1 \
#   --dwu-num-chunks 2 \
#   --dwu-num-rs-pg 2 \
#   --dwu-num-ar-pg 2 \
#   --dwu-num-ag-pg 0 \
#   --dwu-grad-norm \
#   "}
#export EXTRA_OPTS=${EXTRA_OPTS-"\
#   --fused-attention \
#   --fused-xentropy \
#   --no-log-all-ranks \
#   "}
## System config params
export DGXNGPU=4
export DGXSOCKETCORES=8
export DGXHT=2         # HT is on is 2, HT off is 1
export DGXNSOCKET=4
# Alternative single-GPU topology, kept for reference.
#export DGXNGPU=1
#export DGXSOCKETCORES=8
#export DGXHT=1         # HT is on is 2, HT off is 1
#export DGXNSOCKET=1
import collections
import os
import subprocess
import torch
from mlperf_logging.mllog import constants
from seq2seq.utils import configure_logger, log_event
def mlperf_submission_log(benchmark):
    """Emit the standard MLPerf submission metadata events for `benchmark`.

    Initializes the NCCL process group first when running on more than
    one SLURM node, then logs benchmark name, org, division, status and
    platform via log_event.
    """
    nodes = os.environ.get('SLURM_NNODES', 1)
    # Multi-node runs bring the process group up before any logging so
    # the logger can coordinate across ranks.
    if int(nodes) > 1:
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    configure_logger(benchmark)

    # Fixed submission metadata, emitted in the order MLPerf expects.
    submission_fields = (
        (constants.SUBMISSION_BENCHMARK, benchmark),
        (constants.SUBMISSION_ORG, 'NVIDIA'),
        (constants.SUBMISSION_DIVISION, 'closed'),
        (constants.SUBMISSION_STATUS, 'onprem'),
        (constants.SUBMISSION_PLATFORM,
         f'{nodes}xSUBMISSION_PLATFORM_PLACEHOLDER'),
    )
    for key, value in submission_fields:
        log_event(key=key, value=value)
import logging
import time
import os
import argparse
import torch
from torch.utils.data import DataLoader
from seq2seq.data.tokenizer import Tokenizer
import seq2seq.data.config as config
import seq2seq.utils as utils
from seq2seq.data.dataset import LazyParallelDataset
from seq2seq.data.dataset import PreprocessedDataset
def parse_args():
    """
    Parse command-line arguments for the data-preprocessing script.

    Returns:
        argparse.Namespace with dataset paths, sequence-length bounds,
        math mode, and (distributed) rank settings.
    """
    parser = argparse.ArgumentParser(
        description='GNMT prepare data',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dataset-dir', default='data/wmt16_de_en',
                        help='path to the directory with training/test data')
    parser.add_argument('--preproc-data-dir', default='/tmp/preprocessed',
                        help='path to the directory with preprocessed \
                        training/test data')
    parser.add_argument('--max-size', default=None, type=int,
                        help='use at most MAX_SIZE elements from training \
                        dataset (useful for benchmarking), by default \
                        uses entire dataset')
    parser.add_argument('--math', default='fp32',
                        choices=['fp32', 'fp16'],
                        help='arithmetic type')
    parser.add_argument('--max-length-train', default=50, type=int,
                        help='maximum sequence length for training \
                        (including special BOS and EOS tokens)')
    parser.add_argument('--min-length-train', default=0, type=int,
                        help='minimum sequence length for training \
                        (including special BOS and EOS tokens)')
    # Rank arguments are injected by the launcher, not by the user.
    parser.add_argument('--rank', default=0, type=int,
                        help='global rank of the process, do not set!')
    # Default comes from LOCAL_RANK so torch launchers that only set the
    # environment variable still work; type=int converts the env string.
    parser.add_argument('--local_rank', default=os.getenv('LOCAL_RANK', 0), type=int,
                        help='local rank of the process, do not set!')
    args = parser.parse_args()
    return args
def build_collate_fn(max_seq_len, parallel=True):
    """
    Build a collate function that pads every sequence in a batch to the
    fixed length `max_seq_len`.

    :param max_seq_len: fixed width of the returned (batch, max_seq_len)
        int64 tensors; samples are PAD-filled up to this length
    :param parallel: kept for interface compatibility; the returned
        function always collates (src, tgt) pairs
    """
    def collate_seq(seq):
        # Record each sample's true length, then copy it into a
        # PAD-initialized fixed-width tensor.
        lengths = torch.tensor([len(s) for s in seq])
        seq_tensor = torch.full((len(seq), max_seq_len), config.PAD,
                                dtype=torch.int64)
        for idx, sample in enumerate(seq):
            n = lengths[idx]
            seq_tensor[idx, :n].copy_(sample[:n])
        return (seq_tensor, lengths)

    def parallel_collate(seqs):
        # Unzip (src, tgt) pairs and collate each side independently.
        src_seqs, tgt_seqs = zip(*seqs)
        return (collate_seq(src_seqs), collate_seq(tgt_seqs))

    return parallel_collate
def load_dataset(tokenizer, args):
    """
    Tokenize the raw parallel training corpus and materialize it as
    padded tensors.

    :param tokenizer: tokenizer converting text to token-id sequences
    :param args: parsed arguments (dataset_dir, min/max_length_train,
        max_size)

    Returns a tuple (srcs, tgts, src_lengths, tgt_lengths): the padded
    token tensors and the unpadded length of every sample.
    """
    train_data = LazyParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TRAIN_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_TRAIN_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_train,
        max_len=args.max_length_train,
        sort=False,
        max_size=args.max_size)

    # Pad every batch to the full max_length_train so all per-batch
    # chunks can be concatenated below.
    collate_fn = build_collate_fn(max_seq_len=args.max_length_train,
                                  parallel=True)

    loader = DataLoader(train_data,
                        batch_size=1024,
                        collate_fn=collate_fn,
                        num_workers=min(os.cpu_count(), 16),
                        timeout=120,
                        drop_last=False)

    # Accumulate the per-batch tensors, then fuse each list into one
    # contiguous tensor.
    src_chunks, tgt_chunks = [], []
    src_len_chunks, tgt_len_chunks = [], []
    for (src, src_len), (tgt, tgt_len) in loader:
        src_chunks.append(src)
        tgt_chunks.append(tgt)
        src_len_chunks.append(src_len)
        tgt_len_chunks.append(tgt_len)

    return (torch.cat(src_chunks),
            torch.cat(tgt_chunks),
            torch.cat(src_len_chunks),
            torch.cat(tgt_len_chunks))
def broadcast_dataset(world_size, rank, max_length_train, srcs, tgts,
                      src_lengths, tgt_lengths):
    """
    Broadcast the preprocessed dataset from rank 0 to all other ranks.

    Collectives are matched by call order, so every rank must execute
    the exact same sequence of broadcasts below with the same
    max_length_train.

    Returns (srcs, tgts, src_lengths, tgt_lengths) as CPU tensors on
    every rank; rank 0 returns its original tensors unchanged.
    """
    assert world_size > 1
    # Broadcast the number of samples first so non-zero ranks can size
    # their receive buffers.
    if rank == 0:
        sizes = torch.tensor(src_lengths.shape, device='cuda',
                             dtype=torch.int64)
    else:
        sizes = torch.zeros((1,), device='cuda', dtype=torch.int64)
    torch.distributed.broadcast(sizes, 0)
    nsamples = sizes.item()
    # Prepare CUDA tensors: rank 0 uploads its data, the others allocate
    # empty receive buffers of the broadcast size.
    if rank == 0:
        srcs_cuda, tgts_cuda, src_lengths_cuda, tgt_lengths_cuda = \
            srcs.cuda(), tgts.cuda(), src_lengths.cuda(), tgt_lengths.cuda()
    else:
        srcs_cuda = torch.empty((nsamples, max_length_train),
                                device='cuda', dtype=torch.int64)
        tgts_cuda = torch.empty((nsamples, max_length_train),
                                device='cuda', dtype=torch.int64)
        src_lengths_cuda = torch.empty((nsamples,), device='cuda',
                                       dtype=torch.int64)
        tgt_lengths_cuda = torch.empty((nsamples,), device='cuda',
                                       dtype=torch.int64)
    # Broadcast the preprocessed dataset (order must match on all ranks).
    torch.distributed.broadcast(srcs_cuda, 0)
    torch.distributed.broadcast(tgts_cuda, 0)
    torch.distributed.broadcast(src_lengths_cuda, 0)
    torch.distributed.broadcast(tgt_lengths_cuda, 0)
    # Non-zero ranks move the received tensors back to host memory.
    if rank > 0:
        srcs, tgts, src_lengths, tgt_lengths = srcs_cuda.cpu(), \
            tgts_cuda.cpu(), src_lengths_cuda.cpu(), tgt_lengths_cuda.cpu()
    return srcs, tgts, src_lengths, tgt_lengths
def main():
    """
    Entry point: pre-process the training corpus for GNMT.

    Rank 0 tokenizes and pads the dataset; with world_size > 1 the
    result is broadcast to all other ranks, and every rank then writes
    the tensors to its preproc-data directory as 'training.bin'.
    """
    args = parse_args()
    use_cuda = True
    # Bind this process to its GPU and initialize distributed mode
    # (a no-op for single-process runs).
    device = utils.set_device(use_cuda, args.local_rank)
    distributed = utils.init_distributed(use_cuda)
    rank = utils.get_rank()
    world_size = utils.get_world_size()
    utils.setup_logging()
    logging.info(f'Run arguments: {args}')
    # pad_vocab presumably adjusts the vocabulary size for the chosen
    # math mode -- confirm in utils.pad_vocabulary.
    pad_vocab = utils.pad_vocabulary(args.math)
    tokenizer = Tokenizer(os.path.join(args.dataset_dir, config.VOCAB_FNAME),
                          pad_vocab)
    # Pre-process dataset only on master node
    if rank == 0:
        srcs, tgts, src_lengths, tgt_lengths = load_dataset(tokenizer, args)
    else:
        srcs, tgts, src_lengths, tgt_lengths = None, None, None, None
        # NOTE(review): fixed 30 s sleep apparently gives rank 0 a head
        # start before the collective below -- confirm it is still needed.
        time.sleep(30)
    # Broadcast preprocessed dataset to other ranks
    if world_size > 1:
        srcs, tgts, src_lengths, tgt_lengths = broadcast_dataset(
            world_size, rank, args.max_length_train,
            srcs, tgts, src_lengths, tgt_lengths)
    # Every rank writes the preprocessed tensors to its own
    # preproc_data_dir (may be node-local storage).
    preproc_train_data = PreprocessedDataset(
        min_len=args.min_length_train,
        max_len=args.max_length_train,
        vocab_size=tokenizer.vocab_size,
    )
    os.makedirs(args.preproc_data_dir, exist_ok=True)
    preproc_train_data.write_data(
        os.path.join(args.preproc_data_dir, 'training.bin'),
        (srcs, src_lengths),
        (tgts, tgt_lengths),
    )
# Script entry point.
if __name__ == "__main__":
    main()
pytablewriter==0.64.0
sacrebleu==1.2.10
sacremoses==0.0.19
pynvml==8.0.4
#git+https://github.com/rsennrich/subword-nmt.git@48ba99e657591c329e0003f0c6e32e493fa959ef
#!/bin/bash
# Multi-node launcher: reads the node list from SLURM, starts one
# bind_launch process per node over ssh, and reports total wall time.
#for multinode
source `pwd`/config_DGX1_multi.sh
set -e
# start timing
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"
export NCCL_DEBUG=${NCCL_DEBUG:-"WARN"}
# run benchmark
set -x
DATASET_DIR='../wmt16_de_en/'
PREPROC_DATADIR='./preproc_data'
RESULTS_DIR='gnmt_wmt16'
## DL params (exported here; the sourced multi-node config only sets
## topology)
export LR=${LR:-"2.0e-3"}
export TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-64}
export TEST_BATCH_SIZE=${TEST_BATCH_SIZE:-64}
export WARMUP_STEPS=${WARMUP_STEPS:-200}
export REMAIN_STEPS=${REMAIN_STEPS:-6453}
export DECAY_INTERVAL=${DECAY_INTERVAL:-809}
export TARGET=${TARGET:-24.0}
export MAX_SEQ_LEN=${MAX_SEQ_LEN:-75}
export NUMEPOCHS=${NUMEPOCHS:-20}
export MATH=${MATH:-fp32}
export DIST_OPTS=${DIST_OPTS-"\
   --distributed-weight-update 2 \
   --dwu-num-blocks 1 \
   --dwu-num-chunks 2 \
   --dwu-num-rs-pg 2 \
   --dwu-num-ar-pg 2 \
   --dwu-num-ag-pg 0 \
   --dwu-grad-norm \
   "}
export EXTRA_OPTS=${EXTRA_OPTS-"\
   --fused-attention \
   --fused-xentropy \
   --no-log-all-ranks \
   "}
declare -a CMD
echo "running benchmark"
# NOTE(review): in "--epochs "${NUMEPOCHS}"" the inner quotes close the
# string early; the args survive only because the array is re-split
# inside the ssh command line below -- confirm the quoting is intended.
CMD_ARGS=("--save ${RESULTS_DIR}" "--dataset-dir ${DATASET_DIR}" "--preproc-data-dir ${PREPROC_DATADIR}/${MAX_SEQ_LEN}" "--target-bleu $TARGET" "--epochs "${NUMEPOCHS}"" "--math ${MATH}" "--max-length-train ${MAX_SEQ_LEN}" "--print-freq 10" "--train-batch-size $TRAIN_BATCH_SIZE" "--test-batch-size $TEST_BATCH_SIZE" "--optimizer FusedAdam" "--lr $LR" "--warmup-steps $WARMUP_STEPS" "--remain-steps $REMAIN_STEPS" "--decay-interval $DECAY_INTERVAL")
# Write the expanded SLURM node list to a job-local hostfile.
hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
#rm `pwd`/hostfile-dl -f
cat ${hostfile} > `pwd`/tmp
# First node in the list acts as the rendezvous (master) address.
dist_url=`sed -n '1p' ./tmp`
#echo $dist_url
rank=0
num_lines=`cat ./tmp |wc -l`
# Launch node ranks 0..N-2 in the background over ssh; `rank` is the
# 1-based line number used to pick the hostname for node_rank $i.
for((i=0;i<$num_lines-1;i++))
do
((rank=$i+1))
nodename=$(cat ./tmp |sed -n "${rank}p")
ssh ${nodename} "cd `pwd` && module rm compiler/rocm/2.9 && source /public/home/aiss/Pytorch/env_torch1.5_miopen24.sh && python3 -m bind_launch --nnodes=$num_lines --node_rank=$i --master_addr=${dist_url} --master_port=4567 --nsockets_per_node ${DGXNSOCKET} --ncores_per_socket ${DGXSOCKETCORES} --nproc_per_node $SLURM_NTASKS_PER_NODE --no_hyperthreads `pwd`/train.py ${CMD_ARGS[@]}" &
done
# The highest node rank runs in the foreground so the script blocks on it.
# NOTE(review): the background ssh jobs are never `wait`ed for, so the
# timing below only covers the foreground node -- confirm intended.
((i=$num_lines-1))
nodename=$(cat ./tmp |sed -n "${num_lines}p")
ssh ${nodename} "cd `pwd` && module rm compiler/rocm/2.9 && source /public/home/aiss/Pytorch/env_torch1.5_miopen24.sh && python3 -m bind_launch --nnodes=$num_lines --node_rank=$i --master_addr=${dist_url} --master_port=4567 --nsockets_per_node ${DGXNSOCKET} --ncores_per_socket ${DGXSOCKETCORES} --nproc_per_node $SLURM_NTASKS_PER_NODE --no_hyperthreads `pwd`/train.py ${CMD_ARGS[@]}"
set +x
sleep 3
# end timing
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
echo "ENDING TIMING RUN AT $end_fmt"
# report result
result=$(( $end - $start ))
result_name="RNN_TRANSLATOR"
echo "RESULT,$result_name,,$result,nvidia,$start_fmt"
#!/bin/bash
# Single-node launcher: uses bind_launch to start DGXNGPU NUMA-bound
# training processes and reports total wall time.
# for single node
source `pwd`/config_DGX1.sh
set -e
# start timing
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"
export NCCL_DEBUG=${NCCL_DEBUG:-"WARN"}
# run benchmark
set -x
DATASET_DIR='../wmt16_de_en/'
PREPROC_DATADIR='./preproc_data'
RESULTS_DIR='gnmt_wmt16'
# Fall back to empty strings so the unquoted expansions below expand to
# nothing when the sourced config did not set them.
DIST_OPTS=${DIST_OPTS:-""}
EXTRA_OPTS=${EXTRA_OPTS:-""}
declare -a CMD
CMD=( 'python3' '-u' '-m' 'bind_launch' "--nsockets_per_node=${DGXNSOCKET}" \
      "--ncores_per_socket=${DGXSOCKETCORES}" "--nproc_per_node=${DGXNGPU}" "--no_hyperthreads")
echo "running benchmark"
#for 1 node fp32 training
"${CMD[@]}" train.py \
  --save ${RESULTS_DIR} \
  --dataset-dir ${DATASET_DIR} \
  --preproc-data-dir ${PREPROC_DATADIR}/${MAX_SEQ_LEN} \
  --target-bleu $TARGET \
  --epochs "${NUMEPOCHS}" \
  --math ${MATH} \
  --max-length-train ${MAX_SEQ_LEN} \
  --print-freq 10 \
  --train-batch-size $TRAIN_BATCH_SIZE \
  --test-batch-size $TEST_BATCH_SIZE \
  --optimizer Adam \
  --lr $LR \
  --warmup-steps $WARMUP_STEPS \
  --remain-steps $REMAIN_STEPS \
  --decay-interval $DECAY_INTERVAL \
  $DIST_OPTS \
  $EXTRA_OPTS ; ret_code=$?
set +x
sleep 3
# Propagate the training exit status (set -e is bypassed by `; ret_code=$?`).
if [[ $ret_code != 0 ]]; then exit $ret_code; fi
# end timing
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
echo "ENDING TIMING RUN AT $end_fmt"
# report result
result=$(( $end - $start ))
result_name="RNN_TRANSLATOR"
echo "RESULT,$result_name,,$result,nvidia,$start_fmt"
#!/bin/bash
# Single-GPU launcher (ROCm: HIP_VISIBLE_DEVICES): runs train.py
# directly without bind_launch and reports total wall time.
source `pwd`/config_DGX1.sh
set -e
# start timing
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"
export NCCL_DEBUG=${NCCL_DEBUG:-"WARN"}
# run benchmark
set -x
DATASET_DIR='../wmt16_de_en/'
PREPROC_DATADIR='./preproc_data'
RESULTS_DIR='gnmt_wmt16'
# NOTE(review): DIST_OPTS and the CMD array below are defined but never
# used in this single-GPU variant -- dead code kept from the multi-GPU
# script; confirm and remove.
DIST_OPTS=${DIST_OPTS:-""}
EXTRA_OPTS=${EXTRA_OPTS:-""}
declare -a CMD
CMD=( 'python3' '-u' '-m' 'bind_launch' "--nsockets_per_node=${DGXNSOCKET}" \
      "--ncores_per_socket=${DGXSOCKETCORES}" "--nproc_per_node=${DGXNGPU}" "--no_hyperthreads")
echo "running benchmark"
# run training
#for 1 card fp32 training
HIP_VISIBLE_DEVICES=0 python3 train.py \
  --save ${RESULTS_DIR} \
  --dataset-dir ${DATASET_DIR} \
  --preproc-data-dir ${PREPROC_DATADIR}/${MAX_SEQ_LEN} \
  --target-bleu $TARGET \
  --epochs "${NUMEPOCHS}" \
  --math ${MATH} \
  --max-length-train ${MAX_SEQ_LEN} \
  --print-freq 10 \
  --train-batch-size $TRAIN_BATCH_SIZE \
  --test-batch-size $TEST_BATCH_SIZE \
  --optimizer Adam \
  --lr $LR \
  --warmup-steps $WARMUP_STEPS \
  --remain-steps $REMAIN_STEPS \
  --decay-interval $DECAY_INTERVAL \
  $EXTRA_OPTS ; ret_code=$?
set +x
sleep 3
# Propagate the training exit status (set -e is bypassed by `; ret_code=$?`).
if [[ $ret_code != 0 ]]; then exit $ret_code; fi
# end timing
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
echo "ENDING TIMING RUN AT $end_fmt"
# report result
result=$(( $end - $start ))
result_name="RNN_TRANSLATOR"
echo "RESULT,$result_name,,$result,nvidia,$start_fmt"
...@@ -20,9 +20,7 @@ ...@@ -20,9 +20,7 @@
import argparse import argparse
from collections import Counter from collections import Counter
import sys
import importlib
importlib.reload(sys)
def parse_args(): def parse_args():
parser = argparse.ArgumentParser(description='Clean dataset') parser = argparse.ArgumentParser(description='Clean dataset')
...@@ -32,8 +30,7 @@ def parse_args(): ...@@ -32,8 +30,7 @@ def parse_args():
def save_output(fname, data): def save_output(fname, data):
#with open(fname, 'w') as f: with open(fname, 'w') as f:
with open(fname, 'w', encoding='utf-8') as f:
f.writelines(data) f.writelines(data)
...@@ -74,8 +71,7 @@ def main(): ...@@ -74,8 +71,7 @@ def main():
data1 = [] data1 = []
data2 = [] data2 = []
#with open(args.file1) as f1, open(args.file2) as f2: with open(args.file1) as f1, open(args.file2) as f2:
with open(args.file1, 'r', encoding='utf-8') as f1, open(args.file2, 'r', encoding='utf-8') as f2:
for idx, lines in enumerate(zip(f1, f2)): for idx, lines in enumerate(zip(f1, f2)):
line1, line2 = lines line1, line2 = lines
if idx % 100000 == 1: if idx % 100000 == 1:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment