# Contributing to FAIR Sequence-to-Sequence Toolkit (PyTorch)
We want to make contributing to this project as easy and transparent as
possible.
## Pull Requests
We actively welcome your pull requests.
1. Fork the repo and create your branch from `master`.
2. If you've added code that should be tested, add tests.
3. If you've changed APIs, update the documentation.
4. Ensure the test suite passes.
5. Make sure your code lints.
6. If you haven't already, complete the Contributor License Agreement ("CLA").
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
to do this once to work on any of Facebook's open source projects.
Complete your CLA here: <https://code.facebook.com/cla>
## Issues
We use GitHub issues to track public bugs. Please ensure your description is
clear and has sufficient instructions to be able to reproduce the issue.
## Coding Style
We try to follow the PEP 8 style guidelines and encourage you to as well.
## License
By contributing to FAIR Sequence-to-Sequence Toolkit, you agree that your contributions will be licensed
under the LICENSE file in the root directory of this source tree.
<system version="1">
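<!-- Interpretation note (comment added for readability): PCI class 0x0604 is a
     PCI-to-PCI bridge, 0x0302 a 3D controller (GPU), and 0x0207 an InfiniBand NIC. -->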
<cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000,00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:01:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:03:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:07:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0000:12:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
</pci>
<pci busid="0000:0a:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:0f:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0000:0c:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
</pci>
</pci>
</cpu>
<cpu numaid="1" affinity="00000000,00000000,00000000,ffff0000,00000000,00000000,00000000,ffff0000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:41:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:43:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:47:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0000:54:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
</pci>
<pci busid="0000:49:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:4e:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0000:4b:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
</pci>
</pci>
</cpu>
<cpu numaid="7" affinity="ffff0000,00000000,00000000,00000000,ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:81:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:83:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:87:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0000:94:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
</pci>
<pci busid="0000:8b:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:90:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0000:8d:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
</pci>
</pci>
</cpu>
<cpu numaid="5" affinity="00000000,ffff0000,00000000,00000000,00000000,ffff0000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:b1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:b3:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:b7:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0000:ca:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
</pci>
<pci busid="0000:b8:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:bd:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0000:ba:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
</pci>
</pci>
</cpu>
</system>
# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3
FROM ${FROM_IMAGE_NAME}
# Install dependencies for system configuration logger
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
infiniband-diags \
pciutils \
&& rm -rf /var/lib/apt/lists/*
# Install Python dependencies
WORKDIR /workspace/translation
COPY requirements.txt .
RUN pip install --no-cache-dir https://github.com/mlperf/logging/archive/9ea0afa.zip \
&& pip install --no-cache-dir -r requirements.txt
# Copy and build Transformer
COPY . .
RUN pip install -e .
# Force tcmalloc
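# tcmalloc (from gperftools) replaces glibc malloc when preloaded; this is a
# performance optimization and is not required for functional correctness.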
ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4
BSD License
For fairseq software
Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name Facebook nor the names of its contributors may be used to
endorse or promote products derived from this software without specific
prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Additional Grant of Patent Rights Version 2
"Software" means the fairseq software distributed by Facebook, Inc.
Facebook, Inc. ("Facebook") hereby grants to each recipient of the Software
("you") a perpetual, worldwide, royalty-free, non-exclusive, irrevocable
(subject to the termination provision below) license under any Necessary
Claims, to make, have made, use, sell, offer to sell, import, and otherwise
transfer the Software. For avoidance of doubt, no license is granted under
Facebook’s rights in any patent claims that are infringed by (i) modifications
to the Software made by you or any third party or (ii) the Software in
combination with any software or other technology.
The license granted hereunder will terminate, automatically and without notice,
if you (or any of your subsidiaries, corporate affiliates or agents) initiate
directly or indirectly, or take a direct financial interest in, any Patent
Assertion: (i) against Facebook or any of its subsidiaries or corporate
affiliates, (ii) against any party if such Patent Assertion arises in whole or
in part from any software, technology, product or service of Facebook or any of
its subsidiaries or corporate affiliates, or (iii) against any party relating
to the Software. Notwithstanding the foregoing, if Facebook or any of its
subsidiaries or corporate affiliates files a lawsuit alleging patent
infringement against you in the first instance, and you respond by filing a
patent infringement counterclaim in that lawsuit against that party that is
unrelated to the Software, the license granted hereunder will not terminate
under section (i) of this paragraph due to such counterclaim.
A "Necessary Claim" is a claim of a patent owned by Facebook that is
necessarily infringed by the Software standing alone.
A "Patent Assertion" is any lawsuit or other action alleging direct, indirect,
or contributory infringement or inducement to infringe any patent, including a
cross-claim or counterclaim.
# 1. Problem
This problem uses attention mechanisms (the Transformer model) to perform language translation.
## Requirements
* [PyTorch 20.06-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch)
* Slurm with [Pyxis](https://github.com/NVIDIA/pyxis) and [Enroot](https://github.com/NVIDIA/enroot)
* [nvidia-docker](https://github.com/NVIDIA/nvidia-docker)
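Before building, a quick way to confirm the base container and GPU runtime are usable (illustrative only; assumes a Docker version that supports the `--gpus` flag):
```
docker pull nvcr.io/nvidia/pytorch:20.06-py3
docker run --rm --gpus all nvcr.io/nvidia/pytorch:20.06-py3 nvidia-smi
```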
# 2. Directions
### Steps to download and verify data
Downloading and preprocessing the data is handled inside the submission scripts. To run it manually:
```
bash run_preprocessing.sh && bash run_conversion.sh
```
The raw downloaded data is stored in `/raw_data` and the preprocessed data is stored in `/workspace/translation/examples/translation/wmt14_en_de`. Your external `DATADIR` path can be mounted to this location to be used in the following steps. The vocabulary file provided by the MLPerf v0.7 transformer reference is stored inside the container at `/workspace/translation/reference_dictionary.ende.txt`.
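For reference, a minimal manual invocation might look like the following sketch; it assumes the container has already been built as `mlperf-nvidia:translation` (see the build steps below) and that the mounted directory is writable:
```
docker run -it --rm \
    -v <path/to/data/dir>:/workspace/translation/examples/translation/wmt14_en_de \
    mlperf-nvidia:translation \
    bash -c "bash run_preprocessing.sh && bash run_conversion.sh"
```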
## Steps to launch training on a single node
For single-node training, we use Docker to run our container.
### NVIDIA DGX A100 (single node)
Launch configuration and system-specific hyperparameters for the NVIDIA DGX A100
single node submission are in the `config_DGXA100.sh` script.
Steps required to launch single node training on NVIDIA DGX A100:
1. Build the container and push to a docker registry:
```
docker build --pull -t <docker/registry>/mlperf-nvidia:translation .
docker push <docker/registry>/mlperf-nvidia:translation
```
2. Launch the training:
```
source config_DGXA100.sh
CONT="<docker/registry>/mlperf-nvidia:translation" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
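The `sbatch` invocation relies on `$DGXNNODES` and `$WALLTIME`, which are exported by the sourced config script; a quick sanity check after sourcing (illustrative only):
```
source config_DGXA100.sh
echo "nodes=${DGXNNODES} walltime=${WALLTIME} gpus/node=${DGXNGPU}"
```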
#### Alternative launch with nvidia-docker
When generating results for the official v0.7 submission with one node, the
benchmark was launched onto a cluster managed by a SLURM scheduler. The
instructions in [NVIDIA DGX A100 (single node)](#nvidia-dgx-a100-single-node) explain
how that is done.
However, to make it easier to run this benchmark on a wider set of machine
environments, we are providing here an alternate set of launch instructions
that can be run using nvidia-docker. Note that performance or functionality may
vary from the tested SLURM instructions.
```
docker build --pull -t mlperf-nvidia:translation .
source config_DGXA100.sh
CONT=mlperf-nvidia:translation DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> ./run_with_docker.sh
```
### NVIDIA DGX-1 (single node)
Launch configuration and system-specific hyperparameters for the NVIDIA DGX-1
single node submission are in the `config_DGX1.sh` script.
Steps required to launch single node training on NVIDIA DGX-1:
1. Build the container and push to a docker registry:
```
docker build --pull -t <docker/registry>/mlperf-nvidia:translation .
docker push <docker/registry>/mlperf-nvidia:translation
```
2. Launch the training:
```
source config_DGX1.sh
CONT="<docker/registry>/mlperf-nvidia:translation" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
#### Alternative launch with nvidia-docker
When generating results for the official v0.7 submission with one node, the
benchmark was launched onto a cluster managed by a SLURM scheduler. The
instructions in [NVIDIA DGX-1 (single node)](#nvidia-dgx-1-single-node) explain
how that is done.
However, to make it easier to run this benchmark on a wider set of machine
environments, we are providing here an alternate set of launch instructions
that can be run using nvidia-docker. Note that performance or functionality may
vary from the tested SLURM instructions.
```
docker build --pull -t mlperf-nvidia:translation .
source config_DGX1.sh
CONT=mlperf-nvidia:translation DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> ./run_with_docker.sh
```
### NVIDIA DGX-2H (single node)
Launch configuration and system-specific hyperparameters for the NVIDIA DGX-2H
single node submission are in the `config_DGX2.sh` script.
Steps required to launch single node training on NVIDIA DGX-2H:
1. Build the container and push to a docker registry:
```
docker build --pull -t <docker/registry>/mlperf-nvidia:translation .
docker push <docker/registry>/mlperf-nvidia:translation
```
2. Launch the training:
```
source config_DGX2.sh
CONT="<docker/registry>/mlperf-nvidia:translation" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
#### Alternative launch with nvidia-docker
When generating results for the official v0.7 submission with one node, the
benchmark was launched onto a cluster managed by a SLURM scheduler. The
instructions in [NVIDIA DGX-2H (single node)](#nvidia-dgx-2h-single-node) explain
how that is done.
However, to make it easier to run this benchmark on a wider set of machine
environments, we are providing here an alternate set of launch instructions
that can be run using nvidia-docker. Note that performance or functionality may
vary from the tested SLURM instructions.
```
docker build --pull -t mlperf-nvidia:translation .
source config_DGX2.sh
CONT=mlperf-nvidia:translation DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> ./run_with_docker.sh
```
## Steps to launch training on multiple nodes
For multi-node training, we use Slurm for scheduling and Pyxis to run our container.
### NVIDIA DGX A100 (multi node)
Launch configuration and system-specific hyperparameters for the NVIDIA DGX A100
multi-node submission are in the following scripts, named by node count, GPUs per node, and per-GPU `MAX_TOKENS` (`<nodes>x<gpus>x<max_tokens>`):
* for the 2-node NVIDIA DGX A100 submission: `config_DGXA100_multi_2x8x6912.sh`
* for the 10-node NVIDIA DGX A100 submission: `config_DGXA100_multi_10x8x9216.sh`
* for the 20-node NVIDIA DGX A100 submission: `config_DGXA100_multi_20x8x4608.sh`
* for the 60-node NVIDIA DGX A100 submission: `config_DGXA100_multi_60x8x1536.sh`
Steps required to launch multi node training on NVIDIA DGX A100:
1. Build the container:
```
docker build --pull -t <docker/registry>/mlperf-nvidia:translation .
docker push <docker/registry>/mlperf-nvidia:translation
```
2. Launch the training:
2-node NVIDIA DGX A100 training:
```
source config_DGXA100_multi_2x8x6912.sh
CONT="<docker/registry>/mlperf-nvidia:translation" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
10-node NVIDIA DGX A100 training:
```
source config_DGXA100_multi_10x8x9216.sh
CONT="<docker/registry>/mlperf-nvidia:translation" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
20-node NVIDIA DGX A100 training:
```
source config_DGXA100_multi_20x8x4608.sh
CONT="<docker/registry>/mlperf-nvidia:translation" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
60-node NVIDIA DGX A100 training:
```
source config_DGXA100_multi_60x8x1536.sh
CONT="<docker/registry>/mlperf-nvidia:translation" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
### NVIDIA DGX-2H (multi node)
Launch configuration and system-specific hyperparameters for the NVIDIA DGX-2H
multi-node submission are in the following scripts (same `<nodes>x<gpus>x<max_tokens>` naming):
* for the 10-node NVIDIA DGX-2H submission: `config_DGX2_multi_10x16x4608.sh`
* for the 60-node NVIDIA DGX-2H submission: `config_DGX2_multi_60x16x768.sh`
Steps required to launch multi node training on NVIDIA DGX-2H:
1. Build the container:
```
docker build --pull -t <docker/registry>/mlperf-nvidia:translation .
docker push <docker/registry>/mlperf-nvidia:translation
```
2. Launch the training:
10-node NVIDIA DGX-2H training:
```
source config_DGX2_multi_10x16x4608.sh
CONT="<docker/registry>/mlperf-nvidia:translation" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
60-node NVIDIA DGX-2H training:
```
source config_DGX2_multi_60x16x768.sh
CONT="<docker/registry>/mlperf-nvidia:translation" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
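As a rough consistency check, the product encoded in a config's name gives the aggregate per-step token budget (an upper bound, ignoring any gradient accumulation); for example:
```
source config_DGXA100_multi_20x8x4608.sh
echo $(( DGXNNODES * DGXNGPU * MAX_TOKENS ))   # 20 * 8 * 4608 = 737280
```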
#!/bin/bash
set -euo pipefail
print_usage() {
cat << EOF
${0} [options] [--] COMMAND [ARG...]
Control binding policy for each task. Assumes one rank will be launched for each GPU.
Options:
--cpu=MODE
* exclusive -- bind each rank to an exclusive set of cores near its GPU
* exclusive,nosmt -- bind each rank to an exclusive set of cores near its GPU, without hyperthreading
* node -- bind each rank to all cores in the NUMA node nearest its GPU [default]
* *.sh -- bind each rank using the bash associative array bind_cpu_cores or bind_cpu_nodes from a file
* off -- don't bind
--mem=MODE
* node -- bind each rank to the nearest NUMA node [default]
* *.sh -- bind each rank using the bash associative array bind_mem from a file
* off -- don't bind
--ib=MODE
* single -- bind each rank to a single IB device near its GPU
* off -- don't bind [default]
--cluster=CLUSTER
Select which cluster is being used. May be required if system params cannot be detected.
EOF
}
################################################################################
# Argument parsing
################################################################################
cpu_mode='node'
mem_mode='node'
ib_mode='off'
cluster=''
while [ $# -gt 0 ]; do
case "$1" in
-h|--help) print_usage ; exit 0 ;;
--cpu=*) cpu_mode="${1/*=/}"; shift ;;
--cpu) cpu_mode="$2"; shift 2 ;;
--mem=*) mem_mode="${1/*=/}"; shift ;;
--mem) mem_mode="$2"; shift 2 ;;
--ib=*) ib_mode="${1/*=/}"; shift ;;
--ib) ib_mode="$2"; shift 2 ;;
--cluster=*) cluster="${1/*=/}"; shift ;;
--cluster) cluster="$2"; shift 2 ;;
--) shift; break ;;
*) break ;;
esac
done
if [ $# -lt 1 ]; then
echo 'ERROR: no command given' >&2
print_usage
exit 1
fi
################################################################################
# Get system params
################################################################################
# LOCAL_RANK is set with an enroot hook for Pytorch containers
# SLURM_LOCALID is set by Slurm
# OMPI_COMM_WORLD_LOCAL_RANK is set by mpirun
readonly local_rank="${LOCAL_RANK:=${SLURM_LOCALID:=${OMPI_COMM_WORLD_LOCAL_RANK:-}}}"
if [ -z "${local_rank}" ]; then
echo 'ERROR: cannot read LOCAL_RANK from env' >&2
exit 1
fi
num_gpus=$(nvidia-smi -i 0 --query-gpu=count --format=csv,noheader,nounits)
if [ "${local_rank}" -ge "${num_gpus}" ]; then
echo "ERROR: local rank is ${local_rank}, but there are only ${num_gpus} gpus available" >&2
exit 1
fi
get_lscpu_value() {
awk -F: "(\$1 == \"${1}\"){gsub(/ /, \"\", \$2); print \$2; found=1} END{exit found!=1}"
}
lscpu_out=$(lscpu)
num_sockets=$(get_lscpu_value 'Socket(s)' <<< "${lscpu_out}")
num_nodes=$(get_lscpu_value 'NUMA node(s)' <<< "${lscpu_out}")
cores_per_socket=$(get_lscpu_value 'Core(s) per socket' <<< "${lscpu_out}")
echo "num_sockets = ${num_sockets} num_nodes=${num_nodes} cores_per_socket=${cores_per_socket}"
readonly cores_per_node=$(( (num_sockets * cores_per_socket) / num_nodes ))
if [ ${num_gpus} -gt 1 ]; then
readonly gpus_per_node=$(( num_gpus / num_nodes ))
else
readonly gpus_per_node=1
fi
readonly cores_per_gpu=$(( cores_per_node / gpus_per_node ))
readonly local_node=$(( local_rank / gpus_per_node ))
declare -a ibdevs=()
case "${cluster}" in
circe)
# Need to specialize for circe because IB detection is hard
ibdevs=(mlx5_1 mlx5_2 mlx5_3 mlx5_4 mlx5_7 mlx5_8 mlx5_9 mlx5_10)
;;
selene)
# Need to specialize for selene because IB detection is hard
ibdevs=(mlx5_0 mlx5_1 mlx5_2 mlx5_3 mlx5_6 mlx5_7 mlx5_8 mlx5_9)
;;
'')
if ibstat_out="$(ibstat -l 2>/dev/null | sort -V)" ; then
mapfile -t ibdevs <<< "${ibstat_out}"
fi
;;
*)
echo "ERROR: Unknown cluster '${cluster}'" >&2
exit 1
;;
esac
readonly num_ibdevs="${#ibdevs[@]}"
################################################################################
# Setup for exec
################################################################################
declare -a numactl_args=()
case "${cpu_mode}" in
exclusive)
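# Two CPU ranges: the rank's physical cores, plus their SMT (hyperthread)
# siblings, whose IDs are offset by the total physical core count under the
# usual Linux enumeration.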
numactl_args+=( "$(printf -- "--physcpubind=%u-%u,%u-%u" \
$(( local_rank * cores_per_gpu )) \
$(( (local_rank + 1) * cores_per_gpu - 1 )) \
$(( local_rank * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) )) \
$(( (local_rank + 1) * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) - 1 )) \
)" )
;;
exclusive,nosmt)
numactl_args+=( "$(printf -- "--physcpubind=%u-%u" \
$(( local_rank * cores_per_gpu )) \
$(( (local_rank + 1) * cores_per_gpu - 1 )) \
)" )
;;
node)
numactl_args+=( "--cpunodebind=${local_node}" )
;;
*.sh)
source "${cpu_mode}"
if [ -n "${bind_cpu_cores:-}" ]; then
numactl_args+=( "--physcpubind=${bind_cpu_cores[${local_rank}]}" )
elif [ -n "${bind_cpu_nodes:-}" ]; then
numactl_args+=( "--cpunodebind=${bind_cpu_nodes[${local_rank}]}" )
else
echo "ERROR: invalid CPU affinity file ${cpu_mode}." >&2
exit 1
fi
;;
off|'')
;;
*)
echo "ERROR: invalid cpu mode '${cpu_mode}'" 2>&1
print_usage
exit 1
;;
esac
case "${mem_mode}" in
node)
numactl_args+=( "--membind=${local_node}" )
;;
*.sh)
source "${mem_mode}"
if [ -z "${bind_mem:-}" ]; then
echo "ERROR: invalid memory affinity file ${mem_mode}." >&2
exit 1
fi
numactl_args+=( "--membind=${bind_mem[${local_rank}]}" )
;;
off|'')
;;
*)
echo "ERROR: invalid mem mode '${mem_mode}'" 2>&1
print_usage
exit 1
;;
esac
case "${ib_mode}" in
single)
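# Map ranks onto IB devices proportionally: e.g. with 8 GPUs and 4 HCAs,
# ranks 0-1 get device 0, ranks 2-3 device 1, and so on.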
if [ "${num_ibdevs}" -eq 0 ]; then
echo "WARNING: used '$0 --ib=single', but there are 0 IB devices available; skipping IB binding." 2>&1
else
readonly ibdev="${ibdevs[$(( local_rank * num_ibdevs / num_gpus ))]}"
export OMPI_MCA_btl_openib_if_include="${OMPI_MCA_btl_openib_if_include-$ibdev}"
export UCX_NET_DEVICES="${UCX_NET_DEVICES-$ibdev:1}"
fi
;;
off|'')
;;
*)
echo "ERROR: invalid ib mode '${ib_mode}'" 2>&1
print_usage
exit 1
;;
esac
################################################################################
# Exec
################################################################################
if [ "${#numactl_args[@]}" -gt 0 ] ; then
set -x
exec numactl "${numactl_args[@]}" -- "${@}"
else
exec "${@}"
fi
import sys
import subprocess
import os
import socket
from argparse import ArgumentParser, REMAINDER
import torch
def parse_args():
"""
Helper function parsing the command line options
@retval ArgumentParser
"""
parser = ArgumentParser(description="PyTorch distributed training launch "
"helper utility that spawns "
"multiple distributed processes")
# Optional arguments for the launch helper
parser.add_argument("--nnodes", type=int, default=1,
help="The number of nodes to use for distributed "
"training")
parser.add_argument("--node_rank", type=int, default=0,
help="The rank of the node for multi-node distributed "
"training")
parser.add_argument("--nproc_per_node", type=int, default=1,
help="The number of processes to launch on each node, "
"for GPU training, this is recommended to be set "
"to the number of GPUs in your system so that "
"each process can be bound to a single GPU.")
parser.add_argument("--master_addr", default="127.0.0.1", type=str,
help="Master node (rank 0)'s address, should be either "
"the IP address or the hostname of node 0, for "
"single node multi-proc training, the "
"--master_addr can simply be 127.0.0.1")
parser.add_argument("--master_port", default=29500, type=int,
help="Master node (rank 0)'s free port that needs to "
"be used for communciation during distributed "
"training")
parser.add_argument('--no_hyperthreads', action='store_true',
help='Flag to disable binding to hyperthreads')
parser.add_argument('--no_membind', action='store_true',
help='Flag to disable memory binding')
# non-optional arguments for binding
parser.add_argument("--nsockets_per_node", type=int, required=True,
help="Number of CPU sockets on a node")
parser.add_argument("--ncores_per_socket", type=int, required=True,
help="Number of CPU cores per socket")
# positional
parser.add_argument("training_script", type=str,
help="The full path to the single GPU training "
"program/script to be launched in parallel, "
"followed by all the arguments for the "
"training script")
# rest from the training program
parser.add_argument('training_script_args', nargs=REMAINDER)
return parser.parse_args()
def main():
args = parse_args()
# variables for numactl binding
NSOCKETS = args.nsockets_per_node
NGPUS_PER_SOCKET = args.nproc_per_node // args.nsockets_per_node
NCORES_PER_GPU = args.ncores_per_socket // NGPUS_PER_SOCKET
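# e.g. nproc_per_node=8 on 2 sockets with 20 cores each (illustrative numbers)
# gives NGPUS_PER_SOCKET=4 and NCORES_PER_GPU=5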
# world size in terms of number of processes
dist_world_size = args.nproc_per_node * args.nnodes
# set PyTorch distributed related environmental variables
current_env = os.environ.copy()
current_env["MASTER_ADDR"] = args.master_addr
current_env["MASTER_PORT"] = str(args.master_port)
current_env["WORLD_SIZE"] = str(dist_world_size)
processes = []
for local_rank in range(0, args.nproc_per_node):
# each process's rank
dist_rank = args.nproc_per_node * args.node_rank + local_rank
current_env["RANK"] = str(dist_rank)
# form numactl binding command
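# cpu_ranges[0:2] span the rank's physical cores; cpu_ranges[2:4] span the
# matching hyperthread siblings, offset by the node's total physical core count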
cpu_ranges = [local_rank * NCORES_PER_GPU,
(local_rank + 1) * NCORES_PER_GPU - 1,
local_rank * NCORES_PER_GPU + (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS),
(local_rank + 1) * NCORES_PER_GPU + (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS) - 1]
numactlargs = []
if args.no_hyperthreads:
numactlargs += [ "--physcpubind={}-{}".format(*cpu_ranges[0:2]) ]
else:
numactlargs += [ "--physcpubind={}-{},{}-{}".format(*cpu_ranges) ]
if not args.no_membind:
memnode = local_rank // NGPUS_PER_SOCKET
numactlargs += [ "--membind={}".format(memnode) ]
# spawn the processes
cmd = [ "/usr/bin/numactl" ] \
+ numactlargs \
+ [ sys.executable,
"-u",
args.training_script,
"--local_rank={}".format(local_rank)
] \
+ args.training_script_args
process = subprocess.Popen(cmd, env=current_env)
processes.append(process)
for process in processes:
process.wait()
if __name__ == "__main__":
main()
## DL params
export MAX_TOKENS=10240
export LEARNING_RATE="1.9e-3"
export WARMUP_UPDATES=750
export EXTRA_PARAMS=" --dwu-num-blocks 4 --dwu-num-rs-pg 2 --dwu-num-ar-pg 2 --dwu-num-ag-pg 0 --dwu-overlap-reductions --dwu-num-chunks 1 --dwu-flat-mt --dwu-compute-L2-grad-norm --max-source-positions 64 --max-target-positions 64 --adam-betas (0.9,0.98) "
## System run parms
export DGXNNODES=1
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
export WALLTIME=02:00:00
## System config params
export DGXNGPU=8
export DGXSOCKETCORES=20
export DGXNSOCKET=2
export DGXHT=2 # 2 = hyperthreading on, 1 = off
## DL params
export MAX_TOKENS=8192
export LEARNING_RATE="1.976e-3"
export WARMUP_UPDATES=1000
export EXTRA_PARAMS="--max-source-positions 80 --max-target-positions 80 --distributed-weight-update 2 --dwu-num-blocks 4 --dwu-num-rs-pg 2 --dwu-num-ar-pg 2 --dwu-num-ag-pg 0 --dwu-overlap-reductions --dwu-num-chunks 1 --dwu-flat-mt --dwu-compute-L2-grad-norm --adam-betas (0.9,0.98) "
## System run parms
export DGXNNODES=1
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
export WALLTIME=01:00:00
## System config params
export DGXNGPU=16
export DGXSOCKETCORES=24
export DGXNSOCKET=2
export DGXHT=2 # 2 = hyperthreading on, 1 = off
## DL params
export MAX_TOKENS=4608
export LEARNING_RATE="1.732e-3"
export WARMUP_UPDATES=400
export EXTRA_PARAMS="--distributed-weight-update 2 --dwu-num-blocks 4 --dwu-num-rs-pg 2 --dwu-num-ar-pg 2 --dwu-num-ag-pg 0 --dwu-overlap-reductions --dwu-num-chunks 1 --dwu-flat-mt --dwu-compute-L2-grad-norm --max-source-positions 76 --max-target-positions 76 --adam-betas (0.86,0.92) "
## System run parms
export DGXNNODES=10
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
export WALLTIME=00:25:00
## System config params
export DGXNGPU=16
export DGXSOCKETCORES=24
export DGXNSOCKET=2
export DGXHT=2 # 2 = hyperthreading on, 1 = off
## DL params
export MAX_TOKENS=768
export LEARNING_RATE="1.732e-3"
export WARMUP_UPDATES=400
export EXTRA_PARAMS="--distributed-weight-update 2 --dwu-num-blocks 4 --dwu-num-rs-pg 2 --dwu-num-ar-pg 2 --dwu-num-ag-pg 0 --dwu-overlap-reductions --dwu-num-chunks 1 --dwu-flat-mt --dwu-compute-L2-grad-norm --max-source-positions 76 --max-target-positions 76 --adam-betas (0.86,0.92) "
## System run parms
export DGXNNODES=60
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
export WALLTIME=00:25:00
## System config params
export DGXNGPU=16
export DGXSOCKETCORES=24
export DGXNSOCKET=2
export DGXHT=2 # 2 = hyperthreading on, 1 = off
## DL params
export MAX_TOKENS=13824
export LEARNING_RATE="1.9e-3"
export WARMUP_UPDATES=750
export EXTRA_PARAMS="--max-source-positions 64 --max-target-positions 64 --distributed-weight-update 2 --dwu-num-blocks 4 --dwu-num-rs-pg 2 --dwu-num-ar-pg 2 --dwu-num-ag-pg 0 --dwu-overlap-reductions --dwu-num-chunks 1 --dwu-flat-mt --dwu-compute-L2-grad-norm --adam-betas (0.9,0.98) "
## System run parms
export DGXNNODES=1
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
export WALLTIME=01:00:00
## System config params
export DGXNGPU=8
export DGXSOCKETCORES=64
export DGXNSOCKET=2
export DGXHT=2 # 2 = hyperthreading on, 1 = off
## DL params
export MAX_TOKENS=9216
export LEARNING_RATE="1.732e-3"
export WARMUP_UPDATES=400
export EXTRA_PARAMS="--distributed-weight-update 2 --dwu-num-blocks 4 --dwu-num-rs-pg 2 --dwu-num-ar-pg 2 --dwu-num-ag-pg 0 --dwu-overlap-reductions --dwu-num-chunks 1 --dwu-flat-mt --dwu-compute-L2-grad-norm --max-source-positions 76 --max-target-positions 76 --adam-betas (0.86,0.92) "
## System run parms
export DGXNNODES=10
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
export WALLTIME=00:40:00
## System config params
export DGXNGPU=8
export DGXSOCKETCORES=64
export DGXNSOCKET=2
export DGXHT=2 # 2 = hyperthreading on, 1 = off
# Topology file for distributed optimizer
export NCCL_TOPO_FILE=/workspace/translation/DGXA100-nic-affinity-minimal.xml
## DL params
export MAX_TOKENS=4608
export LEARNING_RATE="1.732e-3"
export WARMUP_UPDATES=400
export EXTRA_PARAMS="--distributed-weight-update 2 --dwu-num-blocks 4 --dwu-num-rs-pg 2 --dwu-num-ar-pg 2 --dwu-num-ag-pg 0 --dwu-overlap-reductions --dwu-num-chunks 1 --dwu-flat-mt --dwu-compute-L2-grad-norm --max-source-positions 76 --max-target-positions 76 --adam-betas (0.86,0.92) "
## System run parms
export DGXNNODES=20
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
export WALLTIME=00:20:00
## System config params
export DGXNGPU=8
export DGXSOCKETCORES=64
export DGXNSOCKET=2
export DGXHT=2 # 2 = hyperthreading on, 1 = off
# Topology file for distributed optimizer
export NCCL_TOPO_FILE=/workspace/translation/DGXA100-nic-affinity-minimal.xml