bert-large training

230156c4 · yangzhong · 230156c4 · 230156c4 · 230156c4 · 230156c4
Commit 230156c4 authored Oct 21, 2025 by yangzhong
20 changed files
--- a/__pycache__/tokenization.cpython-310.pyc
+++ b/__pycache__/tokenization.cpython-310.pyc
--- a/__pycache__/utils.cpython-310.pyc
+++ b/__pycache__/utils.cpython-310.pyc
--- a/bert.png
+++ b/bert.png
--- a/bert_config.json
+++ b/bert_config.json
+{
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "max_position_embeddings": 512,
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "type_vocab_size": 2,
+  "vocab_size": 30522
+}
--- a/bert_model.png
+++ b/bert_model.png
--- a/bert_pre1.sh
+++ b/bert_pre1.sh
+#!/bin/bash
+export HIP_VISIBLE_DEVICES=0
+mpirun --allow-run-as-root -np 1  single_pre1_1.sh
--- a/bert_pre1_4.log
+++ b/bert_pre1_4.log
--- a/bert_pre1_4.sh
+++ b/bert_pre1_4.sh
+export HIP_LAUNCH_BLOCKING=1
+mpirun --allow-run-as-root -np 4  single_pre1_4.sh
--- a/bert_pre1_4_fp16.sh
+++ b/bert_pre1_4_fp16.sh
+export HIP_LAUNCH_BLOCKING=1
+mpirun --allow-run-as-root -np 4  single_pre1_4_fp16.sh
+
+
--- a/bert_pre1_fp16.sh
+++ b/bert_pre1_fp16.sh
+#!/bin/bash
+mpirun --allow-run-as-root -np 1  single_pre1_1_fp16.sh
+
--- a/bert_pre2.sh
+++ b/bert_pre2.sh
+#!/bin/bash
+mpirun --allow-run-as-root -np 1  single_pre2_1.sh 
--- a/bert_pre2_4.sh
+++ b/bert_pre2_4.sh
+#!/bin/bash
+export HIP_LAUNCH_BLOCKING=1
+mpirun --allow-run-as-root -np 4  single_pre2_4.sh 
+
--- a/bert_pre2_4_fp16.sh
+++ b/bert_pre2_4_fp16.sh
+#!/bin/bash
+export HIP_LAUNCH_BLOCKING=1
+mpirun --allow-run-as-root -np 4  single_pre2_4_fp16.sh 
+
--- a/bert_pre2_fp16.sh
+++ b/bert_pre2_fp16.sh
+#!/bin/bash
+mpirun --allow-run-as-root -np 1  single_pre2_1_fp16.sh 
--- a/bert_squad.sh
+++ b/bert_squad.sh
+#!/bin/bash
+mpirun --allow-run-as-root -np 1  single_squad.sh 
+
+
+
--- a/bert_squad4.sh
+++ b/bert_squad4.sh
+#!/bin/bash
+#export LD_LIBRARY_PATH=/public/home/hepj/job_env/apps/dtk-21.10.1/lib
+
+mpirun --allow-run-as-root -np 4 single_squad4.sh  
+
+
+
+
+
--- a/bert_squad4_fp16.sh
+++ b/bert_squad4_fp16.sh
+#!/bin/bash
+#export LD_LIBRARY_PATH=/public/home/hepj/job_env/apps/dtk-21.10.1/lib
+
+mpirun --allow-run-as-root -np 4 single_squad4_fp16.sh  
+
+
+
+
+
--- a/bert_squad_fp16.sh
+++ b/bert_squad_fp16.sh
+#!/bin/bash
+mpirun --allow-run-as-root -np 1  single_squad_fp16.sh 
+
+
+
--- a/bind.sh
+++ b/bind.sh
+#!/bin/bash
+# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Per-rank binding wrapper: picks CPU / memory / InfiniBand bindings for the
+# calling rank and then execs COMMAND (see print_usage for the options).
+# The interpreter line has to be the very first line of the file to take
+# effect; this script relies on bash-only features (pipefail, arrays,
+# here-strings, mapfile), so it must not fall back to /bin/sh.
+# Abort on any failing command, unset variable, or failing pipeline stage.
+set -euo pipefail
+
+# Print the command-line help text to stdout.
+# The heredoc delimiter is unquoted, so ${0} expands to the name the script
+# was invoked with.
+print_usage() {
+    cat << EOF
+${0} [options] [--] COMMAND [ARG...]
+
+Control binding policy for each task. Assumes one rank will be launched for each GPU.
+
+Options:
+    --cpu=MODE
+        * exclusive -- bind each rank to an exclusive set of cores near its GPU
+        * exclusive,nosmt -- bind each rank to an exclusive set of cores near its GPU, without hyperthreading
+        * node -- bind each rank to all cores in the NUMA node nearest its GPU [default]
+	* *.sh -- bind each rank using the bash associative array bind_cpu_cores or bind_cpu_nodes from a file
+        * off -- don't bind
+    --mem=MODE
+        * node -- bind each rank to the nearest NUMA node [default]
+	* *.sh -- bind each rank using the bash associative array bind_mem from a file
+        * off -- don't bind
+    --ib=MODE
+        * single -- bind each rank to a single IB device near its GPU
+        * off -- donot bind [default]
+    --cluster=CLUSTER
+        Select which cluster is being used. May be required if system params cannot be detected.
+EOF
+}
+
+################################################################################
+# Argument parsing
+################################################################################
+
+# Defaults; each can be overridden with --opt=VALUE or --opt VALUE.
+cpu_mode='node'
+mem_mode='node'
+ib_mode='off'
+cluster=''
+# Consume leading options. Parsing stops at `--` or at the first word that is
+# not a recognised option; everything left in "$@" is the command to exec.
+while [ $# -gt 0 ]; do
+    case "$1" in
+        -h|--help) print_usage ; exit 0 ;;
+        # ${1#*=} removes only up to the FIRST '=', so a value that itself
+        # contains '=' (for example a path to an affinity file) stays intact.
+        --cpu=*) cpu_mode="${1#*=}"; shift ;;
+        --cpu)   cpu_mode="$2"; shift 2 ;;
+        --mem=*) mem_mode="${1#*=}"; shift ;;
+        --mem)   mem_mode="$2"; shift 2 ;;
+        --ib=*) ib_mode="${1#*=}"; shift ;;
+        --ib)   ib_mode="$2"; shift 2 ;;
+        --cluster=*) cluster="${1#*=}"; shift ;;
+        --cluster)   cluster="$2"; shift 2 ;;
+        --) shift; break ;;
+        *) break ;;
+    esac
+done
+# A command to run is mandatory; report its absence on stderr.
+if [ $# -lt 1 ]; then
+    echo 'ERROR: no command given' >&2
+    print_usage >&2
+    exit 1
+fi
+
+################################################################################
+# Get system params
+################################################################################
+
+# LOCAL_RANK is set with an enroot hook for Pytorch containers
+# SLURM_LOCALID is set by Slurm
+# OMPI_COMM_WORLD_LOCAL_RANK is set by mpirun
+# The first of the three that is set and non-empty wins. Because `:=` is used,
+# any variable earlier in the chain that was unset or empty is also assigned
+# the resolved value as a side effect.
+readonly local_rank="${LOCAL_RANK:=${SLURM_LOCALID:=${OMPI_COMM_WORLD_LOCAL_RANK:-}}}"
+if [ -z "${local_rank}" ]; then
+    echo 'ERROR: cannot read LOCAL_RANK from env' >&2
+    exit 1
+fi
+
+# NOTE(review): the GPU count is read from nvidia-smi; confirm that tool exists
+# on the target system, because a failing command substitution here aborts the
+# script under `set -e`.
+num_gpus=$(nvidia-smi -i 0 --query-gpu=count --format=csv,noheader,nounits)
+# Local ranks are zero-based, so a rank equal to the GPU count is already out of range.
+if [ "${local_rank}" -ge "${num_gpus}" ]; then
+    echo "ERROR: local rank is ${local_rank}, but there are only ${num_gpus} gpus available" >&2
+    exit 1
+fi
+
+# Read `lscpu`-style "Label: value" lines on stdin and print the value of the
+# line whose label (text before the first ':') equals $1, with every space
+# removed from the value.
+# Arguments: $1 - exact label to look up, e.g. 'Socket(s)'
+# Returns:   0 if at least one line matched, non-zero otherwise
+get_lscpu_value() {
+    awk -F: "(\$1 == \"${1}\"){gsub(/ /, \"\", \$2); print \$2; found=1} END{exit found!=1}"
+}
+# Run lscpu once and extract the three counts needed for the binding maths.
+lscpu_out=$(lscpu)
+num_sockets=$(get_lscpu_value 'Socket(s)' <<< "${lscpu_out}")
+num_nodes=$(get_lscpu_value 'NUMA node(s)' <<< "${lscpu_out}")
+cores_per_socket=$(get_lscpu_value 'Core(s) per socket' <<< "${lscpu_out}")
+
+# Log the detected topology to stdout.
+echo "num_sockets = ${num_sockets} num_nodes=${num_nodes} cores_per_socket=${cores_per_socket}"
+
+readonly cores_per_node=$(( (num_sockets * cores_per_socket) / num_nodes ))
+if [ ${num_gpus} -gt 1 ]; then
+    readonly gpus_per_node=$(( num_gpus / num_nodes ))
+else
+    readonly gpus_per_node=1
+fi
+readonly cores_per_gpu=$(( cores_per_node / gpus_per_node ))
+readonly local_node=$(( local_rank / gpus_per_node ))
+
+
+# Build the ordered list of InfiniBand devices: hard-coded for known clusters,
+# auto-detected via `ibstat -l` when no cluster is given.
+declare -a ibdevs=()
+case "${cluster}" in
+    circe)
+        # Need to specialize for circe because IB detection is hard
+        ibdevs=(mlx5_1 mlx5_2 mlx5_3 mlx5_4 mlx5_7 mlx5_8 mlx5_9 mlx5_10)
+        ;;
+    selene)
+        # Need to specialize for selene because IB detection is hard
+        ibdevs=(mlx5_0 mlx5_1 mlx5_2 mlx5_3 mlx5_6 mlx5_7 mlx5_8 mlx5_9)
+        ;;
+    '')
+        # A missing or failing ibstat just leaves the list empty. The
+        # non-empty check matters: a here-string always supplies one line, so
+        # empty output would otherwise yield a single empty "device".
+        if ibstat_out="$(ibstat -l 2>/dev/null | sort -V)" && [ -n "${ibstat_out}" ]; then
+            mapfile -t ibdevs <<< "${ibstat_out}"
+        fi
+        ;;
+    *)
+        echo "ERROR: Unknown cluster '${cluster}'" >&2
+        exit 1
+        ;;
+esac
+readonly num_ibdevs="${#ibdevs[@]}"
+
+################################################################################
+# Setup for exec
+################################################################################
+
+# numactl arguments accumulated by the binding sections; empty means "no numactl".
+declare -a numactl_args=()
+
+# CPU binding: translate --cpu into a numactl --physcpubind / --cpunodebind argument.
+case "${cpu_mode}" in
+    exclusive)
+        # Two ranges: this rank's slice of cores, plus the same slice offset
+        # by the total core count (the offset range is what exclusive,nosmt omits).
+        numactl_args+=( "$(printf -- "--physcpubind=%u-%u,%u-%u" \
+            $(( local_rank * cores_per_gpu )) \
+            $(( (local_rank + 1) * cores_per_gpu - 1 )) \
+            $(( local_rank * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) )) \
+            $(( (local_rank + 1) * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) - 1 )) \
+        )" )
+        ;;
+    exclusive,nosmt)
+        # Only the first range of the exclusive mode.
+        numactl_args+=( "$(printf -- "--physcpubind=%u-%u" \
+            $(( local_rank * cores_per_gpu )) \
+            $(( (local_rank + 1) * cores_per_gpu - 1 )) \
+        )" )
+        ;;
+    node)
+        numactl_args+=( "--cpunodebind=${local_node}" )
+        ;;
+    *.sh)
+        # The sourced file is expected to define bind_cpu_cores or
+        # bind_cpu_nodes, indexed by local rank. The -n tests look at the
+        # array's element 0, so that entry must be non-empty.
+        source "${cpu_mode}"
+        if [ -n "${bind_cpu_cores:-}" ]; then
+            numactl_args+=( "--physcpubind=${bind_cpu_cores[${local_rank}]}" )
+        elif [ -n "${bind_cpu_nodes:-}" ]; then
+            numactl_args+=( "--cpunodebind=${bind_cpu_nodes[${local_rank}]}" )
+        else
+            echo "ERROR: invalid CPU affinity file ${cpu_mode}." >&2
+            exit 1
+        fi
+        ;;
+    off|'')
+        ;;
+    *)
+        # Diagnostics belong on stderr.
+        echo "ERROR: invalid cpu mode '${cpu_mode}'" >&2
+        print_usage >&2
+        exit 1
+        ;;
+esac
+
+# Memory binding: translate --mem into a numactl --membind argument.
+case "${mem_mode}" in
+    node)
+        numactl_args+=( "--membind=${local_node}" )
+        ;;
+    *.sh)
+        # The sourced file is expected to define bind_mem, indexed by local
+        # rank. The -z test looks at the array's element 0.
+        source "${mem_mode}"
+        if [ -z "${bind_mem:-}" ]; then
+            echo "ERROR: invalid memory affinity file ${mem_mode}." >&2
+            exit 1
+        fi
+        numactl_args+=( "--membind=${bind_mem[${local_rank}]}" )
+        ;;
+    off|'')
+        ;;
+    *)
+        # Diagnostics belong on stderr.
+        echo "ERROR: invalid mem mode '${mem_mode}'" >&2
+        print_usage >&2
+        exit 1
+        ;;
+esac
+
+# InfiniBand binding: with --ib=single, pick one device for this rank and hand
+# it to Open MPI through OMPI_MCA_btl_openib_if_include.
+case "${ib_mode}" in
+    single)
+        if [ "${num_ibdevs}" -eq 0 ]; then
+            echo "WARNING: used '$0 --ib=single', but there are 0 IB devices available; skipping IB binding." >&2
+        else
+            # Scale the rank index onto the device list so ranks are spread
+            # evenly when the GPU and IB device counts differ.
+            readonly ibdev="${ibdevs[$(( local_rank * num_ibdevs / num_gpus ))]}"
+            # `-` (not `:-`): a value already present in the environment,
+            # even an empty one, takes precedence over the computed device.
+            export OMPI_MCA_btl_openib_if_include="${OMPI_MCA_btl_openib_if_include-$ibdev}"
+        fi
+        ;;
+    off|'')
+        ;;
+    *)
+        # Diagnostics belong on stderr.
+        echo "ERROR: invalid ib mode '${ib_mode}'" >&2
+        print_usage >&2
+        exit 1
+        ;;
+esac
+
+################################################################################
+# Exec
+################################################################################
+
+# Replace this shell with COMMAND, wrapped in numactl when any binding
+# arguments were collected in numactl_args.
+if [ "${#numactl_args[@]}" -gt 0 ] ; then
+    # Trace the final numactl invocation so the binding in effect is visible.
+    set -x
+    exec numactl "${numactl_args[@]}" -- "${@}"
+else
+    exec "${@}"
+fi
--- a/bind_pyt.py
+++ b/bind_pyt.py
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import subprocess
+import os
+import socket
+from argparse import ArgumentParser, REMAINDER
+
+import torch
+
+
+def parse_args():
+    """
+    Define and parse the launcher's command-line options.
+
+    --nsockets_per_node and --ncores_per_socket are required. Everything
+    after the ``training_script`` positional is collected unparsed into
+    ``training_script_args``.
+
+    @retval argparse.Namespace holding the parsed option values
+    """
+    parser = ArgumentParser(description="PyTorch distributed training launch "
+                                        "helper utilty that will spawn up "
+                                        "multiple distributed processes")
+
+    # Optional arguments for the launch helper
+    parser.add_argument("--nnodes", type=int, default=1,
+                        help="The number of nodes to use for distributed "
+                             "training")
+    parser.add_argument("--node_rank", type=int, default=0,
+                        help="The rank of the node for multi-node distributed "
+                             "training")
+    parser.add_argument("--nproc_per_node", type=int, default=1,
+                        help="The number of processes to launch on each node, "
+                             "for GPU training, this is recommended to be set "
+                             "to the number of GPUs in your system so that "
+                             "each process can be bound to a single GPU.")
+    parser.add_argument("--master_addr", default="127.0.0.1", type=str,
+                        help="Master node (rank 0)'s address, should be either "
+                             "the IP address or the hostname of node 0, for "
+                             "single node multi-proc training, the "
+                             "--master_addr can simply be 127.0.0.1")
+    parser.add_argument("--master_port", default=29500, type=int,
+                        help="Master node (rank 0)'s free port that needs to "
+                             "be used for communciation during distributed "
+                             "training")
+    parser.add_argument('--no_hyperthreads', action='store_true',
+                        help='Flag to disable binding to hyperthreads')
+    parser.add_argument('--no_membind', action='store_true',
+                        help='Flag to disable memory binding')
+
+    # required arguments describing the CPU topology used for binding
+    parser.add_argument("--nsockets_per_node", type=int, required=True,
+                        help="Number of CPU sockets on a node")
+    parser.add_argument("--ncores_per_socket", type=int, required=True,
+                        help="Number of CPU cores per socket")
+
+    # positional
+    parser.add_argument("training_script", type=str,
+                        help="The full path to the single GPU training "
+                             "program/script to be launched in parallel, "
+                             "followed by all the arguments for the "
+                             "training script")
+
+    # rest from the training program
+    parser.add_argument('training_script_args', nargs=REMAINDER)
+    return parser.parse_args()
+
+def main():
+    args = parse_args()
+
+    # variables for numactrl binding
+    
+    
+    NSOCKETS = args.nsockets_per_node
+    NGPUS_PER_SOCKET = (args.nproc_per_node // args.nsockets_per_node) + (1 if (args.nproc_per_node % args.nsockets_per_node) else 0)
+    NCORES_PER_GPU = args.ncores_per_socket // NGPUS_PER_SOCKET
+
+    # world size in terms of number of processes
+    dist_world_size = args.nproc_per_node * args.nnodes
+
+    # set PyTorch distributed related environmental variables
+    current_env = os.environ.copy()
+    current_env["MASTER_ADDR"] = args.master_addr
+    current_env["MASTER_PORT"] = str(args.master_port)
+    current_env["WORLD_SIZE"] = str(dist_world_size)
+
+    processes = []
+
+    for local_rank in range(0, args.nproc_per_node):
+        # each process's rank
+        dist_rank = args.nproc_per_node * args.node_rank + local_rank
+        current_env["RANK"] = str(dist_rank)
+
+        # form numactrl binding command
+        cpu_ranges = [local_rank * NCORES_PER_GPU,
+                     (local_rank + 1) * NCORES_PER_GPU - 1,
+                     local_rank * NCORES_PER_GPU + (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS),
+                     (local_rank + 1) * NCORES_PER_GPU + (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS) - 1]
+
+        numactlargs = []
+        if args.no_hyperthreads:
+            numactlargs += [ "--physcpubind={}-{}".format(*cpu_ranges[0:2]) ]
+        else:
+            numactlargs += [ "--physcpubind={}-{},{}-{}".format(*cpu_ranges) ]
+
+        if not args.no_membind:
+            memnode = local_rank // NGPUS_PER_SOCKET
+            numactlargs += [ "--membind={}".format(memnode) ]
+
+        # spawn the processes
+        cmd = [ "/usr/bin/numactl" ] \
+            + numactlargs \
+            + [ sys.executable,
+                "-u",
+                args.training_script,
+                "--local_rank={}".format(local_rank)
+              ] \
+            + args.training_script_args
+
+        process = subprocess.Popen(cmd, env=current_env)
+        processes.append(process)
+
+    for process in processes:
+        process.wait()
+
+
+# Run the launcher only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
+
+