#!/bin/bash

#env
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
WORKSPACE=""

#parameters
precision=${3:-"fp32"}
#1 epoch ~ 10224512 steps
train_steps=${6:-10224512}
save_checkpoint_steps=${7:-40000}
accumulate_gradients=${10:-"true"}
gradient_accumulation_steps=${11:-1}
seed=${12:-12439}
job_name=${13:-"bert_pretraining_vs_dgx1"}
allreduce_post_accumulation=${14:-"true"}
allreduce_post_accumulation_fp16=${15:-"false"}
train_batch_size_phase2=${16:-4}
learning_rate_phase2=${17:-"4e-3"}
warmup_proportion_phase2=${18:-"0.128"}
train_steps_phase2=${19:-10224512}
gradient_accumulation_steps_phase2=${5:-1}

#bert base
#BERT_CONFIG=${WORKSPACE}/uncased_L-12_H-768_A-12/bert_config.json
#bert large
BERT_CONFIG=${WORKSPACE}/bert_large_uncased/bert_config.json
DATASET2=${WORKSPACE}/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training # change this for other datasets
DATA_DIR_PHASE2=${4:-${DATASET2}/}
CODEDIR=${9:-"$(pwd)"}
init_checkpoint=${8:-"None"}
RESULTS_DIR=$CODEDIR/${2}
CHECKPOINTS_DIR=$RESULTS_DIR/checkpoints

if [ ! -d "$RESULTS_DIR" ] ; then
   echo "Error! $RESULTS_DIR directory missing."
   exit 1
fi

if [ ! -d "$CHECKPOINTS_DIR" ] ; then
   echo "Warning! $CHECKPOINTS_DIR directory missing."
   echo "Checkpoints will be written to $RESULTS_DIR instead."
   CHECKPOINTS_DIR=$RESULTS_DIR
fi

if [ ! -f "$BERT_CONFIG" ] ; then
   echo "Error! BERT large configuration file not found at $BERT_CONFIG"
   exit 1
fi

#Start Phase2
PREC=""
if [ "$precision" = "fp16" ] ; then
   PREC="--fp16"
elif [ "$precision" = "fp32" ] ; then
   PREC=""
elif [ "$precision" = "tf32" ] ; then
   PREC=""
else
   echo "Unknown precision argument: $precision (expected fp16, fp32, or tf32)"
   exit 2
fi

ACCUMULATE_GRADIENTS=""
if [ "$accumulate_gradients" == "true" ] ; then
   ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps_phase2"
fi

# Pass --init_checkpoint only when a checkpoint path was supplied in $8.
CHECKPOINT=""
if [ "$init_checkpoint" != "None" ] ; then
   CHECKPOINT="--init_checkpoint=$init_checkpoint"
fi

ALL_REDUCE_POST_ACCUMULATION=""
if [ "$allreduce_post_accumulation" == "true" ] ; then
   ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation"
fi

ALL_REDUCE_POST_ACCUMULATION_FP16=""
if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then
   ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16"
fi

echo $DATA_DIR_PHASE2
INPUT_DIR=$DATA_DIR_PHASE2

CMD=" $CODEDIR/run_pretraining_v1_v2.py"
CMD+=" --input_dir=$DATA_DIR_PHASE2"
CMD+=" --output_dir=$CHECKPOINTS_DIR"
CMD+=" --config_file=$BERT_CONFIG"
CMD+=" --bert_model=bert-large-uncased"
CMD+=" --train_batch_size=$train_batch_size_phase2"
CMD+=" --max_seq_length=512"
CMD+=" --max_predictions_per_seq=80"
CMD+=" --max_steps=$train_steps_phase2"
CMD+=" --warmup_proportion=$warmup_proportion_phase2"
CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
CMD+=" --learning_rate=$learning_rate_phase2"
CMD+=" --seed=$seed"
CMD+=" $PREC"
CMD+=" $ACCUMULATE_GRADIENTS"
CMD+=" $CHECKPOINT"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
CMD+=" --do_train --phase2 --resume_from_checkpoint --phase1_end_step=$train_steps"
CMD+=" --json-summary ${RESULTS_DIR}/dllogger.json"
CMD+=" --local_rank ${comm_rank}"
CMD+=" --dist_url tcp://${1}:45679"
CMD+=" --world_size ${comm_size}"

APP="python3 $CMD"

set +x
echo ${CMD}
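# Pin each local rank to its own NUMA domain and its nearest InfiniBand HCA
# (mlx5_0..mlx5_3 / ib0..ib3 for UCX and NCCL traffic). The mapping below
# assumes a node with 4 NUMA domains and one HCA per domain; on a different
# topology, adjust the device names and numactl bindings accordingly.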
case ${lrank} in
0)
   export HIP_VISIBLE_DEVICES=0,1,2,3
   export UCX_NET_DEVICES=mlx5_0:1
   export UCX_IB_PCI_BW=mlx5_0:50Gbs
   NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
   ;;
1)
   export HIP_VISIBLE_DEVICES=0,1,2,3
   export UCX_NET_DEVICES=mlx5_1:1
   export UCX_IB_PCI_BW=mlx5_1:50Gbs
   NCCL_SOCKET_IFNAME=ib1 numactl --cpunodebind=1 --membind=1 ${APP}
   ;;
2)
   export HIP_VISIBLE_DEVICES=0,1,2,3
   export UCX_NET_DEVICES=mlx5_2:1
   export UCX_IB_PCI_BW=mlx5_2:50Gbs
   NCCL_SOCKET_IFNAME=ib2 numactl --cpunodebind=2 --membind=2 ${APP}
   ;;
3)
   export HIP_VISIBLE_DEVICES=0,1,2,3
   export UCX_NET_DEVICES=mlx5_3:1
   export UCX_IB_PCI_BW=mlx5_3:50Gbs
   NCCL_SOCKET_IFNAME=ib3 numactl --cpunodebind=3 --membind=3 ${APP}
   ;;
esac

echo "finished phase2"
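
# Example launch (a sketch only; host names, the hostfile, and the data path
# are placeholders, and arguments $5..$19 keep their defaults). Open MPI sets
# OMPI_COMM_WORLD_{LOCAL_RANK,RANK,SIZE} for each spawned process. The
# positional arguments, as consumed above, are:
#   $1 = rendezvous host used in --dist_url, $2 = results subdir under $CODEDIR,
#   $3 = precision (fp16|fp32|tf32),       $4 = phase-2 training data dir.
#
#   mpirun -np 8 -npernode 4 --hostfile hosts \
#       ./run_pretraining_phase2.sh node01 results fp16 /path/to/phase2/training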