Commit e5ca7e62 authored by hepj987

Initialize repository

#!/bin/bash
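# Launches an interactive container from an image named "bert" (assumed to be built
# beforehand), mounting the current directory at /workspace/bert and ./results at /results.
# Example (all arguments optional; defaults are /bin/bash, all GPUs, host networking):
#   bash <this script> "/bin/bash" all host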
CMD=${1:-/bin/bash}
NV_VISIBLE_DEVICES=${2:-"all"}
DOCKER_BRIDGE=${3:-"host"}
docker run -it --rm \
--gpus device=$NV_VISIBLE_DEVICES \
--net=$DOCKER_BRIDGE \
--shm-size=1g \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
-e LD_LIBRARY_PATH='/workspace/install/lib/' \
-v $PWD:/workspace/bert \
-v $PWD/results:/results \
bert $CMD
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
echo "Container nvidia build = " $NVIDIA_BUILD_ID
init_checkpoint=${1:-"/workspace/bert/checkpoints/bert_uncased.pt"}
data_dir=${2:-"$BERT_PREP_WORKING_DIR/download/glue/MRPC/"}
vocab_file=${3:-"$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"}
config_file=${4:-"/workspace/bert/bert_config.json"}
out_dir=${5:-"/workspace/bert/results/MRPC"}
task_name=${6:-"mrpc"}
num_gpu=${7:-"8"}
batch_size=${8:-"16"}
gradient_accumulation_steps=${9:-"1"}
learning_rate=${10:-"2.4e-5"}
warmup_proportion=${11:-"0.1"}
epochs=${12:-"3"}
max_steps=${13:-"-1.0"}
precision=${14:-"fp16"}
seed=${15:-"2"}
mode=${16:-"train eval"}
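# All arguments are positional with the defaults shown above, e.g. (assuming this
# file is saved as run_glue.sh):
#   bash run_glue.sh /path/to/bert_uncased.pt /path/to/MRPC /path/to/vocab.txt \
#        bert_config.json results/MRPC mrpc 8 16 1 2.4e-5 0.1 3 -1.0 fp16 2 "train eval"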
mkdir -p $out_dir
if [ "$mode" = "eval" ] ; then
num_gpu=1
fi
use_fp16=""
if [ "$precision" = "fp16" ] ; then
echo "fp16 activated!"
use_fp16="--fp16"
fi
if [ "$num_gpu" = "1" ] ; then
export CUDA_VISIBLE_DEVICES=0
mpi_command=""
else
unset CUDA_VISIBLE_DEVICES
mpi_command=" -m torch.distributed.launch --nproc_per_node=$num_gpu"
fi
CMD="python $mpi_command run_glue.py "
CMD+="--task_name ${task_name} "
if [[ $mode == *"train"* ]] ; then
CMD+="--do_train "
CMD+="--train_batch_size=$batch_size "
fi
if [[ $mode == *"eval"* ]] || [[ $mode == *"prediction"* ]]; then
if [[ $mode == *"eval"* ]] ; then
CMD+="--do_eval "
fi
if [[ $mode == *"prediction"* ]] ; then
CMD+="--do_predict "
fi
CMD+="--eval_batch_size=$batch_size "
fi
CMD+="--gradient_accumulation_steps=$gradient_accumulation_steps "
CMD+="--do_lower_case "
CMD+="--data_dir $data_dir "
CMD+="--bert_model bert-large-uncased "
CMD+="--seed $seed "
CMD+="--init_checkpoint $init_checkpoint "
CMD+="--warmup_proportion $warmup_proportion "
CMD+="--max_seq_length 128 "
CMD+="--learning_rate $learning_rate "
CMD+="--num_train_epochs $epochs "
CMD+="--max_steps $max_steps "
CMD+="--vocab_file=$vocab_file "
CMD+="--config_file=$config_file "
CMD+="--output_dir $out_dir "
CMD+="$use_fp16"
LOGFILE=$out_dir/logfile
$CMD |& tee $LOGFILE
#!/bin/bash
#set -x
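# Multi-node launcher: reads node names from ./hostfile, assumes 4 GPUs per node,
# writes a de-duplicated hostfile-dl with "slots=4" per node, and starts one MPI
# rank per GPU running single_process_pretrain.sh with the first host as the
# torch.distributed master address.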
hostfile=./hostfile
num_node=$(cat $hostfile|sort|uniq |wc -l)
num_gpu=$(($num_node*4))
nodename=$(cat $hostfile |sed -n "1p")
dist_url=`echo $nodename | awk '{print $1}'`
echo $dist_url
rm -f `pwd`/hostfile-dl
cat $hostfile|sort|uniq >`pwd`/tmp
for i in `cat ./tmp`
do
echo ${i} slots=4 >> `pwd`/hostfile-dl
done
mpirun -np ${num_gpu} --hostfile `pwd`/hostfile-dl --bind-to none `pwd`/single_process_pretrain.sh $dist_url
#!/bin/bash
#set -x
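# Phase-2 launcher: same hostfile handling as above, but also forwards
# LD_LIBRARY_PATH and PATH to the remote ranks and passes "results" as the
# output subdirectory to the per-rank script.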
hostfile=./hostfile
num_node=$(cat $hostfile|sort|uniq |wc -l)
num_gpu=$(($num_node*4))
nodename=$(cat $hostfile |sed -n "1p")
dist_url=`echo $nodename | awk '{print $1}'`
echo $dist_url
rm -f `pwd`/hostfile-dl
cat $hostfile|sort|uniq >`pwd`/tmp
for i in `cat ./tmp`
do
echo ${i} slots=4 >> `pwd`/hostfile-dl
done
mpirun -np ${num_gpu} --hostfile `pwd`/hostfile-dl --bind-to none -x LD_LIBRARY_PATH -x PATH `pwd`/single_process_pretrain2.sh $dist_url results
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
echo "Container nvidia build = " $NVIDIA_BUILD_ID
train_batch_size=${1:-8192}
learning_rate=${2:-"6e-3"}
precision=${3:-"fp16"}
num_gpus=${4:-8}
warmup_proportion=${5:-"0.2843"}
train_steps=${6:-7038}
save_checkpoint_steps=${7:-200}
resume_training=${8:-"false"}
create_logfile=${9:-"true"}
accumulate_gradients=${10:-"true"}
gradient_accumulation_steps=${11:-128}
seed=${12:-12439}
job_name=${13:-"bert_lamb_pretraining"}
allreduce_post_accumulation=${14:-"true"}
allreduce_post_accumulation_fp16=${15:-"true"}
train_batch_size_phase2=${16:-4096}
learning_rate_phase2=${17:-"4e-3"}
warmup_proportion_phase2=${18:-"0.128"}
train_steps_phase2=${19:-1563}
gradient_accumulation_steps_phase2=${20:-512}
DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
DATA_DIR_PHASE1=${21:-$BERT_PREP_WORKING_DIR/${DATASET}/}
BERT_CONFIG=bert_config.json
DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
DATA_DIR_PHASE2=${22:-$BERT_PREP_WORKING_DIR/${DATASET2}/}
CODEDIR=${23:-"/workspace/bert"}
init_checkpoint=${24:-"None"}
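# All 24 arguments are positional with the defaults shown above, e.g. (assuming this
# file is saved as run_pretraining.sh):
#   bash run_pretraining.sh 8192 6e-3 fp16 8 0.2843 7038 200 false true true 128 12439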
RESULTS_DIR=$CODEDIR/results
CHECKPOINTS_DIR=$RESULTS_DIR/checkpoints
mkdir -p $CHECKPOINTS_DIR
if [ ! -d "$DATA_DIR_PHASE1" ] ; then
echo "Warning! $DATA_DIR_PHASE1 directory missing. Training cannot start"
fi
if [ ! -d "$RESULTS_DIR" ] ; then
echo "Error! $RESULTS_DIR directory missing."
exit -1
fi
if [ ! -d "$CHECKPOINTS_DIR" ] ; then
echo "Warning! $CHECKPOINTS_DIR directory missing."
echo "Checkpoints will be written to $RESULTS_DIR instead."
CHECKPOINTS_DIR=$RESULTS_DIR
fi
if [ ! -f "$BERT_CONFIG" ] ; then
echo "Error! BERT large configuration file not found at $BERT_CONFIG"
exit -1
fi
PREC=""
if [ "$precision" = "fp16" ] ; then
PREC="--fp16"
elif [ "$precision" = "fp32" ] ; then
PREC=""
elif [ "$precision" = "tf32" ] ; then
PREC=""
else
echo "Unknown <precision> argument"
exit -2
fi
ACCUMULATE_GRADIENTS=""
if [ "$accumulate_gradients" == "true" ] ; then
ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps"
fi
CHECKPOINT=""
if [ "$resume_training" == "true" ] ; then
CHECKPOINT="--resume_from_checkpoint"
fi
ALL_REDUCE_POST_ACCUMULATION=""
if [ "$allreduce_post_accumulation" == "true" ] ; then
ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation"
fi
ALL_REDUCE_POST_ACCUMULATION_FP16=""
if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then
ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16"
fi
INIT_CHECKPOINT=""
if [ "$init_checkpoint" != "None" ] ; then
INIT_CHECKPOINT="--init_checkpoint=$init_checkpoint"
fi
echo $DATA_DIR_PHASE1
INPUT_DIR=$DATA_DIR_PHASE1
CMD=" $CODEDIR/run_pretraining.py"
CMD+=" --input_dir=$DATA_DIR_PHASE1"
CMD+=" --output_dir=$CHECKPOINTS_DIR"
CMD+=" --config_file=$BERT_CONFIG"
CMD+=" --bert_model=bert-large-uncased"
CMD+=" --train_batch_size=$train_batch_size"
CMD+=" --max_seq_length=128"
CMD+=" --max_predictions_per_seq=20"
CMD+=" --max_steps=$train_steps"
CMD+=" --warmup_proportion=$warmup_proportion"
CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
CMD+=" --learning_rate=$learning_rate"
CMD+=" --seed=$seed"
CMD+=" $PREC"
CMD+=" $ACCUMULATE_GRADIENTS"
CMD+=" $CHECKPOINT"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
CMD+=" $INIT_CHECKPOINT"
CMD+=" --do_train"
CMD+=" --json-summary ${RESULTS_DIR}/dllogger.json "
CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
if [ "$create_logfile" = "true" ] ; then
export GBS=$(expr $train_batch_size \* $num_gpus)
printf -v TAG "pyt_bert_pretraining_phase1_%s_gbs%d" "$precision" $GBS
DATESTAMP=`date +'%y%m%d%H%M%S'`
LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log
printf "Logs written to %s\n" "$LOGFILE"
fi
set -x
if [ -z "$LOGFILE" ] ; then
$CMD
else
(
$CMD
) |& tee $LOGFILE
fi
set +x
echo "finished pretraining"
#Start Phase2
PREC=""
if [ "$precision" = "fp16" ] ; then
PREC="--fp16"
elif [ "$precision" = "fp32" ] ; then
PREC=""
elif [ "$precision" = "tf32" ] ; then
PREC=""
else
echo "Unknown <precision> argument"
exit -2
fi
ACCUMULATE_GRADIENTS=""
if [ "$accumulate_gradients" == "true" ] ; then
ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps_phase2"
fi
ALL_REDUCE_POST_ACCUMULATION=""
if [ "$allreduce_post_accumulation" == "true" ] ; then
ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation"
fi
ALL_REDUCE_POST_ACCUMULATION_FP16=""
if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then
ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16"
fi
echo $DATA_DIR_PHASE2
INPUT_DIR=$DATA_DIR_PHASE2
CMD=" $CODEDIR/run_pretraining.py"
CMD+=" --input_dir=$DATA_DIR_PHASE2"
CMD+=" --output_dir=$CHECKPOINTS_DIR"
CMD+=" --config_file=$BERT_CONFIG"
CMD+=" --bert_model=bert-large-uncased"
CMD+=" --train_batch_size=$train_batch_size_phase2"
CMD+=" --max_seq_length=512"
CMD+=" --max_predictions_per_seq=80"
CMD+=" --max_steps=$train_steps_phase2"
CMD+=" --warmup_proportion=$warmup_proportion_phase2"
CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
CMD+=" --learning_rate=$learning_rate_phase2"
CMD+=" --seed=$seed"
CMD+=" $PREC"
CMD+=" $ACCUMULATE_GRADIENTS"
CMD+=" $CHECKPOINT"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
CMD+=" --do_train --phase2 --resume_from_checkpoint --phase1_end_step=$train_steps"
CMD+=" --json-summary ${RESULTS_DIR}/dllogger.json "
CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
if [ "$create_logfile" = "true" ] ; then
export GBS=$(expr $train_batch_size_phase2 \* $num_gpus)
printf -v TAG "pyt_bert_pretraining_phase2_%s_gbs%d" "$precision" $GBS
DATESTAMP=`date +'%y%m%d%H%M%S'`
LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log
printf "Logs written to %s\n" "$LOGFILE"
fi
set -x
if [ -z "$LOGFILE" ] ; then
$CMD
else
(
$CMD
) |& tee $LOGFILE
fi
set +x
echo "finished phase2"
#!/usr/bin/env bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
echo "Container nvidia build = " $NVIDIA_BUILD_ID
init_checkpoint=${1:-"/workspace/bert/checkpoints/bert_uncased.pt"}
epochs=${2:-"2.0"}
batch_size=${3:-"4"}
learning_rate=${4:-"3e-5"}
precision=${5:-"fp16"}
num_gpu=${6:-"8"}
seed=${7:-"1"}
squad_dir=${8:-"$BERT_PREP_WORKING_DIR/download/squad/v1.1"}
vocab_file=${9:-"$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"}
OUT_DIR=${10:-"/workspace/bert/results/SQuAD"}
mode=${11:-"train eval"}
CONFIG_FILE=${12:-"/workspace/bert/bert_config.json"}
max_steps=${13:-"-1"}
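# Arguments are positional with the defaults shown above, e.g. (assuming this file
# is saved as run_squad.sh):
#   bash run_squad.sh checkpoints/bert_uncased.pt 2.0 4 3e-5 fp16 8 1 $BERT_PREP_WORKING_DIR/download/squad/v1.1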
echo "out dir is $OUT_DIR"
mkdir -p $OUT_DIR
if [ ! -d "$OUT_DIR" ]; then
echo "ERROR: non existing $OUT_DIR"
exit 1
fi
use_fp16=""
if [ "$precision" = "fp16" ] ; then
echo "fp16 activated!"
use_fp16=" --fp16 "
fi
if [ "$num_gpu" = "1" ] ; then
export CUDA_VISIBLE_DEVICES=0
mpi_command=""
else
unset CUDA_VISIBLE_DEVICES
mpi_command=" -m torch.distributed.launch --nproc_per_node=$num_gpu"
fi
CMD="python $mpi_command run_squad.py "
CMD+="--init_checkpoint=$init_checkpoint "
if [ "$mode" = "train" ] ; then
CMD+="--do_train "
CMD+="--train_file=$squad_dir/train-v1.1.json "
CMD+="--train_batch_size=$batch_size "
elif [ "$mode" = "eval" ] ; then
CMD+="--do_predict "
CMD+="--predict_file=$squad_dir/dev-v1.1.json "
CMD+="--predict_batch_size=$batch_size "
CMD+="--eval_script=$squad_dir/evaluate-v1.1.py "
CMD+="--do_eval "
elif [ "$mode" = "prediction" ] ; then
CMD+="--do_predict "
CMD+="--predict_file=$squad_dir/dev-v1.1.json "
CMD+="--predict_batch_size=$batch_size "
else
CMD+=" --do_train "
CMD+=" --train_file=$squad_dir/train-v1.1.json "
CMD+=" --train_batch_size=$batch_size "
CMD+="--do_predict "
CMD+="--predict_file=$squad_dir/dev-v1.1.json "
CMD+="--predict_batch_size=$batch_size "
CMD+="--eval_script=$squad_dir/evaluate-v1.1.py "
CMD+="--do_eval "
fi
CMD+=" --do_lower_case "
CMD+=" --bert_model=bert-large-uncased "
CMD+=" --learning_rate=$learning_rate "
CMD+=" --seed=$seed "
CMD+=" --num_train_epochs=$epochs "
CMD+=" --max_seq_length=384 "
CMD+=" --doc_stride=128 "
CMD+=" --output_dir=$OUT_DIR "
CMD+=" --vocab_file=$vocab_file "
CMD+=" --config_file=$CONFIG_FILE "
CMD+=" --max_steps=$max_steps "
CMD+=" $use_fp16"
LOGFILE=$OUT_DIR/logfile.txt
echo "$CMD |& tee $LOGFILE"
time $CMD |& tee $LOGFILE
#!/bin/bash
#set -x
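# Multi-node SQuAD launcher: builds hostfile-dl (4 slots per node) from ./hostfile
# and runs single_process_squad.sh on every rank with the first host as the master address.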
hostfile=./hostfile
num_node=$(cat $hostfile|sort|uniq |wc -l)
num_gpu=$(($num_node*4))
nodename=$(cat $hostfile |sed -n "1p")
dist_url=`echo $nodename | awk '{print $1}'`
echo $dist_url
rm -f `pwd`/hostfile-dl
cat $hostfile|sort|uniq >`pwd`/tmp
for i in `cat ./tmp`
do
echo ${i} slots=4 >> `pwd`/hostfile-dl
done
mpirun -np ${num_gpu} --hostfile `pwd`/hostfile-dl --bind-to none `pwd`/single_process_squad.sh $dist_url
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
SWAG_DIR=/workspace/bert/data/swag
OUT_DIR=/results/SWAG
mkdir -p $OUT_DIR
echo "Container nvidia build = " $NVIDIA_BUILD_ID
init_checkpoint=${1}
mode=${2:-"train"}
max_steps=${3:-"-1.0"} # if < 0, has no effect
batch_size=${4:-"12"}
learning_rate=${5:-"5e-6"}
precision=${6:-"fp32"}
num_gpu=${7:-"8"}
epochs=${8:-"2"}
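# Arguments are positional with the defaults shown above; the first argument (the
# initial checkpoint) has no default and must be supplied, e.g. (assuming this file
# is saved as run_swag.sh):
#   bash run_swag.sh checkpoints/bert_uncased.pt train -1.0 12 5e-6 fp32 8 2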
if [ "$mode" != "train" ] ; then
num_gpu=1
fi
use_fp16=""
if [ "$precision" = "fp16" ] ; then
echo "fp16 activated!"
use_fp16="--fp16"
fi
if [ "$num_gpu" = "1" ] ; then
mpi_command=""
else
mpi_command="torch.distributed.launch --nproc_per_node=$num_gpu"
fi
CMD="python -m $mpi_command run_swag.py "
CMD+="--init_checkpoint=$init_checkpoint "
if [ "$mode" = "train" ] ; then
CMD+="--do_train "
CMD+="--train_batch_size=$batch_size "
else
CMD+="--do_eval "
CMD+="--eval_batch_size=$batch_size "
fi
CMD+="--do_lower_case "
CMD+="--data_dir $SWAG_DIR/data/ "
CMD+="--bert_model bert-large-uncased "
CMD+="--max_seq_length 128 "
CMD+="--learning_rate $learning_rate "
CMD+="--num_train_epochs $epochs "
CMD+="--max_steps $max_steps "
CMD+="--output_dir $OUT_DIR "
CMD+="$use_fp16"
LOGFILE=$OUT_DIR/logfile
$CMD |& tee $LOGFILE
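# Strip carriage returns and ANSI cursor-up sequences from the tqdm progress output
# so the last "Iteration ... it/s" line can be grepped for throughput.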
sed -r 's/\r|\x1b\[A/\n/g' $LOGFILE > $LOGFILE.edit
throughput=`cat $LOGFILE.edit | grep -E 'Iteration.*[0-9.]+(s/it|it/s)' | tail -1 | egrep -o '[0-9.]+(s/it|it/s)'`
echo "throughput: $throughput"
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
WORKSPACE="/workspace"
#parameters
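# $1 is the master node address used below in --dist_url; it is supplied by the
# mpirun launcher. The case block at the end pins each local rank to one NUMA node
# and its matching mlx5 HCA / ib interface.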
train_batch_size=24
learning_rate=${2:-"6e-3"}
precision=${3:-"fp16"}
num_gpus=${4:-4}
warmup_proportion=${5:-"0.2843"}
train_steps=${6:-7038}
save_checkpoint_steps=${7:-200}
resume_training=${8:-"false"}
create_logfile=${9:-"true"}
accumulate_gradients=${10:-"true"}
gradient_accumulation_steps=${11:-4}
seed=${12:-12439}
job_name=${13:-"bert_lamb_pretraining"}
allreduce_post_accumulation=${14:-"true"}
allreduce_post_accumulation_fp16=${15:-"true"}
train_batch_size_phase2=${16:-4096}
learning_rate_phase2=${17:-"4e-3"}
warmup_proportion_phase2=${18:-"0.128"}
train_steps_phase2=${19:-1563}
gradient_accumulation_steps_phase2=${20:-512}
DATASET=${WORKSPACE}/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training # change this for other datasets
DATA_DIR_PHASE1=${21:-$BERT_PREP_WORKING_DIR/${DATASET}/}
BERT_CONFIG=${WORKSPACE}/uncased_L-12_H-768_A-12/bert_config.json
CODEDIR=${22:-"`pwd`"}
init_checkpoint=${23:-"None"}
RESULTS_DIR=$CODEDIR/results
CHECKPOINTS_DIR=$RESULTS_DIR/checkpoints
mkdir -p $CHECKPOINTS_DIR
if [ ! -d "$DATA_DIR_PHASE1" ] ; then
echo "Warning! $DATA_DIR_PHASE1 directory missing. Training cannot start"
fi
if [ ! -d "$RESULTS_DIR" ] ; then
echo "Error! $RESULTS_DIR directory missing."
exit -1
fi
if [ ! -d "$CHECKPOINTS_DIR" ] ; then
echo "Warning! $CHECKPOINTS_DIR directory missing."
echo "Checkpoints will be written to $RESULTS_DIR instead."
CHECKPOINTS_DIR=$RESULTS_DIR
fi
if [ ! -f "$BERT_CONFIG" ] ; then
echo "Error! BERT large configuration file not found at $BERT_CONFIG"
exit -1
fi
PREC=""
if [ "$precision" = "fp16" ] ; then
PREC="--fp16"
elif [ "$precision" = "fp32" ] ; then
PREC=""
elif [ "$precision" = "tf32" ] ; then
PREC=""
else
echo "Unknown <precision> argument"
exit -2
fi
ACCUMULATE_GRADIENTS=""
if [ "$accumulate_gradients" == "true" ] ; then
ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps"
fi
CHECKPOINT=""
if [ "$resume_training" == "true" ] ; then
CHECKPOINT="--resume_from_checkpoint"
fi
ALL_REDUCE_POST_ACCUMULATION=""
if [ "$allreduce_post_accumulation" == "true" ] ; then
ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation"
fi
ALL_REDUCE_POST_ACCUMULATION_FP16=""
if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then
ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16"
fi
INIT_CHECKPOINT=""
if [ "$init_checkpoint" != "None" ] ; then
INIT_CHECKPOINT="--init_checkpoint=$init_checkpoint"
fi
echo $DATA_DIR_PHASE1
INPUT_DIR=$DATA_DIR_PHASE1
CMD=" $CODEDIR/run_pretraining_v1.py"
CMD+=" --input_dir=$DATA_DIR_PHASE1"
CMD+=" --output_dir=$CHECKPOINTS_DIR"
CMD+=" --config_file=$BERT_CONFIG"
CMD+=" --bert_model=bert-large-uncased"
CMD+=" --train_batch_size=$train_batch_size"
CMD+=" --max_seq_length=128"
CMD+=" --max_predictions_per_seq=20"
CMD+=" --max_steps=$train_steps"
CMD+=" --warmup_proportion=$warmup_proportion"
CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
CMD+=" --learning_rate=$learning_rate"
CMD+=" --seed=$seed"
CMD+=" $PREC"
CMD+=" $ACCUMULATE_GRADIENTS"
CMD+=" $CHECKPOINT"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
CMD+=" $INIT_CHECKPOINT"
CMD+=" --do_train"
CMD+=" --json-summary ${RESULTS_DIR}/dllogger.json "
CMD+=" --local_rank ${comm_rank} "
CMD+=" --dist_url tcp://${1}:45679 "
CMD+=" --world_size ${comm_size} "
#CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
APP="python3 $CMD"
if [ "$create_logfile" = "true" ] ; then
export GBS=$(expr $train_batch_size \* $num_gpus)
printf -v TAG "pyt_bert_pretraining_phase1_%s_gbs%d" "$precision" $GBS
DATESTAMP=`date +'%y%m%d%H%M%S'`
LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log
printf "Logs written to %s\n" "$LOGFILE"
fi
echo ${CMD}
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
GLOO_SOCKET_IFNAME=ib1 numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
GLOO_SOCKET_IFNAME=ib2 numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
GLOO_SOCKET_IFNAME=ib3 numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
#env
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
WORKSPACE=""
#parameters
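# $1 is the master node address for --dist_url and $2 the results subdirectory
# under the code dir (the launcher passes "results"); both come from mpirun.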
precision=${3:-"fp32"}
#1 epoch ~ 10224512 step
train_steps=${6:-10224512}
save_checkpoint_steps=${7:-40000}
accumulate_gradients=${10:-"true"}
gradient_accumulation_steps=${11:-1}
seed=${12:-12439}
job_name=${13:-"bert_pretraining_vs_dgx1"}
allreduce_post_accumulation=${14:-"true"}
allreduce_post_accumulation_fp16=${15:-"false"}
train_batch_size_phase2=${16:-4}
learning_rate_phase2=${17:-"4e-3"}
warmup_proportion_phase2=${18:-"0.128"}
train_steps_phase2=${19:-10224512}
gradient_accumulation_steps_phase2=${5:-1}
#bert base
#BERT_CONFIG=${WORKSPACE}/uncased_L-12_H-768_A-12/bert_config.json
#bert large
BERT_CONFIG=${WORKSPACE}/bert_large_uncased/bert_config.json
DATASET2=${WORKSPACE}/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training # change this for other datasets
DATA_DIR_PHASE2=${4:-${DATASET2}/}
CODEDIR=${9:-"`pwd`"}
init_checkpoint=${8:-"None"}
RESULTS_DIR=$CODEDIR/${2}
CHECKPOINTS_DIR=$RESULTS_DIR/checkpoints
if [ ! -d "$RESULTS_DIR" ] ; then
echo "Error! $RESULTS_DIR directory missing."
exit -1
fi
if [ ! -d "$CHECKPOINTS_DIR" ] ; then
echo "Warning! $CHECKPOINTS_DIR directory missing."
echo "Checkpoints will be written to $RESULTS_DIR instead."
CHECKPOINTS_DIR=$RESULTS_DIR
fi
if [ ! -f "$BERT_CONFIG" ] ; then
echo "Error! BERT large configuration file not found at $BERT_CONFIG"
exit -1
fi
#Start Phase2
PREC=""
if [ "$precision" = "fp16" ] ; then
PREC="--fp16"
elif [ "$precision" = "fp32" ] ; then
PREC=""
elif [ "$precision" = "tf32" ] ; then
PREC=""
else
echo "Unknown <precision> argument"
exit -2
fi
ACCUMULATE_GRADIENTS=""
if [ "$accumulate_gradients" == "true" ] ; then
ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps_phase2"
fi
ALL_REDUCE_POST_ACCUMULATION=""
if [ "$allreduce_post_accumulation" == "true" ] ; then
ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation"
fi
ALL_REDUCE_POST_ACCUMULATION_FP16=""
if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then
ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16"
fi
echo $DATA_DIR_PHASE2
INPUT_DIR=$DATA_DIR_PHASE2
CMD=" $CODEDIR/run_pretraining_v1_v2.py"
CMD+=" --input_dir=$DATA_DIR_PHASE2"
CMD+=" --output_dir=$CHECKPOINTS_DIR"
CMD+=" --config_file=$BERT_CONFIG"
CMD+=" --bert_model=bert-large-uncased"
CMD+=" --train_batch_size=$train_batch_size_phase2"
CMD+=" --max_seq_length=512"
CMD+=" --max_predictions_per_seq=80"
CMD+=" --max_steps=$train_steps_phase2"
CMD+=" --warmup_proportion=$warmup_proportion_phase2"
CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
CMD+=" --learning_rate=$learning_rate_phase2"
CMD+=" --seed=$seed"
CMD+=" $PREC"
CMD+=" $ACCUMULATE_GRADIENTS"
CMD+=" $CHECKPOINT"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
CMD+=" --do_train --phase2 --resume_from_checkpoint --phase1_end_step=$train_steps"
CMD+=" --json-summary ${RESULTS_DIR}/dllogger.json "
CMD+=" --local_rank ${comm_rank} "
CMD+=" --dist_url tcp://${1}:45679 "
CMD+=" --world_size ${comm_size} "
APP="python3 $CMD"
set +x
echo ${CMD}
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
NCCL_SOCKET_IFNAME=ib1 numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
NCCL_SOCKET_IFNAME=ib2 numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
NCCL_SOCKET_IFNAME=ib3 numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
echo "finished phase2"
#!/bin/bash
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
HOME_PATH=""
export OMP_NUM_THREADS=8
#source code path
WORK_HOME="BERT"
#data path
DATA_DIR="BERT/data"
#pretrained model path
MODEL_DIR="bs64k_32k_ckpt"
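# $1: master node address for --dist_url, passed by the mpirun SQuAD launcher.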
APP="python3 ${WORK_HOME}/run_squad_v1.py --train_file ${DATA_DIR}/squad/v1.1/train-v1.1.json --init_checkpoint ${MODEL_DIR}/model.ckpt-28252.pt --vocab_file ${MODEL_DIR}/vocab.txt --output_dir ${WORK_HOME}/result/SQuAD --config_file ${MODEL_DIR}/bert_config.json --bert_model=bert-large-uncased --do_train --local_rank ${comm_rank} --train_batch_size 1 --gpus_per_node 1 --dist_url tcp://${1}:45679 --world_size ${comm_size}"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
#export MIOPEN_DEBUG_DISABLE_FIND_DB=1
#export NCCL_SOCKET_IFNAME=ib0
#export HSA_USERPTR_FOR_PAGED_MEM=0
#export MIOPEN_DEBUG_DISABLE_FIND_DB=1
#export NCCL_SOCKET_IFNAME=eno1
export HSA_FORCE_FINE_GRAIN_PCIE=1
#source /public/software/apps/DeepLearning/PyTorch/pytorch-env.sh
export MIOPEN_FIND_MODE=1
#export MIOPEN_ENABLE_LOGGING_CMD=1
#export ROCBLAS_LAYER=3
module unload compiler/rocm/2.9
#export MIOPEN_DEBUG_CONV_DIRECT_ASM_3X3U=0
#export MIOPEN_DEBUG_CONV_DIRECT_ASM_WRW3X3=0
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
#module load apps/PyTorch/1.5.0a0/hpcx-2.4.1-gcc-7.3.1-rocm3.3
#source /public/home/aiss/Pytorch/env_rocm3.3_torch1.5.sh
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
# The section below was modified
#export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#HSA_FORCE_FINE_GRAIN_PCIE=1 numactl --cpunodebind=4,5,6,7 --membind=4,5,6,7
#module load compiler/rocm/3.9.1
export PATH_PHRASE1=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
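# Single-node phase-1 pretraining (seq len 128, fp32): each MPI local rank is given
# one GPU and pinned to the matching NUMA node with numactl.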
APP="python3 run_pretraining_v1.py \
--input_dir=${PATH_PHRASE1} \
--output_dir=/public/home/hepj/outdir/torch/pre_wiki/phrase1 \
--config_file=./bert_config.json \
--bert_model=bert-large-uncased \
--train_batch_size=16 \
--max_seq_length=128 \
--max_predictions_per_seq=20 \
--max_steps=100000 \
--warmup_proportion=0.0 \
--num_steps_per_checkpoint=20000 \
--learning_rate=4.0e-4 \
--seed=12439 \
--gradient_accumulation_steps=1 \
--allreduce_post_accumulation \
--gpus_per_node 1 \
--do_train \
--json-summary /public/home/hepj/outdir/torch/pre_wiki/phrase1/dllogger.json
"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
#echo GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
#GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
#export MIOPEN_DEBUG_DISABLE_FIND_DB=1
#export NCCL_SOCKET_IFNAME=ib0
#export HSA_USERPTR_FOR_PAGED_MEM=0
#export MIOPEN_DEBUG_DISABLE_FIND_DB=1
#export NCCL_SOCKET_IFNAME=eno1
export HSA_FORCE_FINE_GRAIN_PCIE=1
#source /public/software/apps/DeepLearning/PyTorch/pytorch-env.sh
export MIOPEN_FIND_MODE=1
#export MIOPEN_ENABLE_LOGGING_CMD=1
#export ROCBLAS_LAYER=3
module unload compiler/rocm/2.9
#export MIOPEN_DEBUG_CONV_DIRECT_ASM_3X3U=0
#export MIOPEN_DEBUG_CONV_DIRECT_ASM_WRW3X3=0
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
#module load apps/PyTorch/1.5.0a0/hpcx-2.4.1-gcc-7.3.1-rocm3.3
#source /public/home/aiss/Pytorch/env_rocm3.3_torch1.5.sh
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
# The section below was modified
#export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#HSA_FORCE_FINE_GRAIN_PCIE=1 numactl --cpunodebind=4,5,6,7 --membind=4,5,6,7
#module load compiler/rocm/3.9.1
export PATH_PHRASE1=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
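# fp16/AMP variant of the phase-1 script above; it differs only in the --fp16 --amp
# flags and a much shorter checkpoint interval (20 steps).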
APP="python3 run_pretraining_v1.py \
--input_dir=${PATH_PHRASE1} \
--output_dir=/public/home/hepj/outdir/torch/pre_wiki/phrase1 \
--config_file=./bert_config.json \
--bert_model=bert-large-uncased \
--train_batch_size=16 \
--max_seq_length=128 \
--max_predictions_per_seq=20 \
--max_steps=100000 \
--warmup_proportion=0.0 \
--num_steps_per_checkpoint=20 \
--learning_rate=4.0e-4 \
--seed=12439 \
--fp16 \
--amp \
--gradient_accumulation_steps=1 \
--allreduce_post_accumulation \
--gpus_per_node 1 \
--do_train \
--json-summary /public/home/hepj/outdir/torch/pre_wiki/phrase1/dllogger.json
"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
#echo GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
#GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=1
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
# The section below was modified
export HIP_VISIBLE_DEVICES=0,1,2,3
export PATH_PHRASE1=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
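# 4-GPU distributed phase-1 run (fp32): each local rank drives one GPU; the ranks
# rendezvous over tcp://localhost:34567 with world_size 4.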
APP="python3 run_pretraining_v4.py \
--input_dir=${PATH_PHRASE1} \
--output_dir=/public/home/hepj/outdir/torch/pre_wiki4/phrase1/fp32 \
--config_file=/public/home/hepj/model_source/pytorch_bert/bert_config.json \
--bert_model=bert-large-uncased \
--train_batch_size=16 \
--max_seq_length=128 \
--max_predictions_per_seq=20 \
--max_steps=100000 \
--warmup_proportion=0.0 \
--num_steps_per_checkpoint=20000 \
--learning_rate=4.0e-4 \
--seed=12439 \
--gradient_accumulation_steps=1 \
--allreduce_post_accumulation \
--do_train \
--use_env \
--local_rank ${comm_rank} \
--world_size 4 \
--gpus_per_node 1 \
--dist_url tcp://localhost:34567 \
--json-summary /public/home/hepj/outdir/torch/pre_wiki4/phrase1/fp32/dllogger.json
"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=1
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
# The section below was modified
export HIP_VISIBLE_DEVICES=0,1,2,3
export PATH_PHRASE1=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
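# fp16/AMP variant of the 4-GPU phase-1 run above; note that the output and
# dllogger paths still point at the fp32 directory.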
APP="python3 run_pretraining_v4.py \
--input_dir=${PATH_PHRASE1} \
--output_dir=/public/home/hepj/outdir/torch/pre_wiki4/phrase1/fp32 \
--config_file=/public/home/hepj/model_source/pytorch_bert/bert_config.json \
--bert_model=bert-large-uncased \
--train_batch_size=16 \
--max_seq_length=128 \
--max_predictions_per_seq=20 \
--max_steps=100000 \
--warmup_proportion=0.0 \
--num_steps_per_checkpoint=20000 \
--learning_rate=4.0e-4 \
--seed=12439 \
--gradient_accumulation_steps=1 \
--allreduce_post_accumulation \
--do_train \
--fp16 \
--amp \
--use_env \
--local_rank ${comm_rank} \
--world_size 4 \
--gpus_per_node 1 \
--dist_url tcp://localhost:34567 \
--json-summary /public/home/hepj/outdir/torch/pre_wiki4/phrase1/fp32/dllogger.json
"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
#echo GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
#GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
#export MIOPEN_DEBUG_DISABLE_FIND_DB=1
#export NCCL_SOCKET_IFNAME=ib0
#export HSA_USERPTR_FOR_PAGED_MEM=0
#export MIOPEN_DEBUG_DISABLE_FIND_DB=1
#export NCCL_SOCKET_IFNAME=eno1
export HSA_FORCE_FINE_GRAIN_PCIE=1
#source /public/software/apps/DeepLearning/PyTorch/pytorch-env.sh
export MIOPEN_FIND_MODE=1
#export MIOPEN_ENABLE_LOGGING_CMD=1
#export ROCBLAS_LAYER=3
module unload compiler/rocm/2.9
#export MIOPEN_DEBUG_CONV_DIRECT_ASM_3X3U=0
#export MIOPEN_DEBUG_CONV_DIRECT_ASM_WRW3X3=0
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
#module load apps/PyTorch/1.5.0a0/hpcx-2.4.1-gcc-7.3.1-rocm3.3
#source /public/home/aiss/Pytorch/env_rocm3.3_torch1.5.sh
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
# The section below was modified
#export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#HSA_FORCE_FINE_GRAIN_PCIE=1 numactl --cpunodebind=4,5,6,7 --membind=4,5,6,7
#module load compiler/rocm/3.9.1
export PATH_PHRASE2=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
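# Phase-2 pretraining (seq len 512, fp32): with --local_rank -1 each MPI rank
# appears to run its own independent training process on a single GPU.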
APP="python3 run_pretraining_v1.py \
--input_dir=${PATH_PHRASE2} \
--output_dir=/public/home/hepj/outdir/torch/pre_wiki/phrase2/fp32 \
--config_file=/public/home/hepj/model_source/pytorch_bert/bert_config.json \
--bert_model=bert-large-uncased \
--train_batch_size=2 \
--max_seq_length=512 \
--max_predictions_per_seq=80 \
--max_steps=400000 \
--warmup_proportion=0.128 \
--num_steps_per_checkpoint=20000 \
--learning_rate=4.0e-3 \
--seed=12439 \
--gradient_accumulation_steps=1 \
--allreduce_post_accumulation \
--do_train \
--phase2 \
--phase1_end_step=0 \
--gpus_per_node 1 \
--local_rank -1 \
--json-summary /public/home/hepj/outdir/torch/pre_wiki/phrase2/fp32/dllogger_dtk22.04.json
"
#--fp16 \
# --amp \
#--json-summary /public/home/hepj/out_dir/tourch/SQuAD/results.json
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
#echo GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
#GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=1
#export MIOPEN_ENABLE_LOGGING_CMD=1
#export ROCBLAS_LAYER=3
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
# The section below was modified
export PATH_PHRASE2=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
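# fp16/AMP counterpart of the phase-2 script above, writing to the fp16 output directory.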
APP="python3 run_pretraining_v1.py \
--input_dir=${PATH_PHRASE2} \
--output_dir=/public/home/hepj/outdir/torch/pre_wiki/phrase2/fp16 \
--config_file=/public/home/hepj/model_source/pytorch_bert/bert_config.json \
--bert_model=bert-large-uncased \
--train_batch_size=2 \
--max_seq_length=512 \
--max_predictions_per_seq=80 \
--max_steps=400000 \
--warmup_proportion=0.128 \
--num_steps_per_checkpoint=20000 \
--learning_rate=4.0e-3 \
--seed=12439 \
--gradient_accumulation_steps=1 \
--allreduce_post_accumulation \
--do_train \
--fp16 \
--amp \
--phase2 \
--phase1_end_step=0 \
--gpus_per_node 1 \
--local_rank -1 \
--json-summary /public/home/hepj/outdir/torch/pre_wiki/phrase2/fp16/dllogger_dtk22.04.json
"
#--json-summary /public/home/hepj/out_dir/tourch/SQuAD/results.json
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
#echo GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
#GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=1
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
export HIP_VISIBLE_DEVICES=0,1,2,3
export PATH_PHRASE2=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
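# 4-GPU distributed phase-2 run (fp32): ranks rendezvous over tcp://localhost:34567
# with world_size 4 and one GPU per rank.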
APP="python3 run_pretraining_v4.py \
--input_dir=${PATH_PHRASE2} \
--output_dir=/public/home/hepj/outdir/tourch/pre_wiki4/phrase2/fp32 \
--config_file=/public/home/hepj/model_source/pytorch_bert/bert_config.json \
--bert_model=bert-large-uncased \
--train_batch_size=2 \
--max_seq_length=512 \
--max_predictions_per_seq=80 \
--max_steps=400000 \
--warmup_proportion=0.128 \
--num_steps_per_checkpoint=200000 \
--learning_rate=4.0e-3 \
--seed=12439 \
--gradient_accumulation_steps=1 \
--allreduce_post_accumulation \
--do_train \
--phase2 \
--phase1_end_step 0 \
--local_rank ${comm_rank} \
--world_size 4 \
--gpus_per_node 1 \
--dist_url tcp://localhost:34567 \
--json-summary /public/home/hepj/outdir/torch/pre_wiki4/phrase2/fp32/dllogger.json
"
#--fp16 \
# --amp \
#--json-summary /public/home/hepj/out_dir/tourch/SQuAD/results.json
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
#echo GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
#GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=1
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
# The section below was modified
export HIP_LAUNCH_BLOCKING=1
export HIP_VISIBLE_DEVICES=0,1,2,3
export PATH_PHRASE2=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
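# fp16/AMP 4-GPU phase-2 run; HIP_LAUNCH_BLOCKING=1 above makes kernel launches
# synchronous, which is typically only useful for debugging.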
APP="python3 run_pretraining_v4.py \
--input_dir=${PATH_PHRASE2} \
--output_dir=/public/home/hepj/outdir/tourch/pre_wiki4/phrase2/fp16 \
--config_file=/public/home/hepj/model_source/pytorch_bert/bert_config.json \
--bert_model=bert-large-uncased \
--train_batch_size=2 \
--max_seq_length=512 \
--max_predictions_per_seq=80 \
--max_steps=40000 \
--warmup_proportion=0.128 \
--num_steps_per_checkpoint=10000 \
--learning_rate=4.0e-3 \
--seed=12439 \
--gradient_accumulation_steps=1 \
--allreduce_post_accumulation \
--do_train \
--phase2 \
--phase1_end_step 0 \
--local_rank ${comm_rank} \
--world_size 4 \
--gpus_per_node 1 \
--fp16 \
--amp \
--dist_url tcp://localhost:34567 \
--json-summary /public/home/hepj/outdir/torch/pre_wiki4/phrase2/fp16/dllogger.json
"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
#echo GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
#GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
#export MIOPEN_DEBUG_DISABLE_FIND_DB=1
#export NCCL_SOCKET_IFNAME=ib0
#export HSA_USERPTR_FOR_PAGED_MEM=0
#export MIOPEN_DEBUG_DISABLE_FIND_DB=1
#export NCCL_SOCKET_IFNAME=eno1
export HSA_FORCE_FINE_GRAIN_PCIE=1
#source /public/software/apps/DeepLearning/PyTorch/pytorch-env.sh
export MIOPEN_FIND_MODE=3
#export MIOPEN_ENABLE_LOGGING_CMD=1
#export ROCBLAS_LAYER=3
module unload compiler/rocm/2.9
#export MIOPEN_DEBUG_CONV_DIRECT_ASM_3X3U=0
#export MIOPEN_DEBUG_CONV_DIRECT_ASM_WRW3X3=0
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
#module load apps/PyTorch/1.5.0a0/hpcx-2.4.1-gcc-7.3.1-rocm3.3
#source /public/home/aiss/Pytorch/env_rocm3.3_torch1.5.sh
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
#module load compiler/rocm/3.9.1
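# Single-process SQuAD v1.1 fine-tuning and prediction (--local_rank -1, one GPU
# per MPI rank); the data, checkpoint, and output paths below point at the user's
# cluster home directory.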
APP="python3 run_squad_v1.py \
--train_file /public/home/hepj/data/sq1.1/train-v1.1.json \
--predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json \
--init_checkpoint /public/home/hepj/model_source/pytorch_bert/model.ckpt-28252.pt \
--vocab_file /public/home/hepj/model_source/pytorch_bert/vocab.txt \
--output_dir /public/home/hepj/outdir/torch/SQuAD \
--config_file /public/home/hepj/model_source/pytorch_bert/bert_config.json \
--json-summary ./log/results.json \
--bert_model bert-large-uncased \
--do_train \
--do_predict \
--train_batch_size 4 \
--predict_batch_size 4 \
--gpus_per_node 1 \
--local_rank -1 \
--eval_script ./evaluate-v1.1.py
"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
#echo GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
#GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac