Commit e5ca7e62 authored by hepj987

Initialize repository

#!/bin/bash
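# Launches an interactive container from an image named "bert" (assumed to be built
# beforehand), mounting the current directory at /workspace/bert and ./results at /results.
# Example (all arguments optional; defaults are /bin/bash, all GPUs, host networking):
#   bash <this script> "/bin/bash" all host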
CMD=${1:-/bin/bash}
NV_VISIBLE_DEVICES=${2:-"all"}
DOCKER_BRIDGE=${3:-"host"}
docker run -it --rm \
--gpus device=$NV_VISIBLE_DEVICES \
--net=$DOCKER_BRIDGE \
--shm-size=1g \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
-e LD_LIBRARY_PATH='/workspace/install/lib/' \
-v $PWD:/workspace/bert \
-v $PWD/results:/results \
bert $CMD
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
echo "Container nvidia build = " $NVIDIA_BUILD_ID
init_checkpoint=${1:-"/workspace/bert/checkpoints/bert_uncased.pt"}
data_dir=${2:-"$BERT_PREP_WORKING_DIR/download/glue/MRPC/"}
vocab_file=${3:-"$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"}
config_file=${4:-"/workspace/bert/bert_config.json"}
out_dir=${5:-"/workspace/bert/results/MRPC"}
task_name=${6:-"mrpc"}
num_gpu=${7:-"8"}
batch_size=${8:-"16"}
gradient_accumulation_steps=${9:-"1"}
learning_rate=${10:-"2.4e-5"}
warmup_proportion=${11:-"0.1"}
epochs=${12:-"3"}
max_steps=${13:-"-1.0"}
precision=${14:-"fp16"}
seed=${15:-"2"}
mode=${16:-"train eval"}
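# All arguments are positional with the defaults shown above, e.g. (assuming this
# file is saved as run_glue.sh):
#   bash run_glue.sh /path/to/bert_uncased.pt /path/to/MRPC /path/to/vocab.txt \
#        bert_config.json results/MRPC mrpc 8 16 1 2.4e-5 0.1 3 -1.0 fp16 2 "train eval"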
mkdir -p $out_dir
if [ "$mode" = "eval" ] ; then
num_gpu=1
fi
use_fp16=""
if [ "$precision" = "fp16" ] ; then
echo "fp16 activated!"
use_fp16="--fp16"
fi
if [ "$num_gpu" = "1" ] ; then
export CUDA_VISIBLE_DEVICES=0
mpi_command=""
else
unset CUDA_VISIBLE_DEVICES
mpi_command=" -m torch.distributed.launch --nproc_per_node=$num_gpu"
fi
CMD="python $mpi_command run_glue.py "
CMD+="--task_name ${task_name} "
if [[ $mode == *"train"* ]] ; then
CMD+="--do_train "
CMD+="--train_batch_size=$batch_size "
fi
if [[ $mode == *"eval"* ]] || [[ $mode == *"prediction"* ]]; then
if [[ $mode == *"eval"* ]] ; then
CMD+="--do_eval "
fi
if [[ $mode == *"prediction"* ]] ; then
CMD+="--do_predict "
fi
CMD+="--eval_batch_size=$batch_size "
fi
CMD+="--gradient_accumulation_steps=$gradient_accumulation_steps "
CMD+="--do_lower_case "
CMD+="--data_dir $data_dir "
CMD+="--bert_model bert-large-uncased "
CMD+="--seed $seed "
CMD+="--init_checkpoint $init_checkpoint "
CMD+="--warmup_proportion $warmup_proportion "
CMD+="--max_seq_length 128 "
CMD+="--learning_rate $learning_rate "
CMD+="--num_train_epochs $epochs "
CMD+="--max_steps $max_steps "
CMD+="--vocab_file=$vocab_file "
CMD+="--config_file=$config_file "
CMD+="--output_dir $out_dir "
CMD+="$use_fp16"
LOGFILE=$out_dir/logfile
$CMD |& tee $LOGFILE
#!/bin/bash
#set -x
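# Multi-node launcher: reads node names from ./hostfile, assumes 4 GPUs per node,
# writes a de-duplicated hostfile-dl with "slots=4" per node, and starts one MPI
# rank per GPU running single_process_pretrain.sh with the first host as the
# torch.distributed master address.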
hostfile=./hostfile
num_node=$(cat $hostfile|sort|uniq |wc -l)
num_gpu=$(($num_node*4))
nodename=$(cat $hostfile |sed -n "1p")
dist_url=`echo $nodename | awk '{print $1}'`
echo $dist_url
rm -f `pwd`/hostfile-dl
cat $hostfile|sort|uniq >`pwd`/tmp
for i in `cat ./tmp`
do
echo ${i} slots=4 >> `pwd`/hostfile-dl
done
mpirun -np ${num_gpu} --hostfile `pwd`/hostfile-dl --bind-to none `pwd`/single_process_pretrain.sh $dist_url
#!/bin/bash
#set -x
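# Phase-2 launcher: same hostfile handling as above, but also forwards
# LD_LIBRARY_PATH and PATH to the remote ranks and passes "results" as the
# output subdirectory to the per-rank script.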
hostfile=./hostfile
num_node=$(cat $hostfile|sort|uniq |wc -l)
num_gpu=$(($num_node*4))
nodename=$(cat $hostfile |sed -n "1p")
dist_url=`echo $nodename | awk '{print $1}'`
echo $dist_url
rm -f `pwd`/hostfile-dl
cat $hostfile|sort|uniq >`pwd`/tmp
for i in `cat ./tmp`
do
echo ${i} slots=4 >> `pwd`/hostfile-dl
done
mpirun -np ${num_gpu} --hostfile `pwd`/hostfile-dl --bind-to none -x LD_LIBRARY_PATH -x PATH `pwd`/single_process_pretrain2.sh $dist_url results
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
echo "Container nvidia build = " $NVIDIA_BUILD_ID
train_batch_size=${1:-8192}
learning_rate=${2:-"6e-3"}
precision=${3:-"fp16"}
num_gpus=${4:-8}
warmup_proportion=${5:-"0.2843"}
train_steps=${6:-7038}
save_checkpoint_steps=${7:-200}
resume_training=${8:-"false"}
create_logfile=${9:-"true"}
accumulate_gradients=${10:-"true"}
gradient_accumulation_steps=${11:-128}
seed=${12:-12439}
job_name=${13:-"bert_lamb_pretraining"}
allreduce_post_accumulation=${14:-"true"}
allreduce_post_accumulation_fp16=${15:-"true"}
train_batch_size_phase2=${16:-4096}
learning_rate_phase2=${17:-"4e-3"}
warmup_proportion_phase2=${18:-"0.128"}
train_steps_phase2=${19:-1563}
gradient_accumulation_steps_phase2=${20:-512}
DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
DATA_DIR_PHASE1=${21:-$BERT_PREP_WORKING_DIR/${DATASET}/}
BERT_CONFIG=bert_config.json
DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets
DATA_DIR_PHASE2=${22:-$BERT_PREP_WORKING_DIR/${DATASET2}/}
CODEDIR=${23:-"/workspace/bert"}
init_checkpoint=${24:-"None"}
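# All 24 arguments are positional with the defaults shown above, e.g. (assuming this
# file is saved as run_pretraining.sh):
#   bash run_pretraining.sh 8192 6e-3 fp16 8 0.2843 7038 200 false true true 128 12439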
RESULTS_DIR=$CODEDIR/results
CHECKPOINTS_DIR=$RESULTS_DIR/checkpoints
mkdir -p $CHECKPOINTS_DIR
if [ ! -d "$DATA_DIR_PHASE1" ] ; then
echo "Warning! $DATA_DIR_PHASE1 directory missing. Training cannot start"
fi
if [ ! -d "$RESULTS_DIR" ] ; then
echo "Error! $RESULTS_DIR directory missing."
exit -1
fi
if [ ! -d "$CHECKPOINTS_DIR" ] ; then
echo "Warning! $CHECKPOINTS_DIR directory missing."
echo "Checkpoints will be written to $RESULTS_DIR instead."
CHECKPOINTS_DIR=$RESULTS_DIR
fi
if [ ! -f "$BERT_CONFIG" ] ; then
echo "Error! BERT large configuration file not found at $BERT_CONFIG"
exit -1
fi
PREC=""
if [ "$precision" = "fp16" ] ; then
PREC="--fp16"
elif [ "$precision" = "fp32" ] ; then
PREC=""
elif [ "$precision" = "tf32" ] ; then
PREC=""
else
echo "Unknown <precision> argument"
exit -2
fi
ACCUMULATE_GRADIENTS=""
if [ "$accumulate_gradients" == "true" ] ; then
ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps"
fi
CHECKPOINT=""
if [ "$resume_training" == "true" ] ; then
CHECKPOINT="--resume_from_checkpoint"
fi
ALL_REDUCE_POST_ACCUMULATION=""
if [ "$allreduce_post_accumulation" == "true" ] ; then
ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation"
fi
ALL_REDUCE_POST_ACCUMULATION_FP16=""
if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then
ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16"
fi
INIT_CHECKPOINT=""
if [ "$init_checkpoint" != "None" ] ; then
INIT_CHECKPOINT="--init_checkpoint=$init_checkpoint"
fi
echo $DATA_DIR_PHASE1
INPUT_DIR=$DATA_DIR_PHASE1
CMD=" $CODEDIR/run_pretraining.py"
CMD+=" --input_dir=$DATA_DIR_PHASE1"
CMD+=" --output_dir=$CHECKPOINTS_DIR"
CMD+=" --config_file=$BERT_CONFIG"
CMD+=" --bert_model=bert-large-uncased"
CMD+=" --train_batch_size=$train_batch_size"
CMD+=" --max_seq_length=128"
CMD+=" --max_predictions_per_seq=20"
CMD+=" --max_steps=$train_steps"
CMD+=" --warmup_proportion=$warmup_proportion"
CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
CMD+=" --learning_rate=$learning_rate"
CMD+=" --seed=$seed"
CMD+=" $PREC"
CMD+=" $ACCUMULATE_GRADIENTS"
CMD+=" $CHECKPOINT"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
CMD+=" $INIT_CHECKPOINT"
CMD+=" --do_train"
CMD+=" --json-summary ${RESULTS_DIR}/dllogger.json "
CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
if [ "$create_logfile" = "true" ] ; then
export GBS=$(expr $train_batch_size \* $num_gpus)
printf -v TAG "pyt_bert_pretraining_phase1_%s_gbs%d" "$precision" $GBS
DATESTAMP=`date +'%y%m%d%H%M%S'`
LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log
printf "Logs written to %s\n" "$LOGFILE"
fi
set -x
if [ -z "$LOGFILE" ] ; then
$CMD
else
(
$CMD
) |& tee $LOGFILE
fi
set +x
echo "finished pretraining"
#Start Phase2
PREC=""
if [ "$precision" = "fp16" ] ; then
PREC="--fp16"
elif [ "$precision" = "fp32" ] ; then
PREC=""
elif [ "$precision" = "tf32" ] ; then
PREC=""
else
echo "Unknown <precision> argument"
exit -2
fi
ACCUMULATE_GRADIENTS=""
if [ "$accumulate_gradients" == "true" ] ; then
ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps_phase2"
fi
ALL_REDUCE_POST_ACCUMULATION=""
if [ "$allreduce_post_accumulation" == "true" ] ; then
ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation"
fi
ALL_REDUCE_POST_ACCUMULATION_FP16=""
if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then
ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16"
fi
echo $DATA_DIR_PHASE2
INPUT_DIR=$DATA_DIR_PHASE2
CMD=" $CODEDIR/run_pretraining.py"
CMD+=" --input_dir=$DATA_DIR_PHASE2"
CMD+=" --output_dir=$CHECKPOINTS_DIR"
CMD+=" --config_file=$BERT_CONFIG"
CMD+=" --bert_model=bert-large-uncased"
CMD+=" --train_batch_size=$train_batch_size_phase2"
CMD+=" --max_seq_length=512"
CMD+=" --max_predictions_per_seq=80"
CMD+=" --max_steps=$train_steps_phase2"
CMD+=" --warmup_proportion=$warmup_proportion_phase2"
CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
CMD+=" --learning_rate=$learning_rate_phase2"
CMD+=" --seed=$seed"
CMD+=" $PREC"
CMD+=" $ACCUMULATE_GRADIENTS"
CMD+=" $CHECKPOINT"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
CMD+=" --do_train --phase2 --resume_from_checkpoint --phase1_end_step=$train_steps"
CMD+=" --json-summary ${RESULTS_DIR}/dllogger.json "
CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
if [ "$create_logfile" = "true" ] ; then
export GBS=$(expr $train_batch_size_phase2 \* $num_gpus)
printf -v TAG "pyt_bert_pretraining_phase2_%s_gbs%d" "$precision" $GBS
DATESTAMP=`date +'%y%m%d%H%M%S'`
LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log
printf "Logs written to %s\n" "$LOGFILE"
fi
set -x
if [ -z "$LOGFILE" ] ; then
$CMD
else
(
$CMD
) |& tee $LOGFILE
fi
set +x
echo "finished phase2"
#!/usr/bin/env bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
echo "Container nvidia build = " $NVIDIA_BUILD_ID
init_checkpoint=${1:-"/workspace/bert/checkpoints/bert_uncased.pt"}
epochs=${2:-"2.0"}
batch_size=${3:-"4"}
learning_rate=${4:-"3e-5"}
precision=${5:-"fp16"}
num_gpu=${6:-"8"}
seed=${7:-"1"}
squad_dir=${8:-"$BERT_PREP_WORKING_DIR/download/squad/v1.1"}
vocab_file=${9:-"$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"}
OUT_DIR=${10:-"/workspace/bert/results/SQuAD"}
mode=${11:-"train eval"}
CONFIG_FILE=${12:-"/workspace/bert/bert_config.json"}
max_steps=${13:-"-1"}
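# Arguments are positional with the defaults shown above, e.g. (assuming this file
# is saved as run_squad.sh):
#   bash run_squad.sh checkpoints/bert_uncased.pt 2.0 4 3e-5 fp16 8 1 $BERT_PREP_WORKING_DIR/download/squad/v1.1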
echo "out dir is $OUT_DIR"
mkdir -p $OUT_DIR
if [ ! -d "$OUT_DIR" ]; then
echo "ERROR: non existing $OUT_DIR"
exit 1
fi
use_fp16=""
if [ "$precision" = "fp16" ] ; then
echo "fp16 activated!"
use_fp16=" --fp16 "
fi
if [ "$num_gpu" = "1" ] ; then
export CUDA_VISIBLE_DEVICES=0
mpi_command=""
else
unset CUDA_VISIBLE_DEVICES
mpi_command=" -m torch.distributed.launch --nproc_per_node=$num_gpu"
fi
CMD="python $mpi_command run_squad.py "
CMD+="--init_checkpoint=$init_checkpoint "
if [ "$mode" = "train" ] ; then
CMD+="--do_train "
CMD+="--train_file=$squad_dir/train-v1.1.json "
CMD+="--train_batch_size=$batch_size "
elif [ "$mode" = "eval" ] ; then
CMD+="--do_predict "
CMD+="--predict_file=$squad_dir/dev-v1.1.json "
CMD+="--predict_batch_size=$batch_size "
CMD+="--eval_script=$squad_dir/evaluate-v1.1.py "
CMD+="--do_eval "
elif [ "$mode" = "prediction" ] ; then
CMD+="--do_predict "
CMD+="--predict_file=$squad_dir/dev-v1.1.json "
CMD+="--predict_batch_size=$batch_size "
else
CMD+=" --do_train "
CMD+=" --train_file=$squad_dir/train-v1.1.json "
CMD+=" --train_batch_size=$batch_size "
CMD+="--do_predict "
CMD+="--predict_file=$squad_dir/dev-v1.1.json "
CMD+="--predict_batch_size=$batch_size "
CMD+="--eval_script=$squad_dir/evaluate-v1.1.py "
CMD+="--do_eval "
fi
CMD+=" --do_lower_case "
CMD+=" --bert_model=bert-large-uncased "
CMD+=" --learning_rate=$learning_rate "
CMD+=" --seed=$seed "
CMD+=" --num_train_epochs=$epochs "
CMD+=" --max_seq_length=384 "
CMD+=" --doc_stride=128 "
CMD+=" --output_dir=$OUT_DIR "
CMD+=" --vocab_file=$vocab_file "
CMD+=" --config_file=$CONFIG_FILE "
CMD+=" --max_steps=$max_steps "
CMD+=" $use_fp16"
LOGFILE=$OUT_DIR/logfile.txt
echo "$CMD |& tee $LOGFILE"
time $CMD |& tee $LOGFILE
#!/bin/bash
#set -x
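# Multi-node SQuAD launcher: builds hostfile-dl (4 slots per node) from ./hostfile
# and runs single_process_squad.sh on every rank with the first host as the master address.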
hostfile=./hostfile
num_node=$(cat $hostfile|sort|uniq |wc -l)
num_gpu=$(($num_node*4))
nodename=$(cat $hostfile |sed -n "1p")
dist_url=`echo $nodename | awk '{print $1}'`
echo $dist_url
rm -f `pwd`/hostfile-dl
cat $hostfile|sort|uniq >`pwd`/tmp
for i in `cat ./tmp`
do
echo ${i} slots=4 >> `pwd`/hostfile-dl
done
mpirun -np ${num_gpu} --hostfile `pwd`/hostfile-dl --bind-to none `pwd`/single_process_squad.sh $dist_url
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
SWAG_DIR=/workspace/bert/data/swag
OUT_DIR=/results/SWAG
mkdir -p $OUT_DIR
echo "Container nvidia build = " $NVIDIA_BUILD_ID
init_checkpoint=${1}
mode=${2:-"train"}
max_steps=${3:-"-1.0"} # if < 0, has no effect
batch_size=${4:-"12"}
learning_rate=${5:-"5e-6"}
precision=${6:-"fp32"}
num_gpu=${7:-"8"}
epochs=${8:-"2"}
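# Arguments are positional with the defaults shown above; the first argument (the
# initial checkpoint) has no default and must be supplied, e.g. (assuming this file
# is saved as run_swag.sh):
#   bash run_swag.sh checkpoints/bert_uncased.pt train -1.0 12 5e-6 fp32 8 2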
if [ "$mode" != "train" ] ; then
num_gpu=1
fi
use_fp16=""
if [ "$precision" = "fp16" ] ; then
echo "fp16 activated!"
use_fp16="--fp16"
fi
if [ "$num_gpu" = "1" ] ; then
mpi_command=""
else
mpi_command="torch.distributed.launch --nproc_per_node=$num_gpu"
fi
CMD="python -m $mpi_command run_swag.py "
CMD+="--init_checkpoint=$init_checkpoint "
if [ "$mode" = "train" ] ; then
CMD+="--do_train "
CMD+="--train_batch_size=$batch_size "
else
CMD+="--do_eval "
CMD+="--eval_batch_size=$batch_size "
fi
CMD+="--do_lower_case "
CMD+="--data_dir $SWAG_DIR/data/ "
CMD+="--bert_model bert-large-uncased "
CMD+="--max_seq_length 128 "
CMD+="--learning_rate $learning_rate "
CMD+="--num_train_epochs $epochs "
CMD+="--max_steps $max_steps "
CMD+="--output_dir $OUT_DIR "
CMD+="$use_fp16"
LOGFILE=$OUT_DIR/logfile
$CMD |& tee $LOGFILE
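# Strip carriage returns and ANSI cursor-up sequences from the tqdm progress output
# so the last "Iteration ... it/s" line can be grepped for throughput.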
sed -r 's/\r|\x1b\[A/\n/g' $LOGFILE > $LOGFILE.edit
throughput=`cat $LOGFILE.edit | grep -E 'Iteration.*[0-9.]+(s/it|it/s)' | tail -1 | egrep -o '[0-9.]+(s/it|it/s)'`
echo "throughput: $throughput"
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
WORKSPACE="/workspace"
#parameters
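# $1 is the master node address used below in --dist_url; it is supplied by the
# mpirun launcher. The case block at the end pins each local rank to one NUMA node
# and its matching mlx5 HCA / ib interface.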
train_batch_size=24
learning_rate=${2:-"6e-3"}
precision=${3:-"fp16"}
num_gpus=${4:-4}
warmup_proportion=${5:-"0.2843"}
train_steps=${6:-7038}
save_checkpoint_steps=${7:-200}
resume_training=${8:-"false"}
create_logfile=${9:-"true"}
accumulate_gradients=${10:-"true"}
gradient_accumulation_steps=${11:-4}
seed=${12:-12439}
job_name=${13:-"bert_lamb_pretraining"}
allreduce_post_accumulation=${14:-"true"}
allreduce_post_accumulation_fp16=${15:-"true"}
train_batch_size_phase2=${16:-4096}
learning_rate_phase2=${17:-"4e-3"}
warmup_proportion_phase2=${18:-"0.128"}
train_steps_phase2=${19:-1563}
gradient_accumulation_steps_phase2=${20:-512}
DATASET=${WORKSPACE}/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training # change this for other datasets
DATA_DIR_PHASE1=${21:-$BERT_PREP_WORKING_DIR/${DATASET}/}
BERT_CONFIG=${WORKSPACE}/uncased_L-12_H-768_A-12/bert_config.json
CODEDIR=${22:-"`pwd`"}
init_checkpoint=${23:-"None"}
RESULTS_DIR=$CODEDIR/results
CHECKPOINTS_DIR=$RESULTS_DIR/checkpoints
mkdir -p $CHECKPOINTS_DIR
if [ ! -d "$DATA_DIR_PHASE1" ] ; then
echo "Warning! $DATA_DIR_PHASE1 directory missing. Training cannot start"
fi
if [ ! -d "$RESULTS_DIR" ] ; then
echo "Error! $RESULTS_DIR directory missing."
exit -1
fi
if [ ! -d "$CHECKPOINTS_DIR" ] ; then
echo "Warning! $CHECKPOINTS_DIR directory missing."
echo "Checkpoints will be written to $RESULTS_DIR instead."
CHECKPOINTS_DIR=$RESULTS_DIR
fi
if [ ! -f "$BERT_CONFIG" ] ; then
echo "Error! BERT large configuration file not found at $BERT_CONFIG"
exit -1
fi
PREC=""
if [ "$precision" = "fp16" ] ; then
PREC="--fp16"
elif [ "$precision" = "fp32" ] ; then
PREC=""
elif [ "$precision" = "tf32" ] ; then
PREC=""
else
echo "Unknown <precision> argument"
exit -2
fi
ACCUMULATE_GRADIENTS=""
if [ "$accumulate_gradients" == "true" ] ; then
ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps"
fi
CHECKPOINT=""
if [ "$resume_training" == "true" ] ; then
CHECKPOINT="--resume_from_checkpoint"
fi
ALL_REDUCE_POST_ACCUMULATION=""
if [ "$allreduce_post_accumulation" == "true" ] ; then
ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation"
fi
ALL_REDUCE_POST_ACCUMULATION_FP16=""
if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then
ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16"
fi
INIT_CHECKPOINT=""
if [ "$init_checkpoint" != "None" ] ; then
INIT_CHECKPOINT="--init_checkpoint=$init_checkpoint"
fi
echo $DATA_DIR_PHASE1
INPUT_DIR=$DATA_DIR_PHASE1
CMD=" $CODEDIR/run_pretraining_v1.py"
CMD+=" --input_dir=$DATA_DIR_PHASE1"
CMD+=" --output_dir=$CHECKPOINTS_DIR"
CMD+=" --config_file=$BERT_CONFIG"
CMD+=" --bert_model=bert-large-uncased"
CMD+=" --train_batch_size=$train_batch_size"
CMD+=" --max_seq_length=128"
CMD+=" --max_predictions_per_seq=20"
CMD+=" --max_steps=$train_steps"
CMD+=" --warmup_proportion=$warmup_proportion"
CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
CMD+=" --learning_rate=$learning_rate"
CMD+=" --seed=$seed"
CMD+=" $PREC"
CMD+=" $ACCUMULATE_GRADIENTS"
CMD+=" $CHECKPOINT"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
CMD+=" $INIT_CHECKPOINT"
CMD+=" --do_train"
CMD+=" --json-summary ${RESULTS_DIR}/dllogger.json "
CMD+=" --local_rank ${comm_rank} "
CMD+=" --dist_url tcp://${1}:45679 "
CMD+=" --world_size ${comm_size} "
#CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
APP="python3 $CMD"
if [ "$create_logfile" = "true" ] ; then
export GBS=$(expr $train_batch_size \* $num_gpus)
printf -v TAG "pyt_bert_pretraining_phase1_%s_gbs%d" "$precision" $GBS
DATESTAMP=`date +'%y%m%d%H%M%S'`
LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log
printf "Logs written to %s\n" "$LOGFILE"
fi
echo ${CMD}
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
GLOO_SOCKET_IFNAME=ib1 numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
GLOO_SOCKET_IFNAME=ib2 numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
GLOO_SOCKET_IFNAME=ib3 numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
#env
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
WORKSPACE=""
#parameters
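# $1 is the master node address for --dist_url and $2 the results subdirectory
# under the code dir (the launcher passes "results"); both come from mpirun.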
precision=${3:-"fp32"}
#1 epoch ~ 10224512 step
train_steps=${6:-10224512}
save_checkpoint_steps=${7:-40000}
accumulate_gradients=${10:-"true"}
gradient_accumulation_steps=${11:-1}
seed=${12:-12439}
job_name=${13:-"bert_pretraining_vs_dgx1"}
allreduce_post_accumulation=${14:-"true"}
allreduce_post_accumulation_fp16=${15:-"false"}
train_batch_size_phase2=${16:-4}
learning_rate_phase2=${17:-"4e-3"}
warmup_proportion_phase2=${18:-"0.128"}
train_steps_phase2=${19:-10224512}
gradient_accumulation_steps_phase2=${5:-1}
#bert base
#BERT_CONFIG=${WORKSPACE}/uncased_L-12_H-768_A-12/bert_config.json
#bert large
BERT_CONFIG=${WORKSPACE}/bert_large_uncased/bert_config.json
DATASET2=${WORKSPACE}/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training # change this for other datasets
DATA_DIR_PHASE2=${4:-${DATASET2}/}
CODEDIR=${9:-"`pwd`"}
init_checkpoint=${8:-"None"}
RESULTS_DIR=$CODEDIR/${2}
CHECKPOINTS_DIR=$RESULTS_DIR/checkpoints
if [ ! -d "$RESULTS_DIR" ] ; then
echo "Error! $RESULTS_DIR directory missing."
exit -1
fi
if [ ! -d "$CHECKPOINTS_DIR" ] ; then
echo "Warning! $CHECKPOINTS_DIR directory missing."
echo "Checkpoints will be written to $RESULTS_DIR instead."
CHECKPOINTS_DIR=$RESULTS_DIR
fi
if [ ! -f "$BERT_CONFIG" ] ; then
echo "Error! BERT large configuration file not found at $BERT_CONFIG"
exit -1
fi
#Start Phase2
PREC=""
if [ "$precision" = "fp16" ] ; then
PREC="--fp16"
elif [ "$precision" = "fp32" ] ; then
PREC=""
elif [ "$precision" = "tf32" ] ; then
PREC=""
else
echo "Unknown <precision> argument"
exit -2
fi
ACCUMULATE_GRADIENTS=""
if [ "$accumulate_gradients" == "true" ] ; then
ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps_phase2"
fi
ALL_REDUCE_POST_ACCUMULATION=""
if [ "$allreduce_post_accumulation" == "true" ] ; then
ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation"
fi
ALL_REDUCE_POST_ACCUMULATION_FP16=""
if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then
ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16"
fi
echo $DATA_DIR_PHASE2
INPUT_DIR=$DATA_DIR_PHASE2
CMD=" $CODEDIR/run_pretraining_v1_v2.py"
CMD+=" --input_dir=$DATA_DIR_PHASE2"
CMD+=" --output_dir=$CHECKPOINTS_DIR"
CMD+=" --config_file=$BERT_CONFIG"
CMD+=" --bert_model=bert-large-uncased"
CMD+=" --train_batch_size=$train_batch_size_phase2"
CMD+=" --max_seq_length=512"
CMD+=" --max_predictions_per_seq=80"
CMD+=" --max_steps=$train_steps_phase2"
CMD+=" --warmup_proportion=$warmup_proportion_phase2"
CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
CMD+=" --learning_rate=$learning_rate_phase2"
CMD+=" --seed=$seed"
CMD+=" $PREC"
CMD+=" $ACCUMULATE_GRADIENTS"
CMD+=" $CHECKPOINT"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
CMD+=" --do_train --phase2 --resume_from_checkpoint --phase1_end_step=$train_steps"
CMD+=" --json-summary ${RESULTS_DIR}/dllogger.json "
CMD+=" --local_rank ${comm_rank} "
CMD+=" --dist_url tcp://${1}:45679 "
CMD+=" --world_size ${comm_size} "
APP="python3 $CMD"
set +x
echo ${CMD}
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
NCCL_SOCKET_IFNAME=ib1 numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
NCCL_SOCKET_IFNAME=ib2 numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
NCCL_SOCKET_IFNAME=ib3 numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
echo "finished phase2"
#!/bin/bash
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
HOME_PATH=""
export OMP_NUM_THREADS=8
#source code path
WORK_HOME="BERT"
#data path
DATA_DIR="BERT/data"
#pretrained model path
MODEL_DIR="bs64k_32k_ckpt"
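# $1: master node address for --dist_url, passed by the mpirun SQuAD launcher.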
APP="python3 ${WORK_HOME}/run_squad_v1.py --train_file ${DATA_DIR}/squad/v1.1/train-v1.1.json --init_checkpoint ${MODEL_DIR}/model.ckpt-28252.pt --vocab_file ${MODEL_DIR}/vocab.txt --output_dir ${WORK_HOME}/result/SQuAD --config_file ${MODEL_DIR}/bert_config.json --bert_model=bert-large-uncased --do_train --local_rank ${comm_rank} --train_batch_size 1 --gpus_per_node 1 --dist_url tcp://${1}:45679 --world_size ${comm_size}"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
#export MIOPEN_DEBUG_DISABLE_FIND_DB=1
#export NCCL_SOCKET_IFNAME=ib0
#export HSA_USERPTR_FOR_PAGED_MEM=0
#export MIOPEN_DEBUG_DISABLE_FIND_DB=1
#export NCCL_SOCKET_IFNAME=eno1
export HSA_FORCE_FINE_GRAIN_PCIE=1
#source /public/software/apps/DeepLearning/PyTorch/pytorch-env.sh
export MIOPEN_FIND_MODE=1
#export MIOPEN_ENABLE_LOGGING_CMD=1
#export ROCBLAS_LAYER=3
module unload compiler/rocm/2.9
#export MIOPEN_DEBUG_CONV_DIRECT_ASM_3X3U=0
#export MIOPEN_DEBUG_CONV_DIRECT_ASM_WRW3X3=0
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
#module load apps/PyTorch/1.5.0a0/hpcx-2.4.1-gcc-7.3.1-rocm3.3
#source /public/home/aiss/Pytorch/env_rocm3.3_torch1.5.sh
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
# The section below was modified
#export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#HSA_FORCE_FINE_GRAIN_PCIE=1 numactl --cpunodebind=4,5,6,7 --membind=4,5,6,7
#module load compiler/rocm/3.9.1
export PATH_PHRASE1=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
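# Single-node phase-1 pretraining (seq len 128, fp32): each MPI local rank is given
# one GPU and pinned to the matching NUMA node with numactl.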
APP="python3 run_pretraining_v1.py \
--input_dir=${PATH_PHRASE1} \
--output_dir=/public/home/hepj/outdir/torch/pre_wiki/phrase1 \
--config_file=./bert_config.json \
--bert_model=bert-large-uncased \
--train_batch_size=16 \
--max_seq_length=128 \
--max_predictions_per_seq=20 \
--max_steps=100000 \
--warmup_proportion=0.0 \
--num_steps_per_checkpoint=20000 \
--learning_rate=4.0e-4 \
--seed=12439 \
--gradient_accumulation_steps=1 \
--allreduce_post_accumulation \
--gpus_per_node 1 \
--do_train \
--json-summary /public/home/hepj/outdir/torch/pre_wiki/phrase1/dllogger.json
"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
#echo GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
#GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
#export MIOPEN_DEBUG_DISABLE_FIND_DB=1
#export NCCL_SOCKET_IFNAME=ib0
#export HSA_USERPTR_FOR_PAGED_MEM=0
#export MIOPEN_DEBUG_DISABLE_FIND_DB=1
#export NCCL_SOCKET_IFNAME=eno1
export HSA_FORCE_FINE_GRAIN_PCIE=1
#source /public/software/apps/DeepLearning/PyTorch/pytorch-env.sh
export MIOPEN_FIND_MODE=1
#export MIOPEN_ENABLE_LOGGING_CMD=1
#export ROCBLAS_LAYER=3
module unload compiler/rocm/2.9
#export MIOPEN_DEBUG_CONV_DIRECT_ASM_3X3U=0
#export MIOPEN_DEBUG_CONV_DIRECT_ASM_WRW3X3=0
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
#module load apps/PyTorch/1.5.0a0/hpcx-2.4.1-gcc-7.3.1-rocm3.3
#source /public/home/aiss/Pytorch/env_rocm3.3_torch1.5.sh
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
# The section below was modified
#export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#HSA_FORCE_FINE_GRAIN_PCIE=1 numactl --cpunodebind=4,5,6,7 --membind=4,5,6,7
#module load compiler/rocm/3.9.1
export PATH_PHRASE1=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
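# fp16/AMP variant of the phase-1 script above; it differs only in the --fp16 --amp
# flags and a much shorter checkpoint interval (20 steps).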
APP="python3 run_pretraining_v1.py \
--input_dir=${PATH_PHRASE1} \
--output_dir=/public/home/hepj/outdir/torch/pre_wiki/phrase1 \
--config_file=./bert_config.json \
--bert_model=bert-large-uncased \
--train_batch_size=16 \
--max_seq_length=128 \
--max_predictions_per_seq=20 \
--max_steps=100000 \
--warmup_proportion=0.0 \
--num_steps_per_checkpoint=20 \
--learning_rate=4.0e-4 \
--seed=12439 \
--fp16 \
--amp \
--gradient_accumulation_steps=1 \
--allreduce_post_accumulation \
--gpus_per_node 1 \
--do_train \
--json-summary /public/home/hepj/outdir/torch/pre_wiki/phrase1/dllogger.json
"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
#echo GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
#GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=1
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
# The section below was modified
export HIP_VISIBLE_DEVICES=0,1,2,3
export PATH_PHRASE1=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
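# 4-GPU distributed phase-1 run (fp32): each local rank drives one GPU; the ranks
# rendezvous over tcp://localhost:34567 with world_size 4.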
APP="python3 run_pretraining_v4.py \
--input_dir=${PATH_PHRASE1} \
--output_dir=/public/home/hepj/outdir/torch/pre_wiki4/phrase1/fp32 \
--config_file=/public/home/hepj/model_source/pytorch_bert/bert_config.json \
--bert_model=bert-large-uncased \
--train_batch_size=16 \
--max_seq_length=128 \
--max_predictions_per_seq=20 \
--max_steps=100000 \
--warmup_proportion=0.0 \
--num_steps_per_checkpoint=20000 \
--learning_rate=4.0e-4 \
--seed=12439 \
--gradient_accumulation_steps=1 \
--allreduce_post_accumulation \
--do_train \
--use_env \
--local_rank ${comm_rank} \
--world_size 4 \
--gpus_per_node 1 \
--dist_url tcp://localhost:34567 \
--json-summary /public/home/hepj/outdir/torch/pre_wiki4/phrase1/fp32/dllogger.json
"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=1
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
# The section below was modified
export HIP_VISIBLE_DEVICES=0,1,2,3
export PATH_PHRASE1=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
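# fp16/AMP variant of the 4-GPU phase-1 run above; note that the output and
# dllogger paths still point at the fp32 directory.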
APP="python3 run_pretraining_v4.py \
--input_dir=${PATH_PHRASE1} \
--output_dir=/public/home/hepj/outdir/torch/pre_wiki4/phrase1/fp32 \
--config_file=/public/home/hepj/model_source/pytorch_bert/bert_config.json \
--bert_model=bert-large-uncased \
--train_batch_size=16 \
--max_seq_length=128 \
--max_predictions_per_seq=20 \
--max_steps=100000 \
--warmup_proportion=0.0 \
--num_steps_per_checkpoint=20000 \
--learning_rate=4.0e-4 \
--seed=12439 \
--gradient_accumulation_steps=1 \
--allreduce_post_accumulation \
--do_train \
--fp16 \
--amp \
--use_env \
--local_rank ${comm_rank} \
--world_size 4 \
--gpus_per_node 1 \
--dist_url tcp://localhost:34567 \
--json-summary /public/home/hepj/outdir/torch/pre_wiki4/phrase1/fp32/dllogger.json
"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
#echo GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
#GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
#export MIOPEN_DEBUG_DISABLE_FIND_DB=1
#export NCCL_SOCKET_IFNAME=ib0
#export HSA_USERPTR_FOR_PAGED_MEM=0
#export MIOPEN_DEBUG_DISABLE_FIND_DB=1
#export NCCL_SOCKET_IFNAME=eno1
export HSA_FORCE_FINE_GRAIN_PCIE=1
#source /public/software/apps/DeepLearning/PyTorch/pytorch-env.sh
export MIOPEN_FIND_MODE=1
#export MIOPEN_ENABLE_LOGGING_CMD=1
#export ROCBLAS_LAYER=3
module unload compiler/rocm/2.9
#export MIOPEN_DEBUG_CONV_DIRECT_ASM_3X3U=0
#export MIOPEN_DEBUG_CONV_DIRECT_ASM_WRW3X3=0
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
#module load apps/PyTorch/1.5.0a0/hpcx-2.4.1-gcc-7.3.1-rocm3.3
#source /public/home/aiss/Pytorch/env_rocm3.3_torch1.5.sh
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
# The section below was modified
#export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#HSA_FORCE_FINE_GRAIN_PCIE=1 numactl --cpunodebind=4,5,6,7 --membind=4,5,6,7
#module load compiler/rocm/3.9.1
export PATH_PHRASE2=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
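# Phase-2 pretraining (seq len 512, fp32): with --local_rank -1 each MPI rank
# appears to run its own independent training process on a single GPU.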
APP="python3 run_pretraining_v1.py \
--input_dir=${PATH_PHRASE2} \
--output_dir=/public/home/hepj/outdir/torch/pre_wiki/phrase2/fp32 \
--config_file=/public/home/hepj/model_source/pytorch_bert/bert_config.json \
--bert_model=bert-large-uncased \
--train_batch_size=2 \
--max_seq_length=512 \
--max_predictions_per_seq=80 \
--max_steps=400000 \
--warmup_proportion=0.128 \
--num_steps_per_checkpoint=20000 \
--learning_rate=4.0e-3 \
--seed=12439 \
--gradient_accumulation_steps=1 \
--allreduce_post_accumulation \
--do_train \
--phase2 \
--phase1_end_step=0 \
--gpus_per_node 1 \
--local_rank -1 \
--json-summary /public/home/hepj/outdir/torch/pre_wiki/phrase2/fp32/dllogger_dtk22.04.json
"
#--fp16 \
# --amp \
#--json-summary /public/home/hepj/out_dir/tourch/SQuAD/results.json
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
#echo GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
#GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=1
#export MIOPEN_ENABLE_LOGGING_CMD=1
#export ROCBLAS_LAYER=3
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
# The section below was modified
export PATH_PHRASE2=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
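# fp16/AMP counterpart of the phase-2 script above, writing to the fp16 output directory.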
APP="python3 run_pretraining_v1.py \
--input_dir=${PATH_PHRASE2} \
--output_dir=/public/home/hepj/outdir/torch/pre_wiki/phrase2/fp16 \
--config_file=/public/home/hepj/model_source/pytorch_bert/bert_config.json \
--bert_model=bert-large-uncased \
--train_batch_size=2 \
--max_seq_length=512 \
--max_predictions_per_seq=80 \
--max_steps=400000 \
--warmup_proportion=0.128 \
--num_steps_per_checkpoint=20000 \
--learning_rate=4.0e-3 \
--seed=12439 \
--gradient_accumulation_steps=1 \
--allreduce_post_accumulation \
--do_train \
--fp16 \
--amp \
--phase2 \
--phase1_end_step=0 \
--gpus_per_node 1 \
--local_rank -1 \
--json-summary /public/home/hepj/outdir/torch/pre_wiki/phrase2/fp16/dllogger_dtk22.04.json
"
#--json-summary /public/home/hepj/out_dir/tourch/SQuAD/results.json
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
#echo GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
#GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=1
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
export HIP_VISIBLE_DEVICES=0,1,2,3
export PATH_PHRASE2=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
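# 4-GPU distributed phase-2 run (fp32): ranks rendezvous over tcp://localhost:34567
# with world_size 4 and one GPU per rank.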
APP="python3 run_pretraining_v4.py \
--input_dir=${PATH_PHRASE2} \
--output_dir=/public/home/hepj/outdir/tourch/pre_wiki4/phrase2/fp32 \
--config_file=/public/home/hepj/model_source/pytorch_bert/bert_config.json \
--bert_model=bert-large-uncased \
--train_batch_size=2 \
--max_seq_length=512 \
--max_predictions_per_seq=80 \
--max_steps=400000 \
--warmup_proportion=0.128 \
--num_steps_per_checkpoint=200000 \
--learning_rate=4.0e-3 \
--seed=12439 \
--gradient_accumulation_steps=1 \
--allreduce_post_accumulation \
--do_train \
--phase2 \
--phase1_end_step 0 \
--local_rank ${comm_rank} \
--world_size 4 \
--gpus_per_node 1 \
--dist_url tcp://localhost:34567 \
--json-summary /public/home/hepj/outdir/torch/pre_wiki4/phrase2/fp32/dllogger.json
"
#--fp16 \
# --amp \
#--json-summary /public/home/hepj/out_dir/tourch/SQuAD/results.json
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
#echo GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
#GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=1
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
# The section below was modified
export HIP_LAUNCH_BLOCKING=1
export HIP_VISIBLE_DEVICES=0,1,2,3
export PATH_PHRASE2=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
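# fp16/AMP 4-GPU phase-2 run; HIP_LAUNCH_BLOCKING=1 above makes kernel launches
# synchronous, which is typically only useful for debugging.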
APP="python3 run_pretraining_v4.py \
--input_dir=${PATH_PHRASE2} \
--output_dir=/public/home/hepj/outdir/tourch/pre_wiki4/phrase2/fp16 \
--config_file=/public/home/hepj/model_source/pytorch_bert/bert_config.json \
--bert_model=bert-large-uncased \
--train_batch_size=2 \
--max_seq_length=512 \
--max_predictions_per_seq=80 \
--max_steps=40000 \
--warmup_proportion=0.128 \
--num_steps_per_checkpoint=10000 \
--learning_rate=4.0e-3 \
--seed=12439 \
--gradient_accumulation_steps=1 \
--allreduce_post_accumulation \
--do_train \
--phase2 \
--phase1_end_step 0 \
--local_rank ${comm_rank} \
--world_size 4 \
--gpus_per_node 1 \
--fp16 \
--amp \
--dist_url tcp://localhost:34567 \
--json-summary /public/home/hepj/outdir/torch/pre_wiki4/phrase2/fp16/dllogger.json
"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
#echo GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
#GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
#export MIOPEN_DEBUG_DISABLE_FIND_DB=1
#export NCCL_SOCKET_IFNAME=ib0
#export HSA_USERPTR_FOR_PAGED_MEM=0
#export MIOPEN_DEBUG_DISABLE_FIND_DB=1
#export NCCL_SOCKET_IFNAME=eno1
export HSA_FORCE_FINE_GRAIN_PCIE=1
#source /public/software/apps/DeepLearning/PyTorch/pytorch-env.sh
export MIOPEN_FIND_MODE=3
#export MIOPEN_ENABLE_LOGGING_CMD=1
#export ROCBLAS_LAYER=3
module unload compiler/rocm/2.9
#export MIOPEN_DEBUG_CONV_DIRECT_ASM_3X3U=0
#export MIOPEN_DEBUG_CONV_DIRECT_ASM_WRW3X3=0
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
#module load apps/PyTorch/1.5.0a0/hpcx-2.4.1-gcc-7.3.1-rocm3.3
#source /public/home/aiss/Pytorch/env_rocm3.3_torch1.5.sh
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
#module load compiler/rocm/3.9.1
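# Single-process SQuAD v1.1 fine-tuning and prediction (--local_rank -1, one GPU
# per MPI rank); the data, checkpoint, and output paths below point at the user's
# cluster home directory.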
APP="python3 run_squad_v1.py \
--train_file /public/home/hepj/data/sq1.1/train-v1.1.json \
--predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json \
--init_checkpoint /public/home/hepj/model_source/pytorch_bert/model.ckpt-28252.pt \
--vocab_file /public/home/hepj/model_source/pytorch_bert/vocab.txt \
--output_dir /public/home/hepj/outdir/torch/SQuAD \
--config_file /public/home/hepj/model_source/pytorch_bert/bert_config.json \
--json-summary ./log/results.json \
--bert_model bert-large-uncased \
--do_train \
--do_predict \
--train_batch_size 4 \
--predict_batch_size 4 \
--gpus_per_node 1 \
--local_rank -1 \
--eval_script ./evaluate-v1.1.py
"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
#echo GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
#GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac