#!/bin/bash
# SLURM batch script: multi-node DeepSpeed pretraining of a ViT model
# (Megatron-style args) on DCU accelerators (ROCm-like DTK stack).
# Submit with: sbatch <this-script>   (expects logs/ to exist for -o/-e)
#SBATCH -p wzhdtest
#SBATCH -N 2
#SBATCH --cpus-per-task=8
#SBATCH --ntasks-per-node=4
##SBATCH --mem 0
#SBATCH --gres=dcu:4
#SBATCH -J vit
#SBATCH -o logs/%x-%j.txt
#SBATCH -e logs/%x-%j.txt

# Raise per-user process limit; multi-node NCCL launches spawn many procs.
ulimit -u 200000

# NCCL / InfiniBand / ROCm communication settings.
# export NCCL_DEBUG=INFO
export NCCL_IB_HCA=mlx5
export NCCL_IB_DISABLE=0
export NCCL_SOCKET_IFNAME=ib0,ib1,ib2,ib3
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
export OMP_NUM_THREADS=1

echo "START TIME: $(date)"

# Toolchain modules (DTK 23.04 = DCU compute toolkit).
module purge
module load compiler/devtoolset/7.3.1
module load mpi/hpcx/gcc-7.3.1
module load compiler/dtk/23.04
# source /opt/dtk-23.04/env.sh
source /public/home/xxx/dtk-23.04/env.sh
module list
command -v mpirun
command -v hipcc

# Activate the conda environment holding Megatron/DeepSpeed.
source /public/home/xxx/anaconda3/bin/activate megatron
# conda activate megatron
command -v python3
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/public/home/xxx/anaconda3/envs/megatron/lib"
#source activate megatron
export PYTHON=python3
export NPROC_PER_NODE=4

# rm -f ./hostfile/*
rm -f core.*

# Ensure the hostfile directory exists (mkdir -p is idempotent/race-free).
dir="./hostfile"
if [ ! -d "$dir" ]; then
  mkdir -p "$dir"
  echo "$dir created successfully"
else
  echo "$dir already existed"
fi

# NOTE(review): per the original (Chinese) comment, sbatch submission is
# currently misconfigured on this cluster; the workaround was to pin nodes
# with a long sleep to claim slots, then comment out this hostfile section
# and resubmit via sbatch.
hostfile=./hostfile/$SLURM_JOB_ID
scontrol show hostnames "$SLURM_JOB_NODELIST" > "$hostfile"
# Build the DeepSpeed hostfile: one "<host> slots=<gpus-per-node>" line each.
while IFS= read -r host; do
  echo "$host"
  echo "$host slots=4" >> "$(pwd)/hostfile/hostfile-dl-$SLURM_JOB_ID"
done < "$hostfile"
# Total world size = unique nodes * 4 DCUs per node.
np=$(sort -u "$hostfile" | wc -l)
np=$((np * 4))
# First node acts as the rendezvous/master address.
nodename=$(sed -n '1p' "$hostfile")
dist_url=$(echo "$nodename" | awk '{print $1}')
# sleep 10d

# Ensure the checkpoint directory exists.
dir="./checkpoint"
if [ ! -d "$dir" ]; then
  mkdir -p "$dir"
  echo "$dir created successfully"
else
  echo "$dir already existed"
fi

DATA_PATH="./data"
CHECKPOINT_PATH="./checkpoint"
DS_CONFIG="./examples/ds_config.json"
MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=8

# Launch distributed training. The last real argument carries no trailing
# backslash: a '\' before a commented-out line silently terminates the
# command at the comment and breaks if a flag is later un-commented.
deepspeed --hostfile="hostfile/hostfile-dl-$SLURM_JOB_ID" pretrain_vit.py \
  --num-layers 24 \
  --hidden-size 1024 \
  --num-attention-heads 16 \
  --micro-batch-size "${MICRO_BATCH_SIZE}" \
  --global-batch-size "${GLOBAL_BATCH_SIZE}" \
  --seq-length 1024 \
  --max-position-embeddings 1024 \
  --train-iters 500000 \
  --lr-decay-iters 320000 \
  --save "$CHECKPOINT_PATH" \
  --load "$CHECKPOINT_PATH" \
  --data-path "$DATA_PATH" \
  --data-impl mmap \
  --split 949,50,1 \
  --distributed-backend nccl \
  --lr 0.00015 \
  --min-lr 1.0e-5 \
  --lr-decay-style cosine \
  --weight-decay 1e-2 \
  --clip-grad 1.0 \
  --lr-warmup-fraction .01 \
  --checkpoint-activations \
  --log-interval 100 \
  --save-interval 10000 \
  --eval-interval 1000 \
  --eval-iters 10 \
  --fp16 \
  --padded_vocab_size 224 \
  --deepspeed \
  --deepspeed_config "$DS_CONFIG"
# Optional flags — append above (with continuation backslashes) if needed:
# --eval-only True
# --do_test True