vit_dsp.sh 2.99 KB
Newer Older
huaerkl's avatar
v1.0  
huaerkl committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/bin/bash
# Slurm batch script that launches pretrain_vit.py through the DeepSpeed
# launcher on the nodes allocated to this job.
#
# Resource request (read by sbatch; the directives must stay ahead of the
# first executable command):
#   -p wzhdtest            partition to submit to
#   -N 2                   number of nodes
#   --cpus-per-task=8      CPU cores per task
#   --ntasks-per-node=4    tasks per node
#   --gres=dcu:4           four "dcu" generic resources per node
#   -J vit                 job name
#   -o / -e                stdout and stderr, both written to
#                          logs/<job-name>-<job-id>.txt
# The "--mem 0" line below carries a double '#', so it is a disabled directive.
# NOTE(review): the logs/ directory presumably has to exist before the job is
# submitted - confirm against the site's Slurm configuration.
#SBATCH -p wzhdtest
#SBATCH -N 2
#SBATCH --cpus-per-task=8
#SBATCH --ntasks-per-node=4
##SBATCH --mem 0
#SBATCH --gres=dcu:4
#SBATCH -J vit
#SBATCH -o logs/%x-%j.txt
#SBATCH -e logs/%x-%j.txt
# Set the maximum number of user processes for this shell and its children.
ulimit -u 200000

# Runtime environment for the training processes. Everything is exported so
# that the launcher's child processes inherit it.
# Uncomment for verbose communication-library logging:
# export NCCL_DEBUG=INFO
export \
  NCCL_IB_HCA=mlx5 \
  NCCL_IB_DISABLE=0 \
  NCCL_SOCKET_IFNAME=ib0,ib1,ib2,ib3 \
  HSA_FORCE_FINE_GRAIN_PCIE=1 \
  MIOPEN_FIND_MODE=3 \
  OMP_NUM_THREADS=1

# Record when the job started in the job log.
printf 'START TIME: %s\n' "$(date)"

# Start from a clean module environment, then load the toolchain modules in
# this exact order.
module purge
for mod in compiler/devtoolset/7.3.1 mpi/hpcx/gcc-7.3.1 compiler/dtk/23.04; do
  module load "$mod"
done
# Alternative system-wide DTK location:
# source /opt/dtk-23.04/env.sh
source /public/home/xxx/dtk-23.04/env.sh

# Log the loaded modules and which launcher/compiler binaries were picked up.
module list
for tool in mpirun hipcc; do
  which "$tool"
done


# Activate the "megatron" conda environment and log which python3 is in use.
source /public/home/xxx/anaconda3/bin/activate megatron
# conda activate megatron
which python3

# Append the environment's lib directory to the loader search path.
# ${VAR:+...} adds the ':' separator only when LD_LIBRARY_PATH already has a
# value; a leading ':' would create an empty entry, which the dynamic loader
# treats as the current working directory.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/public/home/xxx/anaconda3/envs/megatron/lib"
#source activate megatron
export PYTHON=python3
export NPROC_PER_NODE=4

# rm -f ./hostfile/*
# Remove any core.* files from the working directory.
rm -f core.*

# Directory that holds the per-job host lists. The success message is printed
# only when mkdir actually succeeded; without this directory the host lists
# cannot be written, so a failure aborts the job.
dir="./hostfile"
if [[ -d "$dir" ]]; then
  echo "$dir already existed"
elif mkdir -- "$dir"; then
  echo "$dir created successfully"
else
  echo "failed to create $dir" >&2
  exit 1
fi


# --- hostfile section (begin) ---
# Submitting through sbatch currently has environment configuration problems.
# Workaround: first hold the nodes with a sleep (see "sleep 10d" below) to
# reserve the slots, then comment out this hostfile section and submit the
# job to the cluster with sbatch.
hostfile="./hostfile/${SLURM_JOB_ID}"
deepspeed_hostfile="$(pwd)/hostfile/hostfile-dl-${SLURM_JOB_ID}"
slots_per_node=4

# Expand the allocation's node list into one hostname per line. The node list
# is quoted because a compressed list such as "node[01-02]" is also a glob.
scontrol show hostnames "$SLURM_JOB_NODELIST" > "$hostfile"

# Host list in "<host> slots=<n>" form for the deepspeed launcher. The file is
# rewritten (not appended to), so running this section again under the same
# job ID does not duplicate host entries.
while IFS= read -r host || [[ -n "$host" ]]; do
  [[ -n "$host" ]] || continue
  printf '%s slots=%s\n' "$host" "$slots_per_node"
done < "$hostfile" > "$deepspeed_hostfile"

# Total process count: distinct hosts times slots per host.
np=$(sort -u "$hostfile" | wc -l)
np=$(( np * slots_per_node ))

# First allocated host; dist_url is its first whitespace-separated field.
nodename=$(sed -n '1p' "$hostfile")
dist_url=$(awk '{print $1}' <<< "$nodename")

# sleep 10d
# --- hostfile section (end); see the sbatch workaround note at its start ---


# Input/output locations, relative to the directory the job runs in.
DATA_PATH="./data"
CHECKPOINT_PATH="./checkpoint"
DS_CONFIG="./examples/ds_config.json"

# Batch sizes handed to --micro-batch-size / --global-batch-size.
MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=8

# Make sure the checkpoint directory exists. It is derived from
# CHECKPOINT_PATH so the path is defined in one place only, and the success
# message is printed only when mkdir actually succeeded.
dir="$CHECKPOINT_PATH"
if [[ -d "$dir" ]]; then
  echo "$dir already existed"
elif mkdir -- "$dir"; then
  echo "$dir created successfully"
else
  echo "failed to create $dir" >&2
  exit 1
fi


# Arguments passed to pretrain_vit.py. They are kept in an array so that each
# flag sits on its own line without line-continuation backslashes (a stray or
# missing backslash can no longer merge or drop arguments) and so that
# optional flags can be commented in or out in place.
train_args=(
  --num-layers 24
  --hidden-size 1024
  --num-attention-heads 16
  --micro-batch-size "${MICRO_BATCH_SIZE}"
  --global-batch-size "${GLOBAL_BATCH_SIZE}"
  --seq-length 1024
  --max-position-embeddings 1024
  --train-iters 500000
  --lr-decay-iters 320000
  --save "$CHECKPOINT_PATH"
  --load "$CHECKPOINT_PATH"
  --data-path "$DATA_PATH"
  --data-impl mmap
  --split 949,50,1
  --distributed-backend nccl
  --lr 0.00015
  --min-lr 1.0e-5
  --lr-decay-style cosine
  --weight-decay 1e-2
  --clip-grad 1.0
  --lr-warmup-fraction .01
  --checkpoint-activations
  --log-interval 100
  --save-interval 10000
  --eval-interval 1000
  --eval-iters 10
  --fp16
  --padded_vocab_size 224
  --deepspeed
  --deepspeed_config "$DS_CONFIG"
  # Optional flags, disabled by default:
  # --eval-only True
  # --do_test True
)

# Launch on the hosts listed in this job's DeepSpeed hostfile. This stays the
# last command so the script's exit status is the launcher's exit status.
deepspeed --hostfile="hostfile/hostfile-dl-${SLURM_JOB_ID}" pretrain_vit.py "${train_args[@]}"