datavec_mpi.sh 1.63 KB
Newer Older
huaerkl's avatar
v1.0  
huaerkl committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
#!/bin/bash
#SBATCH -p wzhdexclu10
#SBATCH -N 2
#SBATCH --cpus-per-task=8
#SBATCH --ntasks-per-node=4
#SBATCH --mem 50G
#SBATCH --gres=dcu:4
#SBATCH -J data2vec
#SBATCH -o logs/%x-%j.txt
#SBATCH -e logs/%x-%j.txt
ulimit -u 200000

# export NCCL_DEBUG=INFO
export NCCL_IB_HCA=mlx5
export NCCL_IB_DISABLE=0
export NCCL_SOCKET_IFNAME=ib0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
export OMP_NUM_THREADS=1
echo "START TIME: $(date)"

module purge
module load compiler/devtoolset/7.3.1
module load mpi/hpcx/gcc-7.3.1
module load compiler/dtk/23.04
# source /opt/dtk-23.04/env.sh
huaerkl's avatar
huaerkl committed
27
source /public/home/xxx/dtk-23.04/env.sh
huaerkl's avatar
v1.0  
huaerkl committed
28
29
30
31
module list 
which mpirun

# load env
huaerkl's avatar
huaerkl committed
32
source /public/home/xxx/anaconda3/bin/activate fairseq
huaerkl's avatar
v1.0  
huaerkl committed
33
34
# conda activate fairseq
which python3
huaerkl's avatar
huaerkl committed
35
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/public/home/xxx/anaconda3/envs/fairseq/lib
huaerkl's avatar
v1.0  
huaerkl committed
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#source activate fairseq
export PYTHON=python3
export NPROC_PER_NODE=4

rm -f ./hostfile/*
rm -f core.*

dir="./hostfile"
if [ ! -d "$dir" ];then
mkdir $dir
echo "$dir created successfully"
else
echo "$dir already existed"
fi

dir="./checkpoints"
if [ ! -d "$dir" ];then
mkdir $dir
echo "$dir created successfully"
else
echo "$dir already existed"
fi

hostfile=./hostfile/$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
for i in `cat $hostfile`
do
    echo ${i} slots=4 >> `pwd`/hostfile/hostfile-dl-$SLURM_JOB_ID
done
np=$(cat $hostfile|sort|uniq |wc -l)
np=$(($np*4))
nodename=$(cat $hostfile |sed -n "1p")
dist_url=`echo $nodename | awk '{print $1}'`
mpirun -np $np --allow-run-as-root --hostfile hostfile/hostfile-dl-$SLURM_JOB_ID --bind-to none `pwd`/pretrain_datavec_mpi.sh $dist_url