#!/bin/bash
#SBATCH -p wzhdexclu10
#SBATCH -N 2
#SBATCH --cpus-per-task=8
#SBATCH --ntasks-per-node=4
#SBATCH --mem 50G
#SBATCH --gres=dcu:4
#SBATCH -J data2vec
#SBATCH -o logs/%x-%j.txt
#SBATCH -e logs/%x-%j.txt
#
# Slurm batch script that launches fairseq's hydra_train.py (data2vec config)
# through srun across the nodes allocated to this job.
#
# Steps:
#   1. Export communication/runtime environment variables and load modules.
#   2. Activate the "fairseq" conda environment.
#   3. Write per-job host lists under ./hostfile/.
#   4. Derive the world size and rendezvous host from the node list.
#   5. Start training with srun.
#
# Reads from the environment: SLURM_JOB_ID, SLURM_JOB_NODELIST.
# NOTE: the #SBATCH directives above must stay directly below the shebang, one
# per line, before any executable command.
# NOTE(review): the logs/ directory named in -o/-e is not created by this
# script -- presumably it must exist at submission time; confirm.

ulimit -u 200000

# export NCCL_DEBUG=INFO
export NCCL_IB_HCA=mlx5
export NCCL_IB_DISABLE=0
export NCCL_SOCKET_IFNAME=ib0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
export OMP_NUM_THREADS=1

echo "START TIME: $(date)"

module purge
module load compiler/devtoolset/7.3.1
module load mpi/hpcx/gcc-7.3.1
module load compiler/dtk/23.04
# source /opt/dtk-23.04/env.sh
source /public/home/chenzk/dtk-23.04/env.sh
module list
# Log which mpirun was picked up after the module loads.
command -v mpirun || echo "mpirun not found in PATH" >&2

# load env
source /public/home/chenzk/anaconda3/bin/activate fairseq
# conda activate fairseq
# Log which interpreter is active after activating the environment.
command -v python3 || echo "python3 not found in PATH" >&2

# Append the conda env's lib directory. The ${VAR:+...} form avoids a leading
# ':' (an empty search-path entry) when LD_LIBRARY_PATH is unset or empty.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/public/home/chenzk/anaconda3/envs/fairseq/lib"
#source activate fairseq

export PYTHON=python3
# Processes per node. Used below for the hostfile slot count and the world
# size; keep in sync with "#SBATCH --ntasks-per-node" above.
export NPROC_PER_NODE=4

# Remove host lists left by earlier jobs and any stray core dumps.
rm -f ./hostfile/*
rm -f core.*

# Make sure the working directories exist. "dir" is left set to the last
# entry ("./checkpoints"), exactly as before.
# sleep 1d
for dir in "./hostfile" "./checkpoints"; do
  if [[ ! -d "$dir" ]]; then
    if mkdir -p -- "$dir"; then
      echo "$dir created successfully"
    else
      echo "failed to create $dir" >&2
    fi
  else
    echo "$dir already existed"
  fi
done

# One hostname per line for the nodes allocated to this job.
hostfile="./hostfile/${SLURM_JOB_ID}"
scontrol show hostnames "$SLURM_JOB_NODELIST" > "$hostfile"

# Companion host list in "<host> slots=<n>" form, one line per node.
mpi_hostfile="$(pwd)/hostfile/hostfile-dl-${SLURM_JOB_ID}"
while IFS= read -r host; do
  [[ -n "$host" ]] || continue
  echo "${host} slots=${NPROC_PER_NODE}"
done < "$hostfile" >> "$mpi_hostfile"

# World size = number of distinct nodes * processes per node.
num_nodes=$(sort -u "$hostfile" | wc -l)
np=$(( num_nodes * NPROC_PER_NODE ))

# The first node in the list is used as the rendezvous host.
nodename=$(sed -n "1p" "$hostfile")
dist_url=$(printf '%s\n' "$nodename" | awk '{print $1}')

CONFIG_DIR="examples/data2vec/config/v2"
# CONFIG_DIR="examples/data2vec/config/vision/finetuning"
CONFIG_NAME="base_images_only_task"
# CONFIG_NAME="mae_imagenet_clean"
DATA_PATH="$(pwd)/data"
CACHE_PATH="$(pwd)/scratch/cache_abaevski/imagenet"
# NOTE(review): CHECKPOINT_PATH is not referenced in the visible part of this
# script -- presumably consumed by a later override; confirm.
CHECKPOINT_PATH="$(pwd)/checkpoints/checkpoint_last.pt"

# The command below deliberately ends with a line continuation so that any
# further overrides following it in the file stay part of the same command.
srun python3 fairseq_cli/hydra_train.py -m \
  task.data="$DATA_PATH" \
  task.local_cache_path="$CACHE_PATH" \
  distributed_training.distributed_init_method="tcp://${dist_url}:34566" \
  distributed_training.distributed_world_size="$np" \
  --config-dir "$CONFIG_DIR" \
  --config-name "$CONFIG_NAME" \