Commit eed209e7 authored by zhaoying1
Browse files

调整为标准格式

parent 4fae534d
...@@ -118,7 +118,7 @@ Hugging Face模型下载地址: ...@@ -118,7 +118,7 @@ Hugging Face模型下载地址:
#### 集群训练 #### 集群训练
``` ```
cd ptuning/slurm_scripts cd ptuning/slurm_scripts
bash run.sh bash run_train.sh
``` ```
注意:请根据自己的需求配置其中的模型路径、数据集路径、batchsize、学习率等参数; 注意:请根据自己的需求配置其中的模型路径、数据集路径、batchsize、学习率等参数;
......
#!/bin/bash
# Slurm batch header for the multi-node training job. sbatch reads the
# SBATCH directive lines below as submission options:
#   -p kshdnormal01        partition (queue) to submit to
#   -N 4                   number of nodes
#   --cpus-per-task=1      CPU cores per task
#   --ntasks-per-node=32   tasks per node
#   --mem 100G             memory per node
#   --gres=dcu:4           generic resource "dcu", 4 per node
#   -J chatglm             job name
#   -o / -e                stdout / stderr files; %j expands to the job ID
# NOTE(review): the logs/ directory presumably has to exist before the job
# starts, since both output paths point into it — confirm.
#SBATCH -p kshdnormal01
#SBATCH -N 4
#SBATCH --cpus-per-task=1
#SBATCH --ntasks-per-node=32
#SBATCH --mem 100G
#SBATCH --gres=dcu:4
#SBATCH -J chatglm
#SBATCH -o logs/pt-%j.out
#SBATCH -e logs/pt-%j.err
ulimit -u 200000 ulimit -u 200000
export OMP_NUM_THREADS=1 export OMP_NUM_THREADS=1
...@@ -19,23 +9,17 @@ export NCCL_PLUGIN_P2P=ucx ...@@ -19,23 +9,17 @@ export NCCL_PLUGIN_P2P=ucx
export NCCL_SOCKET_IFNAME=ib0 export NCCL_SOCKET_IFNAME=ib0
export NCCL_P2P_LEVEL=5 export NCCL_P2P_LEVEL=5
export NCCL_NET_PLUGIN=none export NCCL_NET_PLUGIN=none
unset RCCL_NCHANNELS
unset NCCL_NET_GDR_LEVEL
rm -rf ./hostfile/*
echo "START TIME: $(date)"
hostfile=./hostfile/$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
for i in `cat $hostfile` echo "START TIME: $(date)"
do hostfile=./hostfile
echo ${i} slots=4 >> `pwd`/hostfile/hostfile-dl-$SLURM_JOB_ID
done
np=$(cat $hostfile|sort|uniq |wc -l) np=$(cat $hostfile|sort|uniq |wc -l)
np=$(($np*4)) np=$(($np*8))
nodename=$(cat $hostfile |sed -n "1p") nodename=$(cat $hostfile |sed -n "1p")
dist_url=`echo $nodename | awk '{print $1}'` dist_url=`echo $nodename | awk '{print $1}'`
echo ${dist_url} which mpirun
mpirun -np $np --hostfile hostfile/hostfile-dl-$SLURM_JOB_ID --bind-to none `pwd`/run_train_single.sh $dist_url mpirun -np $np --allow-run-as-root --hostfile hostfile --bind-to none --mca btl_tcp_if_include $dist_url run_train_single.sh
echo "END TIME: $(date)"
...@@ -4,15 +4,12 @@ export HSA_FORCE_FINE_GRAIN_PCIE=1 ...@@ -4,15 +4,12 @@ export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3 export MIOPEN_FIND_MODE=3
export MIOPEN_COMPILE_PARALLEL_LEVEL=1 export MIOPEN_COMPILE_PARALLEL_LEVEL=1
export NCCL_PLUGIN_P2P=ucx export NCCL_PLUGIN_P2P=ucx
export RCCL_NCHANNELS=2
export NCCL_SOCKET_IFNAME=ib0 export NCCL_SOCKET_IFNAME=ib0
export NCCL_P2P_LEVEL=5 export NCCL_P2P_LEVEL=5
export NCCL_IB_HCA=mlx5_0 export NCCL_IB_HCA=mlx5_0
export NCCL_DEBUG=INFO export NCCL_DEBUG=INFO
export NCCL_NET_GDR_LEVEL=SYS
export NCCL_NET_PLUGIN=none export NCCL_NET_PLUGIN=none
unset RCCL_NCHANNELS
unset NCCL_NET_GDR_LEVEL
lrank=$OMPI_COMM_WORLD_LOCAL_RANK lrank=$OMPI_COMM_WORLD_LOCAL_RANK
echo "LRANK===============================$lrank" echo "LRANK===============================$lrank"
RANK=$OMPI_COMM_WORLD_RANK RANK=$OMPI_COMM_WORLD_RANK
...@@ -42,29 +39,55 @@ APP="python3 ../main.py \ ...@@ -42,29 +39,55 @@ APP="python3 ../main.py \
--fp16 \ --fp16 \
--local_rank $lrank " --local_rank $lrank "
case ${lrank} in case ${lrank} in
[0]) [0])
export HIP_VISIBLE_DEVICES=0,1,2,3 export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_0:1 export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs export UCX_IB_PCI_BW=mlx5_0:50Gbs
numactl --cpunodebind=0 --membind=0 ${APP} numactl --cpunodebind=0 --membind=0 ${APP}
;; ;;
[1]) [1])
export HIP_VISIBLE_DEVICES=0,1,2,3 export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_1:1 export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs export UCX_IB_PCI_BW=mlx5_1:50Gbs
numactl --cpunodebind=1 --membind=1 ${APP} numactl --cpunodebind=0 --membind=0 ${APP}
;; ;;
[2]) [2])
export HIP_VISIBLE_DEVICES=0,1,2,3 export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_2:1 export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs export UCX_IB_PCI_BW=mlx5_2:50Gbs
numactl --cpunodebind=2 --membind=2 ${APP} numactl --cpunodebind=0 --membind=0 ${APP}
;; ;;
[3]) [3])
export HIP_VISIBLE_DEVICES=0,1,2,3 export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_3:1 export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs export UCX_IB_PCI_BW=mlx5_3:50Gbs
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[4])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_4:1
export UCX_IB_PCI_BW=mlx5_4:50Gbs
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[5])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_5:1
export UCX_IB_PCI_BW=mlx5_5:50Gbs
numactl --cpunodebind=3 --membind=3 ${APP} numactl --cpunodebind=3 --membind=3 ${APP}
;; ;;
[6])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_6:1
export UCX_IB_PCI_BW=mlx5_6:50Gbs
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[7])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_7:1
export UCX_IB_PCI_BW=mlx5_7:50Gbs
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac esac
#!/bin/bash
# Create the working directories (logs, pt_output, hostfile) in the current
# directory, then submit run_train.sh to Slurm with sbatch.

# Abort before submitting if any directory cannot be created; otherwise the
# job would be queued into a layout it cannot write to.
mkdir -p logs pt_output hostfile || exit 1
# Cleanup of old logs is intentionally left disabled.
#rm -rf log/*
sbatch run_train.sh
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment