Commit 1af723f8 authored by zhaoying1's avatar zhaoying1
Browse files

update multinode run

parent 9e75f8e2
#!/bin/bash
export ROCM_PATH=/opt/dtk-23.04
export ROCM_SOURCE_DIR=${ROCM_PATH}
echo $ROCM_PATH
export HIP_PATH=${ROCM_PATH}/hip
export AMDGPU_TARGETS="gfx900;gfx906"
export PATH=${ROCM_PATH}/bin:${ROCM_PATH}/llvm/bin:${ROCM_PATH}/hcc/bin:${ROCM_PATH}/hip/bin:$PATH
export LD_LIBRARY_PATH=${ROCM_PATH}/lib:${ROCM_PATH}/lib64:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=${ROCM_PATH}/hip/lib:${ROCM_PATH}/llvm/lib:${ROCM_PATH}/opencl/lib/x86_64:$LD_LIBRARY_PATH
export C_INCLUDE_PATH=${ROCM_PATH}/include:${ROCM_PATH}/hip/include/hip:${ROCM_PATH}/llvm/include:/opencl/include:${ROCM_PATH}/include/rocrand:${ROCM_PATH}/include/hiprand
export CPLUS_INCLUDE_PATH=${ROCM_PATH}/include:${ROCM_PATH}/hip/include/hip:${ROCM_PATH}/llvm/include:/opencl/include:${ROCM_PATH}/include/rocrand:${ROCM_PATH}/include/hiprand
export PATH=${ROCM_PATH}/miopen/bin:${ROCM_PATH}/rocblas/bin:${ROCM_PATH}/hipsparse/bin:$PATH
export LD_LIBRARY_PATH=${ROCM_PATH}/miopen/lib:${ROCM_PATH}/rocblas/lib:$LD_LIBRARY_PATH
export MIOPEN_SYSTEM_DB_PATH=${ROCM_PATH}/miopen/share/miopen/db/
export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/lib64:$LIBRARY_PATH
export RCCL_PATH=$ROCM_PATH/rccl
export NCCL_PATH=$ROCM_PATH/rccl
export LD_LIBRARY_PATH=$RCCL_PATH/lib:$LD_LIBRARY_PATH
export MIOPEN_FIND_MODE=3
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_P2P_LEVEL=5
export NCCL_GDR_FLUSH_DISABLE=1
export NCCL_NET_GDR_LEVEL=SYS
export RCCL_NCHANNELS=2
export NCCL_IB_HCA=mlx5
export NCCL_SOCKET_IFNAME=ib0
export NCCL_DEBUG=INFO
export MIOPEN_FIND_MODE=3
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_COMPILE_PARALLEL_LEVEL=1
export NCCL_PLUGIN_P2P=ucx
export HIP_CLANG_PATH=/opt/dtk-23.04/llvm/bin
export HSA_PATH=/opt/dtk-23.04/hsa
export AOMP=/opt/dtk-23.04/llvm
export LD_LIBRARY_PATH=/opt/dtk-23.04/rccl/lib:/usr/lib64:/opt/dtk-23.04/miopen/lib:/opt/dtk-23.04/rocblas/lib:/opt/dtk-23.04/hip/lib:/opt/dtk-23.04/llvm/lib:/opt/dtk-23.04/opencl/lib/x86_64:/opt/dtk-23.04/lib:/opt/dtk-23.04/lib64:/opt/dtk-23.04/rccl/lib:/usr/lib64:/opt/dtk-23.04/miopen/lib:/opt/dtk-23.04/rocblas/lib:/opt/dtk-23.04/hip/lib:/opt/dtk-23.04/llvm/lib:/opt/dtk-23.04/opencl/lib/x86_64:/opt/dtk-23.04/lib:/opt/dtk-23.04/lib64:/opt/dtk-23.04/roctracer/lib:/opt/dtk-23.04/rocthrust/lib:/opt/dtk-23.04/rocsparse/lib:/opt/dtk-23.04/rocsolver/lib:/opt/dtk-23.04/rocrand/lib:/opt/dtk-23.04/rocprofiler/lib:/opt/dtk-23.04/rocprim/lib:/opt/dtk-23.04/dtk-23.04_smi/lib:/opt/dtk-23.04/rocfft/lib:/opt/dtk-23.04/rocblas/lib:/opt/dtk-23.04/rocalution/lib:/opt/dtk-23.04/rccl/lib:/opt/dtk-23.04/opencl/lib:/opt/dtk-23.04/oam/lib:/opt/dtk-23.04/migraphx/lib:/opt/dtk-23.04/miopengemm/lib:/opt/dtk-23.04/miopen/lib:/opt/dtk-23.04/llvm/lib-debug/src/openmp/libomptarget/plugins/remote/lib:/opt/dtk-23.04/llvm/lib/clang/14.0.0/lib:/opt/dtk-23.04/llvm/lib:/opt/dtk-23.04/hsa/lib:/opt/dtk-23.04/hipsparse/lib:/opt/dtk-23.04/hipsolver/lib:/opt/dtk-23.04/hiprand/lib:/opt/dtk-23.04/hipfft/lib:/opt/dtk-23.04/hipcub/lib:/opt/dtk-23.04/hipblas-clients/lib:/opt/dtk-23.04/hipblas/lib:/opt/dtk-23.04/hip/lib:/opt/dtk-23.04/lib:/opt/dtk-23.04/lib64:/opt/mpi/lib:/usr/local/lib/:/usr/local/lib64/:/usr/lib64/
export PATH=/opt/dtk-23.04/miopen/bin:/opt/dtk-23.04/rocblas/bin:/opt/dtk-23.04/hipsparse/bin:/opt/dtk-23.04/bin:/opt/dtk-23.04/llvm/bin:/opt/dtk-23.04/hcc/bin:/opt/dtk-23.04/hip/bin:/opt/dtk-23.04/miopen/bin:/opt/dtk-23.04/rocblas/bin:/opt/dtk-23.04/hipsparse/bin:/opt/dtk-23.04/bin:/opt/dtk-23.04/llvm/bin:/opt/dtk-23.04/hcc/bin:/opt/dtk-23.04/hip/bin:/opt/dtk-23.04/libexec/rocprofiler:/opt/dtk-23.04/libexec/dtk-23.04_smi:/opt/dtk-23.04/rocprofiler/bin:/opt/dtk-23.04/opencl/bin:/opt/dtk-23.04/miopen/bin:/opt/dtk-23.04/llvm/lib/clang/14.0.0/bin:/opt/dtk-23.04/llvm/bin:/opt/dtk-23.04/hip/bin:/opt/dtk-23.04/bin:/opt/mpi/bin:/root/anaconda3/bin:/root/anaconda3/condabin:/usr/lib64/qt-3.3/bin:/root/perl5/bin:/opt/dtk-23.04/bin:/opt/dtk-23.04/hip/bin:/opt/dtk-23.04/llvm/bin:/opt/dtk-23.04/llvm/lib/clang/14.0.0/bin:/opt/dtk-23.04/miopen/bin:/opt/dtk-23.04/opencl/bin:/opt/dtk-23.04/rocprofiler/bin:/opt/dtk-23.04/libexec/dtk-23.04_smi:/opt/dtk-23.04/libexec/rocprofiler:/opt/rh/devtoolset-7/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/root/bin
export ROCM_ROOT=/opt/dtk-23.04
export ROCBLAS_TENSILE_LIBPATH=/opt/dtk-23.04/lib/rocblas/library
export HIP_ROCCLR_HOME=/opt/dtk-23.04/hip
export HIP_LIB_PATH=/opt/dtk-23.04/hip/lib
export DEVICE_LIB_PATH=/opt/dtk-23.04/amdgcn/bitcode
#!/bin/bash
source env.sh
GPUS=$1
string=""
......@@ -8,8 +7,14 @@ for ((i=0; i<$GPUS; i++)); do
string="$string$i,"
done
string=${string%","}
export HIP_VISIBLE_DEVICES=$string
# echo "$HIP_VISIBLE_DEVICES"
export MASTER_ADDR=${2}
export WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
export RANK=$OMPI_COMM_WORLD_RANK
local_rank=$OMPI_COMM_WORLD_LOCAL_RANK
export HSA_FORCE_FINE_GRAIN_PCIE=1
export LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
export MASTER_PORT=12365
export OMP_NUM_THREADS=1
APP="python3 ../src/train_bash.py --stage sft \
......@@ -31,46 +36,38 @@ APP="python3 ../src/train_bash.py --stage sft \
--fp16 \
--deepspeed deepspeed.json
"
local_rank=$OMPI_COMM_WORLD_LOCAL_RANK
case ${local_rank} in
[0])
export HIP_VISIBLE_DEVICES=$string
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=$string
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=$string
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=$string
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export HIP_VISIBLE_DEVICES=$string
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export HIP_VISIBLE_DEVICES=$string
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export HIP_VISIBLE_DEVICES=$string
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export HIP_VISIBLE_DEVICES=$string
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
source env.sh
echo "START TIME: $(date)"
hostfile=./hostfile
np=$(cat $hostfile|sort|uniq |wc -l)
np=$(($np*8))
nodename=$(cat $hostfile |sed -n "1p")
dist_url=`echo $nodename | awk '{print $1}'`
which mpirun
mpirun -np $np --allow-run-as-root --hostfile hostfile --bind-to none --mca btl_tcp_if_include enp97s0f1 run-13b-sft-single.sh 8
mpirun -np $np --allow-run-as-root --hostfile ./hostfile1 -mca plm_rsh_args "-p 2345" -mca btl ^openib run-13b-sft-single.sh 8 $dist_url
echo "END TIME: $(date)"
......
#!/bin/bash
source env.sh
GPUS=$1
string=""
......@@ -8,8 +7,14 @@ for ((i=0; i<$GPUS; i++)); do
string="$string$i,"
done
string=${string%","}
export HIP_VISIBLE_DEVICES=$string
# echo "$HIP_VISIBLE_DEVICES"\
export MASTER_ADDR=${2}
export WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
export RANK=$OMPI_COMM_WORLD_RANK
local_rank=$OMPI_COMM_WORLD_LOCAL_RANK
export HSA_FORCE_FINE_GRAIN_PCIE=1
export LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
export MASTER_PORT=12365
export OMP_NUM_THREADS=1
......@@ -40,48 +45,40 @@ APP="python3 ../src/train_bash.py --stage sft \
--fp16 \
--deepspeed deepspeed.json
"
local_rank=$OMPI_COMM_WORLD_LOCAL_RANK
echo $local_rank
case ${local_rank} in
[0])
export HIP_VISIBLE_DEVICES=$string
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=$string
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=$string
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=$string
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export HIP_VISIBLE_DEVICES=$string
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export HIP_VISIBLE_DEVICES=$string
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export HIP_VISIBLE_DEVICES=$string
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export HIP_VISIBLE_DEVICES=$string
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
source env.sh
echo "START TIME: $(date)"
hostfile=./hostfile
np=$(cat $hostfile|sort|uniq |wc -l)
np=$(($np*8))
nodename=$(cat $hostfile |sed -n "1p")
dist_url=`echo $nodename | awk '{print $1}'`
which mpirun
mpirun -np $np --allow-run-as-root --hostfile hostfile --bind-to none --mca btl_tcp_if_include enp97s0f1 `pwd`/run-7b-single-lora.sh 8
mpirun -np $np --allow-run-as-root --hostfile ./hostfile1 -mca plm_rsh_args "-p 2345" -mca btl ^openib run-7b-single-lora.sh 8 $dist_url
echo "END TIME: $(date)"
......@@ -5,17 +5,17 @@ export MIOPEN_FIND_MODE=3
export MIOPEN_COMPILE_PARALLEL_LEVEL=1
export NCCL_PLUGIN_P2P=ucx
export RCCL_NCHANNELS=2
export NCCL_SOCKET_IFNAME=ib0
export NCCL_P2P_LEVEL=5
export NCCL_IB_HCA=mlx5_0 #0号网卡
export MASTER_ADDR=${1}
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
echo "LRANK===============================$lrank"
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
export LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
export RANK=$OMPI_COMM_WORLD_RANK
export WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
export MASTER_PORT=12365
export NCCL_IB_HCA=mlx5_0 #0号网卡
APP="python3 ../src/train_bash.py --stage sft \
......
......@@ -8,19 +8,6 @@
#SBATCH -o logs-13B/baichuan-ft-%j.out
#SBATCH -e logs-13B/baichuan-ft-%j.err
ulimit -u 200000
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=ALL
export MIOPEN_FIND_MODE=3
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_COMPILE_PARALLEL_LEVEL=1
export NCCL_PLUGIN_P2P=ucx
export NCCL_SOCKET_IFNAME=ib0
export NCCL_P2P_LEVEL=5
echo "START TIME: $(date)"
hostfile=./hostfile/$SLURM_JOB_ID
......@@ -35,5 +22,5 @@ np=$(($np*4))
nodename=$(cat $hostfile |sed -n "1p")
dist_url=`echo $nodename | awk '{print $1}'`
mpirun -np $np --allow-run-as-root --hostfile hostfile/hostfile-dl-$SLURM_JOB_ID --bind-to none `pwd`/run-13b-sft-single.sh
mpirun -np $np --allow-run-as-root --hostfile hostfile/hostfile-dl-$SLURM_JOB_ID --bind-to none `pwd`/run-13b-sft-single.sh $dist_url
#!/bin/bash
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
export GPU_MAX_HW_QUEUES=16
export MIOPEN_COMPILE_PARALLEL_LEVEL=1
export NCCL_PLUGIN_P2P=ucx
export RCCL_NCHANNELS=2
export NCCL_SOCKET_IFNAME=ib0
export NCCL_P2P_LEVEL=5
export NCCL_IB_HCA=mlx5_0 #0号网卡
export MASTER_ADDR=${1}
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
export LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
export RANK=$comm_rank
export WORLD_SIZE=$comm_size
export MASTER_ADDR=$1
export MASTER_PORT=29500
export NCCL_IB_HCA=mlx5
export NCCL_SOCKET_IFNAME=ib0
export HIP_DIRECT_DISPATCH=0
export RANK=$OMPI_COMM_WORLD_RANK
export WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
export MASTER_PORT=12365
APP="python3 ../src/train_bash.py --stage sft \
......
......@@ -10,21 +10,6 @@
#SBATCH --exclusive
ulimit -s unlimited
export HIP_VISIBLE_DEVICES=0,1,2,3
export MIOPEN_FIND_MODE=3
export MIOPEN_DEBUG_CONV_IMPLICIT_GEMM=0
export MIOPEN_USER_DB_PATH=/tmp/miopen-udb
export MIOPEN_CUSTOM_CACHE_DIR=/tmp/miopen-cache
export NCCL_SOCKET_IFNAME=ib0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_IB_HCA=mlx5
export NCCL_DEBUG=INFO
export MIOPEN_COMPILE_PARALLEL_LEVEL=1
export NCCL_PLUGIN_P2P=ucx
export NCCL_P2P_LEVEL=5
echo "START TIME: $(date)"
......@@ -50,5 +35,5 @@ np=$(($np*4))
nodename=$(cat $hostfile |sed -n "1p") #读取每行节点 第一个是主节点
dist_url=`echo $nodename | awk '{print $1}'`
mpirun -np $np --allow-run-as-root --hostfile hostfile/hostfile-dl-$SLURM_JOB_ID --bind-to none `pwd`/run-7b-single-lora.sh
mpirun -np $np --allow-run-as-root --hostfile hostfile/hostfile-dl-$SLURM_JOB_ID --bind-to none `pwd`/run-7b-single-lora.sh $dist_url
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment