Commit 70d83fd3 authored by qianyj
Browse files

update

parent 7b47409b
#!/bin/bash
# Environment setup for the toolchain installed under /opt/dtk-23.04.
#
# The script only exports variables and prints ROCM_PATH; it runs no external
# commands. Presumably this is the env.sh that the launcher scripts `source`
# -- confirm against the repository layout.
#
# NOTE: PATH and LD_LIBRARY_PATH are *replaced* by fixed lists (not extended),
# so whatever the caller had in them is discarded. LIBRARY_PATH, by contrast,
# is prepended to.

# --- Install locations ------------------------------------------------------
export ROCM_PATH=/opt/dtk-23.04
export ROCM_SOURCE_DIR="${ROCM_PATH}"
export ROCM_ROOT="${ROCM_PATH}"
echo "$ROCM_PATH"

export HIP_PATH="${ROCM_PATH}/hip"
export HIP_ROCCLR_HOME="${ROCM_PATH}/hip"
export HIP_LIB_PATH="${ROCM_PATH}/hip/lib"
export HIP_CLANG_PATH="${ROCM_PATH}/llvm/bin"
export HSA_PATH="${ROCM_PATH}/hsa"
export AOMP="${ROCM_PATH}/llvm"
export DEVICE_LIB_PATH="${ROCM_PATH}/amdgcn/bitcode"
export ROCBLAS_TENSILE_LIBPATH="${ROCM_PATH}/lib/rocblas/library"
export MIOPEN_SYSTEM_DB_PATH="${ROCM_PATH}/miopen/share/miopen/db/"
export RCCL_PATH="${ROCM_PATH}/rccl"
export NCCL_PATH="${ROCM_PATH}/rccl"
export AMDGPU_TARGETS="gfx900;gfx906"

# --- Compiler search paths --------------------------------------------------
# Every entry lives under ROCM_PATH. The OpenCL entry used to be the bare
# "/opencl/include" (prefix missing), which pointed at the filesystem root.
C_INCLUDE_PATH="${ROCM_PATH}/include"
C_INCLUDE_PATH+=":${ROCM_PATH}/hip/include/hip"
C_INCLUDE_PATH+=":${ROCM_PATH}/llvm/include"
C_INCLUDE_PATH+=":${ROCM_PATH}/opencl/include"
C_INCLUDE_PATH+=":${ROCM_PATH}/include/rocrand"
C_INCLUDE_PATH+=":${ROCM_PATH}/include/hiprand"
export C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH="${C_INCLUDE_PATH}"

# Prepend /usr/lib64; only add the separator when there is an existing value,
# so an unset LIBRARY_PATH does not leave an empty trailing component.
export LIBRARY_PATH="/usr/lib64${LIBRARY_PATH:+:${LIBRARY_PATH}}"

# --- Runtime settings -------------------------------------------------------
export MIOPEN_FIND_MODE=3
export MIOPEN_COMPILE_PARALLEL_LEVEL=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_P2P_LEVEL=5
export NCCL_GDR_FLUSH_DISABLE=1
export NCCL_NET_GDR_LEVEL=SYS
export RCCL_NCHANNELS=2
export NCCL_IB_HCA=mlx5
export NCCL_SOCKET_IFNAME=ib0
export NCCL_DEBUG=INFO
export NCCL_PLUGIN_P2P=ucx

# --- Shared-library search path (replaces any inherited value) --------------
# Same directories, in the same first-occurrence order, as the previous
# single-line list; repeated entries have been dropped.
LD_LIBRARY_PATH="${ROCM_PATH}/rccl/lib"
LD_LIBRARY_PATH+=":/usr/lib64"
LD_LIBRARY_PATH+=":${ROCM_PATH}/miopen/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/rocblas/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/hip/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/llvm/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/opencl/lib/x86_64"
LD_LIBRARY_PATH+=":${ROCM_PATH}/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/lib64"
LD_LIBRARY_PATH+=":${ROCM_PATH}/roctracer/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/rocthrust/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/rocsparse/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/rocsolver/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/rocrand/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/rocprofiler/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/rocprim/lib"
# NOTE(review): "dtk-23.04_smi" looks like a search-and-replace artifact of
# "rocm_smi" -- confirm which directory actually exists in the install tree.
LD_LIBRARY_PATH+=":${ROCM_PATH}/dtk-23.04_smi/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/rocfft/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/rocalution/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/opencl/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/oam/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/migraphx/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/miopengemm/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/llvm/lib-debug/src/openmp/libomptarget/plugins/remote/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/llvm/lib/clang/14.0.0/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/hsa/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/hipsparse/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/hipsolver/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/hiprand/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/hipfft/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/hipcub/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/hipblas-clients/lib"
LD_LIBRARY_PATH+=":${ROCM_PATH}/hipblas/lib"
LD_LIBRARY_PATH+=":/opt/mpi/lib"
LD_LIBRARY_PATH+=":/usr/local/lib/"
LD_LIBRARY_PATH+=":/usr/local/lib64/"
export LD_LIBRARY_PATH

# --- Executable search path (replaces any inherited value) ------------------
# Same directories, in the same first-occurrence order, as the previous
# single-line list; repeated entries have been dropped. Only shell builtins
# are used while PATH is being assembled.
PATH="${ROCM_PATH}/miopen/bin"
PATH+=":${ROCM_PATH}/rocblas/bin"
PATH+=":${ROCM_PATH}/hipsparse/bin"
PATH+=":${ROCM_PATH}/bin"
PATH+=":${ROCM_PATH}/llvm/bin"
PATH+=":${ROCM_PATH}/hcc/bin"
PATH+=":${ROCM_PATH}/hip/bin"
PATH+=":${ROCM_PATH}/libexec/rocprofiler"
# NOTE(review): same "dtk-23.04_smi" naming question as above.
PATH+=":${ROCM_PATH}/libexec/dtk-23.04_smi"
PATH+=":${ROCM_PATH}/rocprofiler/bin"
PATH+=":${ROCM_PATH}/opencl/bin"
PATH+=":${ROCM_PATH}/llvm/lib/clang/14.0.0/bin"
PATH+=":/opt/mpi/bin"
PATH+=":/root/anaconda3/bin"
PATH+=":/root/anaconda3/condabin"
PATH+=":/usr/lib64/qt-3.3/bin"
PATH+=":/root/perl5/bin"
PATH+=":/opt/rh/devtoolset-7/root/usr/bin"
PATH+=":/usr/local/sbin"
PATH+=":/usr/local/bin"
PATH+=":/usr/sbin"
PATH+=":/usr/bin"
PATH+=":/root/bin"
export PATH
#!/bin/bash #!/bin/bash
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
export MIOPEN_COMPILE_PARALLEL_LEVEL=1
export NCCL_PLUGIN_P2P=ucx
export RCCL_NCHANNELS=2
export NCCL_SOCKET_IFNAME=ib0
export NCCL_P2P_LEVEL=5
source env.sh
GPUS=$1
lrank=$OMPI_COMM_WORLD_LOCAL_RANK string=""
echo "LRANK===============================$lrank" for ((i=0; i<$GPUS; i++)); do
RANK=$OMPI_COMM_WORLD_RANK string="$string$i,"
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE done
string=${string%","}
export NCCL_IB_HCA=mlx5_0 #0号网卡 export HIP_VISIBLE_DEVICES=$string
# echo "$HIP_VISIBLE_DEVICES"
APP="python3 ../src/train_bash.py --stage sft \ APP="python3 ../src/train_bash.py --stage sft \
...@@ -35,30 +31,46 @@ APP="python3 ../src/train_bash.py --stage sft \ ...@@ -35,30 +31,46 @@ APP="python3 ../src/train_bash.py --stage sft \
--fp16 \ --fp16 \
--deepspeed deepspeed.json --deepspeed deepspeed.json
" "
local_rank=$OMPI_COMM_WORLD_LOCAL_RANK
case ${lrank} in case ${local_rank} in
[0]) [0])
export HIP_VISIBLE_DEVICES=0,1,2,3 export HIP_VISIBLE_DEVICES=$string
export UCX_NET_DEVICES=mlx5_0:1 echo numactl --cpunodebind=0 --membind=0 ${APP}
export UCX_IB_PCI_BW=mlx5_0:50Gbs
numactl --cpunodebind=0 --membind=0 ${APP} numactl --cpunodebind=0 --membind=0 ${APP}
;; ;;
[1]) [1])
export HIP_VISIBLE_DEVICES=0,1,2,3 export HIP_VISIBLE_DEVICES=$string
export UCX_NET_DEVICES=mlx5_1:1 echo numactl --cpunodebind=0 --membind=0 ${APP}
export UCX_IB_PCI_BW=mlx5_1:50Gbs numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;; ;;
[2]) [2])
export HIP_VISIBLE_DEVICES=0,1,2,3 export HIP_VISIBLE_DEVICES=$string
export UCX_NET_DEVICES=mlx5_2:1 echo numactl --cpunodebind=0 --membind=0 ${APP}
export UCX_IB_PCI_BW=mlx5_2:50Gbs numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;; ;;
[3]) [3])
export HIP_VISIBLE_DEVICES=0,1,2,3 export HIP_VISIBLE_DEVICES=$string
export UCX_NET_DEVICES=mlx5_3:1 echo numactl --cpunodebind=0 --membind=0 ${APP}
export UCX_IB_PCI_BW=mlx5_3:50Gbs numactl --cpunodebind=0 --membind=0 ${APP}
;;
[4])
export HIP_VISIBLE_DEVICES=$string
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[5])
export HIP_VISIBLE_DEVICES=$string
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[6])
export HIP_VISIBLE_DEVICES=$string
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[7])
export HIP_VISIBLE_DEVICES=$string
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP} numactl --cpunodebind=3 --membind=3 ${APP}
;; ;;
esac esac
ulimit -u 200000 source env.sh
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_DEBUG=INFO
export MIOPEN_FIND_MODE=3
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_COMPILE_PARALLEL_LEVEL=1
export NCCL_PLUGIN_P2P=ucx
export NCCL_SOCKET_IFNAME=ib0
export NCCL_P2P_LEVEL=5
echo "START TIME: $(date)" echo "START TIME: $(date)"
hostfile=./hostfile hostfile=./hostfile
np=$(cat $hostfile|sort|uniq |wc -l) np=$(cat $hostfile|sort|uniq |wc -l)
np=$(($np*8)) np=$(($np*8))
which mpirun which mpirun
mpirun -np $np --allow-run-as-root --hostfile hostfile --bind-to none --mca btl_tcp_if_include enp97s0f1 mpi_single.sh 8 mpirun -np $np --allow-run-as-root --hostfile hostfile --bind-to none --mca btl_tcp_if_include enp97s0f1 mpi_single.sh 8
echo "END TIME: $(date)" echo "END TIME: $(date)"
......
#!/bin/bash #!/bin/bash
export MIOPEN_FIND_MODE=3
export GPU_MAX_HW_QUEUES=16 source env.sh
lrank=$OMPI_COMM_WORLD_LOCAL_RANK GPUS=$1
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE string=""
export LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK for ((i=0; i<$GPUS; i++)); do
export RANK=$comm_rank string="$string$i,"
export WORLD_SIZE=$comm_size done
export NCCL_IB_HCA=mlx5 string=${string%","}
export NCCL_SOCKET_IFNAME=ib0 export HIP_VISIBLE_DEVICES=$string
export HIP_DIRECT_DISPATCH=0 # echo "$HIP_VISIBLE_DEVICES"\
APP="python3 ../src/train_bash.py --stage sft \ APP="python3 ../src/train_bash.py --stage sft \
...@@ -39,30 +40,48 @@ APP="python3 ../src/train_bash.py --stage sft \ ...@@ -39,30 +40,48 @@ APP="python3 ../src/train_bash.py --stage sft \
--fp16 \ --fp16 \
--deepspeed deepspeed.json --deepspeed deepspeed.json
" "
local_rank=$OMPI_COMM_WORLD_LOCAL_RANK
case ${lrank} in echo $local_rank
case ${local_rank} in
[0]) [0])
export HIP_VISIBLE_DEVICES=0,1,2,3 export HIP_VISIBLE_DEVICES=$string
export UCX_NET_DEVICES=mlx5_0:1 echo numactl --cpunodebind=0 --membind=0 ${APP}
export UCX_IB_PCI_BW=mlx5_0:50Gbs
numactl --cpunodebind=0 --membind=0 ${APP} numactl --cpunodebind=0 --membind=0 ${APP}
;; ;;
[1]) [1])
export HIP_VISIBLE_DEVICES=0,1,2,3 export HIP_VISIBLE_DEVICES=$string
export UCX_NET_DEVICES=mlx5_1:1 echo numactl --cpunodebind=0 --membind=0 ${APP}
export UCX_IB_PCI_BW=mlx5_1:50Gbs numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;; ;;
[2]) [2])
export HIP_VISIBLE_DEVICES=0,1,2,3 export HIP_VISIBLE_DEVICES=$string
export UCX_NET_DEVICES=mlx5_2:1 echo numactl --cpunodebind=0 --membind=0 ${APP}
export UCX_IB_PCI_BW=mlx5_2:50Gbs numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;; ;;
[3]) [3])
export HIP_VISIBLE_DEVICES=0,1,2,3 export HIP_VISIBLE_DEVICES=$string
export UCX_NET_DEVICES=mlx5_3:1 echo numactl --cpunodebind=0 --membind=0 ${APP}
export UCX_IB_PCI_BW=mlx5_3:50Gbs numactl --cpunodebind=0 --membind=0 ${APP}
;;
[4])
export HIP_VISIBLE_DEVICES=$string
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[5])
export HIP_VISIBLE_DEVICES=$string
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[6])
export HIP_VISIBLE_DEVICES=$string
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[7])
export HIP_VISIBLE_DEVICES=$string
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP} numactl --cpunodebind=3 --membind=3 ${APP}
;; ;;
esac esac
ulimit -u 200000 source env.sh
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_DEBUG=INFO
export MIOPEN_FIND_MODE=3
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_COMPILE_PARALLEL_LEVEL=1
export NCCL_PLUGIN_P2P=ucx
export NCCL_SOCKET_IFNAME=ib0
export NCCL_P2P_LEVEL=5
echo "START TIME: $(date)" echo "START TIME: $(date)"
hostfile=./hostfile hostfile=./hostfile
np=$(cat $hostfile|sort|uniq |wc -l) np=$(cat $hostfile|sort|uniq |wc -l)
np=$(($np*8)) np=$(($np*8))
which mpirun which mpirun
mpirun -np $np --allow-run-as-root --hostfile hostfile --bind-to none --mca btl_tcp_if_include enp97s0f1 `pwd`/run-7b-single-lora.sh 8 mpirun -np $np --allow-run-as-root --hostfile hostfile --bind-to none --mca btl_tcp_if_include enp97s0f1 `pwd`/run-7b-single-lora.sh 8
echo "END TIME: $(date)" echo "END TIME: $(date)"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment