#!/bin/bash
# Per-rank launcher for DeepSpeed ZeRO-3 pretraining of Llama-2-7B on a ROCm cluster.
# Started once per process by Open MPI; the first script argument ($1) is the total world size.
# Each local rank is pinned to its own NUMA node and InfiniBand HCA in the case statement below.

# ROCm / MIOpen / RCCL environment tuning for this cluster.
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
export MIOPEN_COMPILE_PARALLEL_LEVEL=1
export NCCL_SOCKET_IFNAME=ib0
export NCCL_P2P_LEVEL=5
export RCCL_NCHANNELS=2
export NCCL_IB_HCA=mlx5_0

# Ranks assigned by Open MPI.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
echo "LRANK===============================$lrank"
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE

# Paths to the packed dataset, base checkpoint, and SentencePiece tokenizer.
DATASET_PATH=../data/dataset.pt
MODEL_PATH=model_scope/Llama-2-7b-chat-hf/llama-7b.bin
SPM_MODEL_PATH=model_scope/Llama-2-7b-chat-hf/tokenizer.model

# Training command; --world_size is taken from the first script argument.
APP="python3 ../pretrain.py --deepspeed --deepspeed_config ../models/deepspeed_zero3_config.json \
    --pretrained_model_path $MODEL_PATH \
    --dataset_path $DATASET_PATH --spm_model_path $SPM_MODEL_PATH \
    --config_path ../models/llama/7b_config.json \
    --output_model_path output/7b/ --deepspeed_checkpoint_activations \
    --world_size ${1} --data_processor alpaca --prefix_lm_loss \
    --total_steps 10000 --save_checkpoint_steps 500 --batch_size 2 --enable_zero3 \
    "

# Bind each local rank to its NUMA node and the matching mlx5 HCA, then run the job.
case ${lrank} in
0)
    export HIP_VISIBLE_DEVICES=0,1,2,3
    export UCX_NET_DEVICES=mlx5_0:1
    export UCX_IB_PCI_BW=mlx5_0:50Gbs
    numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
1)
    export HIP_VISIBLE_DEVICES=0,1,2,3
    export UCX_NET_DEVICES=mlx5_1:1
    export UCX_IB_PCI_BW=mlx5_1:50Gbs
    numactl --cpunodebind=1 --membind=1 ${APP}
    ;;
2)
    export HIP_VISIBLE_DEVICES=0,1,2,3
    export UCX_NET_DEVICES=mlx5_2:1
    export UCX_IB_PCI_BW=mlx5_2:50Gbs
    numactl --cpunodebind=2 --membind=2 ${APP}
    ;;
3)
    export HIP_VISIBLE_DEVICES=0,1,2,3
    export UCX_NET_DEVICES=mlx5_3:1
    export UCX_IB_PCI_BW=mlx5_3:50Gbs
    numactl --cpunodebind=3 --membind=3 ${APP}
    ;;
esac
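
# Example launch (a sketch, not part of the original script): the script name,
# hostfile, and process counts below are assumptions; adjust them to the actual
# cluster and scheduler.
#
#   mpirun -np 16 --hostfile hostfile --map-by ppr:4:node bash ./run_pretrain.sh 16
#
# Here 16 is the total world size forwarded to --world_size via ${1}, and
# ppr:4:node starts four local ranks per node to match the four NUMA-node/HCA
# bindings in the case statement above.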