#!/bin/bash
#
# Per-rank launcher for full-parameter SFT of Baichuan-13B.
#
# Usage (one instance per rank, started by an Open MPI launcher):
#   run-13b-sft-single.sh <master_addr>
#
# Arguments:
#   $1 - address exported as MASTER_ADDR. When omitted, a MASTER_ADDR already
#        present in the environment is kept.
# Environment read:
#   OMPI_COMM_WORLD_LOCAL_RANK, OMPI_COMM_WORLD_RANK, OMPI_COMM_WORLD_SIZE

# GPU runtime / collective-communication tuning.
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
export MIOPEN_COMPILE_PARALLEL_LEVEL=1
export NCCL_PLUGIN_P2P=ucx
export RCCL_NCHANNELS=2
export NCCL_SOCKET_IFNAME=ib0
export NCCL_P2P_LEVEL=5
export NCCL_IB_HCA=mlx5_0  # NIC number 0

# Rendezvous address: first argument wins, otherwise keep any inherited value.
export MASTER_ADDR="${1:-${MASTER_ADDR:-}}"
if [[ -z "${MASTER_ADDR}" ]]; then
  printf 'warning: MASTER_ADDR is empty; pass the master host as the first argument\n' >&2
fi

# Local rank on this node; selects the NUMA node / HCA pairing further down.
lrank="${OMPI_COMM_WORLD_LOCAL_RANK:-}"

# Re-export the Open MPI rank layout under the names the trainer reads.
export LOCAL_RANK="${OMPI_COMM_WORLD_LOCAL_RANK:-}"
export RANK="${OMPI_COMM_WORLD_RANK:-}"
export WORLD_SIZE="${OMPI_COMM_WORLD_SIZE:-}"
export MASTER_PORT=12365
# Training command, kept as an array so each argument reaches python3 as
# exactly one word regardless of IFS or glob characters.
# Paths are relative, so the script must be started from the directory that
# contains deepspeed.json and has ../src/train_bash.py above it.
APP=(
  python3 ../src/train_bash.py
  --stage sft
  --model_name_or_path ../../baichuan-13b-base/
  --do_train
  --template default
  --dataset alpaca_gpt4_en,alpaca_gpt4_zh,self_cognition,oaast_sft,lima
  --finetuning_type full
  --output_dir output/baichuan-13b
  --per_device_train_batch_size 1
  --gradient_accumulation_steps 1
  --preprocessing_num_workers 16
  --lr_scheduler_type cosine
  --logging_steps 10
  --save_steps 2000
  --learning_rate 1e-4
  --num_train_epochs 1.0
  --plot_loss
  --fp16
  --deepspeed deepspeed.json
)

# Local rank N (0-3) is pinned to NUMA node N and uses InfiniBand HCA
# mlx5_N port 1. Every rank is shown the same four GPUs.
# NOTE(review): presumably the trainer picks its own device from LOCAL_RANK -
# confirm.
case "${lrank:-}" in
  [0-3])
    export HIP_VISIBLE_DEVICES=0,1,2,3
    export UCX_NET_DEVICES="mlx5_${lrank}:1"
    export UCX_IB_PCI_BW="mlx5_${lrank}:50Gbs"
    numactl --cpunodebind="${lrank}" --membind="${lrank}" "${APP[@]}"
    ;;
  *)
    # Without this arm an unset or out-of-range rank would exit 0 having done
    # nothing, leaving the remaining ranks waiting for a peer that never joins.
    printf 'error: unsupported local rank "%s" (expected 0-3); is OMPI_COMM_WORLD_LOCAL_RANK set by the MPI launcher?\n' \
      "${lrank:-}" >&2
    exit 1
    ;;
esac