#!/bin/bash
#
# Per-rank environment setup for the SFT LoRA training launch.
#
# Usage: run-7b-sft-lora-single.sh <master_addr>
#
# Rank information is read from the OMPI_COMM_WORLD_* variables, so the script
# is expected to be started once per rank by an Open MPI launcher.

# ROCm / MIOpen runtime settings.
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
export MIOPEN_COMPILE_PARALLEL_LEVEL=1

# NCCL / RCCL communication settings.
export NCCL_PLUGIN_P2P=ucx
export RCCL_NCHANNELS=2
export NCCL_SOCKET_IFNAME=ib0
export NCCL_P2P_LEVEL=5
export NCCL_IB_HCA=mlx5_0  # NIC 0

# The rendezvous address is the first positional argument. An empty value is
# still exported (as before), but it is reported so that a later connection
# failure can be traced back to the missing argument.
export MASTER_ADDR="${1:-}"
if [[ -z "${MASTER_ADDR}" ]]; then
  printf 'warning: no master address argument given; MASTER_ADDR is empty\n' >&2
fi

# Local rank on this node; the dispatch further down uses it to choose the
# NUMA node and the mlx5 device.
lrank="${OMPI_COMM_WORLD_LOCAL_RANK:-}"
export LOCAL_RANK="${lrank}"
export RANK="${OMPI_COMM_WORLD_RANK:-}"
export WORLD_SIZE="${OMPI_COMM_WORLD_SIZE:-}"
export MASTER_PORT=12365
# Training command line. It is kept as an array so that every argument
# reaches python3 as exactly one word when expanded with "${APP[@]}".
# NOTE(review): the model path names baichuan-13b-base while the output
# directory names baichuan-7b — confirm which one is intended.
APP=(
  python3 ../src/train_bash.py
  --stage sft
  --model_name_or_path ../../baichuan-13b-base
  --do_train
  --template default
  --dataset alpaca_gpt4_en
  --finetuning_type lora
  --lora_rank 16
  --lora_target W_pack,o_proj,gate_proj,down_proj,up_proj
  --output_dir out/baichuan-7b-lora-test7
  --per_device_train_batch_size 1
  --per_device_eval_batch_size 1
  --gradient_accumulation_steps 1
  --preprocessing_num_workers 8
  --lr_scheduler_type cosine
  --logging_steps 10
  --save_steps 2
  --eval_steps 2
  --learning_rate 1e-4
  --max_grad_norm 0.5
  --num_train_epochs 1.0
  --val_size 0.001
  --evaluation_strategy steps
  --load_best_model_at_end
  --plot_loss
  --fp16
  --deepspeed deepspeed.json
)

# Each supported local rank is bound to the NUMA node and the mlx5 device
# that carry the same index as the rank. Every rank sees the same four GPUs.
case "${lrank}" in
  0 | 1 | 2 | 3)
    export HIP_VISIBLE_DEVICES=0,1,2,3
    export UCX_NET_DEVICES="mlx5_${lrank}:1"
    export UCX_IB_PCI_BW="mlx5_${lrank}:50Gbs"
    numactl --cpunodebind="${lrank}" --membind="${lrank}" "${APP[@]}"
    ;;
  *)
    # Without this arm an unset or out-of-range rank would start nothing and
    # still exit 0, leaving the remaining ranks waiting for a missing peer.
    printf 'error: unsupported local rank "%s" (expected 0-3; is OMPI_COMM_WORLD_LOCAL_RANK set?)\n' \
      "${lrank}" >&2
    exit 1
    ;;
esac