"...training_service/pai/paiYarn/paiYarnTrainingService.ts" did not exist on "cb15be491b991c2bdf495db44cb7f987bbbde36a"
run-7b-sft-lora-single.sh 1.83 KB
Newer Older
zhaoying1's avatar
zhaoying1 committed
1
2
#!/bin/bash
# Prepares the process environment for one rank of a multi-process training
# launch.
#
# Arguments:
#   $1 - value exported unchanged as MASTER_ADDR.
# Environment read:
#   OMPI_COMM_WORLD_LOCAL_RANK, OMPI_COMM_WORLD_RANK, OMPI_COMM_WORLD_SIZE
#   (presumably provided by the Open MPI launcher -- confirm against the
#   job submission script).

# GPU runtime settings, exported so every child process inherits them.
export MIOPEN_FIND_MODE=3
export GPU_MAX_HW_QUEUES=16

# Shell-local copies of the launcher-provided rank information.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE

# Re-export the rank information under the LOCAL_RANK / RANK / WORLD_SIZE
# names for child processes.
export LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
export RANK=$comm_rank
export WORLD_SIZE=$comm_size

# An empty MASTER_ADDR is still exported (same as before), but it is no
# longer silent: the caller gets a hint on stderr.
if [[ -z "${1:-}" ]]; then
  printf 'warning: no MASTER_ADDR argument given; MASTER_ADDR will be empty\n' >&2
fi
export MASTER_ADDR=$1
export MASTER_PORT=29500

# Communication-library settings (NCCL interface selection, HIP dispatch).
export NCCL_IB_HCA=mlx5
export NCCL_SOCKET_IFNAME=ib0
export HIP_DIRECT_DISPATCH=0
zhaoying1's avatar
zhaoying1 committed
15
16
17


# Training command line, stored as one flat string.
#
# The string is later expanded unquoted so the shell word-splits it into
# the program name and its arguments; therefore no single argument may
# contain whitespace. Each backslash-newline inside the double quotes is a
# line continuation, so the flags end up separated only by the indentation
# spaces. The final newline before the closing quote is part of the value
# and is discarded by word splitting.
#
# NOTE(review): --output_dir says "7b" while --model_name_or_path points at
# a "13b-base" checkpoint -- confirm which one is intended.
# NOTE(review): --save_steps/--eval_steps of 2 look like smoke-test values --
# confirm before a real run.
APP="python3 ../src/train_bash.py --stage sft \
    --model_name_or_path ../../baichuan-13b-base \
    --do_train \
    --template default \
    --dataset alpaca_gpt4_en \
    --finetuning_type lora \
    --lora_rank 16 \
    --lora_target W_pack,o_proj,gate_proj,down_proj,up_proj \
    --output_dir out/baichuan-7b-lora-test7 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --preprocessing_num_workers 8 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --save_steps 2 \
    --eval_steps 2 \
    --learning_rate 1e-4 \
    --max_grad_norm 0.5 \
    --num_train_epochs 1.0 \
    --val_size 0.001 \
    --evaluation_strategy steps \
    --load_best_model_at_end \
    --plot_loss \
    --fp16 \
    --deepspeed deepspeed.json
"

#######################################
# Runs the training command for one node-local rank, bound to the NUMA node
# and mlx5 network device that carry the same index as the rank.
# Globals:
#   APP (read) - flat command string, word-split on purpose.
#   HIP_VISIBLE_DEVICES, UCX_NET_DEVICES, UCX_IB_PCI_BW (exported).
# Arguments:
#   $1 - node-local rank; only 0, 1, 2 and 3 are accepted.
# Outputs:
#   An error message on stderr for any other rank value.
# Returns:
#   The exit status of numactl, or 1 when the rank is not supported.
#######################################
launch_local_rank() {
  local rank=$1

  case "${rank}" in
  0 | 1 | 2 | 3)
    # Every rank is given the same four-device visibility list.
    export HIP_VISIBLE_DEVICES=0,1,2,3
    export UCX_NET_DEVICES="mlx5_${rank}:1"
    export UCX_IB_PCI_BW="mlx5_${rank}:50Gbs"
    # APP must stay unquoted: it is a single string that relies on word
    # splitting to become the program name plus its arguments.
    # shellcheck disable=SC2086
    numactl --cpunodebind="${rank}" --membind="${rank}" ${APP}
    ;;
  *)
    # Previously an unset or out-of-range rank matched no arm and the
    # script ended silently with status 0 without starting anything.
    printf 'error: unsupported local rank "%s" (expected 0-3)\n' "${rank}" >&2
    return 1
    ;;
  esac
}

launch_local_rank "${lrank}"