#!/bin/bash
# single_ddp.sh
# MIOpen / HIP runtime settings used for the training processes.
export MIOPEN_FIND_MODE=3 GPU_MAX_HW_QUEUES=16 HIP_DIRECT_DISPATCH=0

# Per-process rank information, read from the OMPI_COMM_WORLD_* variables.
# lrank is also used later in the script to pick the per-rank launch settings.
lrank="${OMPI_COMM_WORLD_LOCAL_RANK}"
comm_rank="${OMPI_COMM_WORLD_RANK}"
comm_size="${OMPI_COMM_WORLD_SIZE}"

# Re-export the same values under the names LOCAL_RANK / RANK / WORLD_SIZE.
export LOCAL_RANK="${lrank}" RANK="${comm_rank}" WORLD_SIZE="${comm_size}"

# Rendezvous endpoint: the address is the script's first argument, the port
# is fixed.
export MASTER_ADDR="$1" MASTER_PORT=29500

# export NCCL_DEBUG=info   # uncomment to print NCCL communication logs
export NCCL_IB_HCA=mlx5 NCCL_SOCKET_IFNAME=ib0

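# Reference JSON fragment (not executed). Presumably optional CPU-offload
# settings for the deepspeed.json passed via --deepspeed below; confirm
# before enabling.
#   "offload_optimizer": {
#     "device": "cpu",
#     "pin_memory": true
#   },
#   "offload_param": {
#     "device": "cpu",
#     "pin_memory": true
#   }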

# Training command line. It is listed one option per line and then flattened
# into the single space-separated string APP, which the launcher expands
# unquoted so that it splits back into these same words. Tokens and their
# order are unchanged; only the spacing between them is a single space.
train_cmd=(
  python ./src/train_bash.py
  --deepspeed deepspeed.json
  --model_name_or_path /work/home/hepj/model/Qwen-7B-Chat
  --do_train
  --dataset alpaca_gpt4_zh
  --template chatml
  --finetuning_type lora
  --lora_target c_attn
  --output_dir ./output/ft_qwen
  --per_device_train_batch_size 1
  --gradient_accumulation_steps 1
  --lr_scheduler_type cosine
  --logging_steps 10
  --save_steps 1000
  --learning_rate 5e-5
  --num_train_epochs 3.0
  --fp16
  # --overwrite_cache   (currently disabled)
)
APP="${train_cmd[*]}"
unset train_cmd

# Launch the training command for this process's local rank.
#
# Reads:   lrank (local rank, expected 0-7) and APP (command line held as
#          one space-separated string), both set earlier in this script.
# Exports: HIP_VISIBLE_DEVICES, UCX_NET_DEVICES, UCX_IB_PCI_BW.
# Exits:   1 with a message on stderr when lrank is not a digit 0-7;
#          otherwise the script's status is that of the numactl command.
#
# Local ranks 0-3 are bound to NUMA node 0 and ranks 4-7 to NUMA node 3,
# the same mapping the original per-rank arms used.
case "${lrank}" in
  [0-3]) numa_node=0 ;;
  [4-7]) numa_node=3 ;;
  *)
    # Without this arm an unset or out-of-range rank silently ran nothing
    # and the script still reported success.
    printf '%s: unsupported local rank "%s" (expected 0-7; is OMPI_COMM_WORLD_LOCAL_RANK set?)\n' \
      "${0##*/}" "${lrank}" >&2
    exit 1
    ;;
esac

# Every rank sees all eight GPUs.
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# Each local rank uses the mlx5 device with the same index as the rank.
export UCX_NET_DEVICES="mlx5_${lrank}:1"
export UCX_IB_PCI_BW="mlx5_${lrank}:50Gbs"

# APP is a single string, so it is expanded unquoted on purpose to split it
# into the command and its arguments.
# shellcheck disable=SC2086
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind="${numa_node}" --membind="${numa_node}" ${APP}