#!/bin/bash
# run_train_single.sh
#
# Per-process launcher: exports the runtime/communication environment and
# records this process's rank information from the OMPI_COMM_WORLD_*
# variables, so it is expected to be started by an Open MPI launcher.

# ROCm / MIOpen runtime settings.
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
export MIOPEN_COMPILE_PARALLEL_LEVEL=1

# NCCL communication settings (interface ib0, HCA mlx5_0, INFO-level logging).
export NCCL_PLUGIN_P2P=ucx
export NCCL_SOCKET_IFNAME=ib0
export NCCL_P2P_LEVEL=5
export NCCL_IB_HCA=mlx5_0
export NCCL_DEBUG=INFO
export NCCL_NET_PLUGIN=none

# Local (per-node) rank of this process; empty when the variable is not set.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
echo "LRANK===============================$lrank"

# NOTE(review): RANK and WORLD_SIZE are plain assignments, not exports, and
# nothing else in this script reads them. Child processes only see them if
# they were already present in the inherited environment - confirm whether
# they should be exported.
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
# Training command line used by every local rank.
#
# APP is deliberately a single flat string: the launch site expands it
# unquoted and relies on the shell's word splitting to turn it into separate
# arguments. Consequently no value placed in it may contain whitespace or
# glob characters. All paths are relative, so the script has to be started
# from the directory they are relative to.
LR=1e-5
APP="python3 ../main.py \
    --deepspeed ../deepspeed.json \
    --do_train \
    --train_file AdvertiseGen/train.json \
    --prompt_column content \
    --response_column summary \
    --model_name_or_path THUDM/chatglm-6b \
    --output_dir ./output_ft/pretrain \
    --overwrite_output_dir \
    --max_source_length 64 \
    --max_target_length 64 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --predict_with_generate \
    --max_steps 2000 \
    --logging_steps 5 \
    --save_steps 1000 \
    --learning_rate $LR \
    --fp16 \
    --local_rank $lrank"



# Run the training command under numactl, binding CPU and memory to the NUMA
# node whose index equals this process's local rank. Every supported local
# rank (0-7) is given the same eight visible devices.
#
# A local rank outside 0-7 (including an empty one, e.g. when the script is
# not started through the MPI launcher) is reported on stderr and ends the
# script with a non-zero status instead of silently doing nothing.
case "${lrank}" in
[0-7])
  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
  # APP is a flat command string and must be word-split into arguments here.
  # shellcheck disable=SC2086
  numactl --cpunodebind="${lrank}" --membind="${lrank}" ${APP}
  ;;
*)
  printf '%s: unsupported local rank "%s" (expected 0-7); is OMPI_COMM_WORLD_LOCAL_RANK set?\n' \
    "${0##*/}" "${lrank}" >&2
  exit 1
  ;;
esac