#!/bin/bash
# This example script was contributed by an external user: https://github.com/nrailgun
set -ex
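# Expected to be launched under Open MPI (the OMPI_COMM_WORLD_* variables below come from mpirun),
# for example (exact mpirun flags depend on your cluster):
#   mpirun -np 8 ./single.sh <master_node_ip>
# $1 (the master node IP) is used below as the --dist_url rendezvous address.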
######################################
#####################################
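# ROCm / RCCL runtime tuning; the values below are hardware- and cluster-specific.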
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=5

export HIP_ALLOC_INITIALIZE=0
export GPU_MAX_HW_QUEUES=16

export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=16

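# Per-process rank information exported by Open MPI (mpirun).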
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE

export NCCL_IB_TIMEOUT=22
# Adjust the paths and training configuration below as needed
BASE_PATH=./tmp
DS_CONFIG=${BASE_PATH}/deepspeed.json
DATASET_1="./dataset/my-llama_text_document"
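# Megatron's --data-path takes a weighted list ("weight path [weight path ...]");
# the leading "1" is this dataset's weight.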
DATASET="1 ${DATASET_1}"
CHECKPOINT_PATH=./tmp
TOKENIZER_PATH=./tokenizer.model
export HIP_PRINTF_DEBUG_FOR_FP64=0

TP=1
PP=4
ZERO_STAGE=1

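# LLaMA-7B-class model shape: 32 layers, hidden 4096, SwiGLU FFN 11008,
# 32 attention heads with 32 KV heads (i.e. no grouped-query attention).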
HIDDEN_SIZE=4096
FFN_HIDDEN_SIZE=11008
NUM_LAYERS=32
NUM_HEADS=32
SEQ_LENGTH=4096
NUM_KV_HEADS=32

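# GLOBAL_BATCH_SIZE must be divisible by MICRO_BATCH_SIZE * data-parallel size
# (with TP=1 and PP=4 on a single 8-GPU node, the data-parallel size is 2).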
MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=60
TRAIN_STEPS=250000
LR=3e-4
MIN_LR=3e-5
LR_WARMUP_STEPS=2000
WEIGHT_DECAY=0.1
GRAD_CLIP=1

## Activation checkpointing saves GPU memory, but reduces training speed
activation_checkpoint="true"
#activation_checkpoint="false"

######################################



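# Write the DeepSpeed config: ZeRO (stage set above) with bf16 weights and fp32 gradient accumulation.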
cat <<EOT > $DS_CONFIG
{
  "train_batch_size" : $GLOBAL_BATCH_SIZE,
  "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
  "steps_per_print": 1,
  "zero_optimization": {
    "stage": $ZERO_STAGE
  },
  "bf16": {
    "enabled": true
  },
    "data_types": { 
    "grad_accum_dtype": "fp32" 
  },
  "overlap_comm":true,
  "wall_clock_breakdown": true,
  "zero_allow_untested_optimizer": true
}
EOT

ds_args=""
ds_args=" --deepspeed ${ds_args}"
ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}"
ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}"

if [ "${activation_checkpoint}" = "true" ]; then
  ds_args="--deepspeed-activation-checkpointing ${ds_args}"

  ## old argument for recomputing the transformer layer
  # ds_args="--checkpoint-activations ${ds_args}"

  ## new argument for recomputing the transformer layer
  ds_args="--recompute-granularity full --recompute-method uniform ${ds_args}"
  ## new argument for recomputing only the attention layer
  # ds_args="--recompute-granularity selective ${ds_args}"
fi

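# Megatron-DeepSpeed pretraining command, built as one string and expanded (unquoted) per local rank below.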
APP="python3 -u pretrain_gpt.py \
       --tensor-model-parallel-size $TP \
       --pipeline-model-parallel-size $PP \
       --num-layers $NUM_LAYERS \
       --num-key-value-heads $NUM_KV_HEADS \
       --hidden-size $HIDDEN_SIZE \
       --ffn-hidden-size $FFN_HIDDEN_SIZE \
       --num-attention-heads $NUM_HEADS \
       --micro-batch-size $MICRO_BATCH_SIZE \
       --global-batch-size $GLOBAL_BATCH_SIZE \
       --seq-length $SEQ_LENGTH \
       --max-position-embeddings $SEQ_LENGTH \
       --train-iters $TRAIN_STEPS \
       --save $CHECKPOINT_PATH \
       --load $CHECKPOINT_PATH \
       --data-path $DATASET \
       --data-impl mmap \
       --tokenizer-type GPTSentencePieceTokenizer \
       --tokenizer-model $TOKENIZER_PATH \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr $LR \
       --lr-decay-style cosine \
       --min-lr $MIN_LR \
       --weight-decay $WEIGHT_DECAY \
       --clip-grad $GRAD_CLIP \
       --lr-warmup-iters $LR_WARMUP_STEPS \
       --optimizer adam \
       --adam-beta1 0.9 \
       --adam-beta2 0.95 \
       --log-interval 1 \
       --save-interval 1000 \
       --eval-interval 1000 \
       --eval-iters 1000 \
       --bf16 \
       --no-query-key-layer-scaling \
       --attention-dropout 0 \
       --hidden-dropout 0 \
       --no-gradient-accumulation-fusion \
       --use-rotary-position-embeddings \
       --untie-embeddings-and-output-weights \
       --swiglu \
       --use-flash-attn-triton \
       --normalization rmsnorm \
       --disable-bias-linear \
       $ds_args \
       --rank ${RANK} \
       --world_size ${WORLD_SIZE} \
       --dist_url tcp://${1}:34566 \
       --num-workers 2 \
      "
#--use-flash-attn-triton \
#--use-flash-attn-v2 \
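# Every local rank exports all 8 GPUs; device selection is left to the training process.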
case ${lrank} in
[0-7])
  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
  ${APP}
  ;;
esac