#!/bin/bash
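#
# Fine-tune a HuggingFace-format LLaMA-2 / Qwen checkpoint with DeepSpeed,
# launched via torchrun on a single node (--env dsw) or across nodes (--env dlc).
# An example invocation is shown below.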

# bash run_ds_train_huggingface_finetune.sh \
#     --env dsw \
#     --model-size 13B \
#     --micro-batch-size 1 \
#     --gradient-accumulation-steps 2 \
#     --learning-rate 1e-5 \
#     --sequence-length 2048 \
#     --precision bf16 \
#     --zero-stage 2 \
#     --enable-gradient-checkpointing true \
#     --model-name llama2-13b \
#     --flash-attention true \
#     --epoch 2 \
#     --train-dataset /mnt/llama2-datasets/wudao_train.json \
#     --validation-dataset /mnt/llama2-datasets/wudao_valid.json \
#     --pretrain-model-path /mnt/llama2-ckpts/Llama-2-13b-hf \
#     --finetune-output-path /mnt/output_llama2_finetune

function usage() {
    echo '
Usage: bash run_ds_train_huggingface_finetune.sh \
    [--env ENV default dsw] \
    [--model-size MODEL_SIZE {7B, 13B, 14B, 32B, 65B, 70B}] \
    [--micro-batch-size MICRO_BATCH_SIZE default 1] \
    [--gradient-accumulation-steps GRADIENT_ACCUMULATION_STEPS default 1] \
    [--learning-rate LEARNING_RATE default 1e-5] \
    [--sequence-length SEQUENCE_LENGTH default 2048] \
    [--precision PRECISION default bf16] \
    [--zero-stage ZERO_STAGE default 2] \
    [--enable-gradient-checkpointing ENABLE_GRADIENT_CHECKPOINTING default true] \
    [--model-name MODEL_NAME {llama2-13b, qwen-7b, qwen-14b, qwen1.5-32b, qwen-72b}] \
    [--flash-attention FLASH_ATTENTION default false] \
    [--epoch EPOCH default 1] \
    [--train-dataset TRAIN_DATASET] \
    [--validation-dataset VALIDATION_DATASET] \
    [--pretrain-model-path PRETRAIN_MODEL_PATH] \
    [--finetune-output-path FINETUNE_OUTPUT_PATH]
'
}

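# Default values; options initialised to an empty string are required and validated below.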
ENV="dsw"
MODEL_SIZE=""
MICRO_BATCH_SIZE="1"
GRADIENT_ACCUMULATION_STEPS="1"
LEARNING_RATE="1e-5"
SEQUENCE_LENGTH="2048"
PRECISION="bf16"
ZERO_STAGE="2"
ENABLE_GRADIENT_CHECKPOINTING="true"
MODEL_NAME=""       # llama2-13b, qwen-7b, qwen-14b, qwen1.5-32b, qwen-72b
FLASH_ATTENTION="false"
EPOCH="1"
TRAIN_DATASET=""
VALIDATION_DATASET=""
PRETRAIN_MODEL_PATH=""
FINETUNE_OUTPUT_PATH=""
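
# Parse command-line options; each flag consumes the next token as its value.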
while [[ "$1" != "" ]]; do
    case $1 in
        --env )
            shift
            ENV=$1
            ;;
        --model-size )
            shift
            MODEL_SIZE=$1
            ;;
        --micro-batch-size )
            shift
            MICRO_BATCH_SIZE=$1
            ;;
        --gradient-accumulation-steps )
            shift
            GRADIENT_ACCUMULATION_STEPS=$1
            ;;
        --model-name )
            shift
            MODEL_NAME=$1
            ;;
        --learning-rate )
            shift
            LEARNING_RATE=$1
            ;;
        --sequence-length )
            shift
            SEQUENCE_LENGTH=$1
            ;;
        --precision )
            shift
            PRECISION=$1
            ;;
        --zero-stage )
            shift
            ZERO_STAGE=$1
            ;;
        --enable-gradient-checkpointing )
            shift
            ENABLE_GRADIENT_CHECKPOINTING=$1
            ;;
        --flash-attention )
            shift
            FLASH_ATTENTION=$1
            ;;
        --epoch )
            shift
            EPOCH=$1
            ;;
        --train-dataset )
            shift
            TRAIN_DATASET=$1
            ;;
        --validation-dataset )
            shift
            VALIDATION_DATASET=$1
            ;;
        --pretrain-model-path )
            shift
            PRETRAIN_MODEL_PATH=$1
            ;;
        --finetune-output-path )
            shift
            FINETUNE_OUTPUT_PATH=$1
            ;;
        -h | --help )
            usage
            exit 0
            ;;
        * )
            echo "Unknown argument ${1}"
            usage
            exit 1
            ;;
    esac
    shift
done

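# Abort if any required setting is still empty.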
for i in "ENV=$ENV" \
    "MODEL_SIZE=$MODEL_SIZE" \
    "MICRO_BATCH_SIZE=$MICRO_BATCH_SIZE" \
    "GRADIENT_ACCUMULATION_STEPS=$GRADIENT_ACCUMULATION_STEPS" \
    "LEARNING_RATE=$LEARNING_RATE" \
    "SEQUENCE_LENGTH=$SEQUENCE_LENGTH" \
    "PRECISION=$PRECISION" \
    "ZERO_STAGE=$ZERO_STAGE" \
    "ENABLE_GRADIENT_CHECKPOINTING=$ENABLE_GRADIENT_CHECKPOINTING" \
    "FLASH_ATTENTION=$FLASH_ATTENTION" \
    "MODEL_NAME=$MODEL_NAME" \
    "EPOCH=$EPOCH" \
    "TRAIN_DATASET=$TRAIN_DATASET" \
    "VALIDATION_DATASET=$VALIDATION_DATASET" \
    "PRETRAIN_MODEL_PATH=$PRETRAIN_MODEL_PATH" \
    "FINETUNE_OUTPUT_PATH=$FINETUNE_OUTPUT_PATH"
do
    config=(${i//=/ })
    config_name=${config[0]}
    config_value=${config[1]}
    if [ -z "$config_value" ]; then
        echo "$config_name is null"
        usage
        exit 1
    fi
done


set -e
export CUDA_DEVICE_MAX_CONNECTIONS=1
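
# dsw: single-node run on the local GPUs with a random rendezvous port.
# dlc: multi-node run that takes WORLD_SIZE, RANK and the per-container GPU count
#      from scheduler-provided environment variables (MASTER_ADDR and MASTER_PORT
#      are expected to be set by that environment as well).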
if [ $ENV = dsw ]; then
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    MASTER_ADDR=localhost
    MASTER_PORT=$(shuf -n 1 -i 10000-65535)
    NNODES=1
    NODE_RANK=0
    GPUS_PER_NODE=8
elif [ $ENV = dlc ]; then
    NNODES=${WORLD_SIZE}
    NODE_RANK=${RANK}
    GPUS_PER_NODE=${KUBERNETES_CONTAINER_RESOURCE_GPU}
fi

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
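
# Global batch size = micro batch size x gradient accumulation steps x total GPU count.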
GLOBAL_BATCH_SIZE=$(( ${MICRO_BATCH_SIZE} * ${GRADIENT_ACCUMULATION_STEPS} * ${GPUS_PER_NODE} * ${NNODES}))

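# Architecture hyperparameters (layer count, hidden size, attention heads,
# FFN intermediate size) for each supported model size.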
if [ $MODEL_SIZE = 7B ]; then

    NUM_LAYERS=32
    HIDDEN_SIZE=4096
    NUM_ATTN_HEADS=32
    INTERMEDIATE_SIZE=11008

elif [ $MODEL_SIZE = 13B ]; then

    NUM_LAYERS=40
    HIDDEN_SIZE=5120
    NUM_ATTN_HEADS=40
    INTERMEDIATE_SIZE=13824

elif [ $MODEL_SIZE = 14B ]; then

    NUM_LAYERS=40
    HIDDEN_SIZE=5120
    NUM_ATTN_HEADS=40
    INTERMEDIATE_SIZE=13696

elif [ $MODEL_SIZE = 32B ]; then

    NUM_LAYERS=64
    HIDDEN_SIZE=5120
    NUM_ATTN_HEADS=40
    INTERMEDIATE_SIZE=27392

elif [ $MODEL_SIZE = 65B ]; then

    NUM_LAYERS=80
    HIDDEN_SIZE=8192
    NUM_ATTN_HEADS=64
    INTERMEDIATE_SIZE=22016

elif [ $MODEL_SIZE = 70B ]; then

    NUM_LAYERS=80
    HIDDEN_SIZE=8192
    NUM_ATTN_HEADS=64
    INTERMEDIATE_SIZE=28672

else

    echo "Unknown model size ${MODEL_SIZE}"
    usage
    exit 1

fi

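# Translate precision, gradient-checkpointing and FlashAttention settings into trainer
# flags; FP16/BF16 are also substituted into the DeepSpeed config template below.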
if [ $PRECISION = fp16 ]; then
    pr_options="--fp16"
    FP16='true'
    BF16='false'
elif [ $PRECISION = bf16 ]; then
    pr_options="--bf16"
    FP16='false'
    BF16='true'
fi

if [ $ENABLE_GRADIENT_CHECKPOINTING = true ]; then
    gc_options="--enable-gradient-checkpointing"
elif [ $ENABLE_GRADIENT_CHECKPOINTING = false ]; then
    gc_options=""
fi

if [ $FLASH_ATTENTION = true ]; then
    flash_options="--flash"
elif [ $FLASH_ATTENTION = false ]; then
    flash_options=""
fi

NAME="${ENV}-ds-train-huggingface-finetune-${MODEL_SIZE}-lr-${LEARNING_RATE}-bs-${MICRO_BATCH_SIZE}-epoch-${EPOCH}-zero-${ZERO_STAGE}"
mkdir -p "${FINETUNE_OUTPUT_PATH}/tensorboard/"
mkdir -p "${FINETUNE_OUTPUT_PATH}/checkpoint/"
mkdir -p "${FINETUNE_OUTPUT_PATH}/log/"
current_time=$(date "+%Y.%m.%d-%H.%M.%S")
LOGGING_DIR="${FINETUNE_OUTPUT_PATH}/log/${NAME}_${current_time}"
mkdir -p ${LOGGING_DIR}

FINETUNE_CHECKPOINT_PATH="${FINETUNE_OUTPUT_PATH}/checkpoint/${NAME}"

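# Arguments forwarded to ds_train_huggingface_finetune.py.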
hf_options="  \
        --load ${PRETRAIN_MODEL_PATH} \
        --save ${FINETUNE_CHECKPOINT_PATH} \
        --train-data ${TRAIN_DATASET} \
        --valid-data ${VALIDATION_DATASET} \
        --num-layers ${NUM_LAYERS} \
        --hidden-size ${HIDDEN_SIZE} \
        --seq-length ${SEQUENCE_LENGTH} \
        --num-attention-heads ${NUM_ATTN_HEADS} \
        --intermediate-size ${INTERMEDIATE_SIZE} \
        --micro-batch-size ${MICRO_BATCH_SIZE} \
        --epochs ${EPOCH} \
        --lr ${LEARNING_RATE} \
        --num-workers 1 \
        --gradient-accumulation-steps ${GRADIENT_ACCUMULATION_STEPS} \
        --logging-dir ${LOGGING_DIR} \
        --model ${MODEL_NAME} \
        ${pr_options} \
        ${gc_options} \
        ${flash_options}
        "

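# ds_config.json is generated from ds_config_TEMPLATE.json by substituting the
# CONFIG_* placeholders below. A minimal sketch of such a template (an assumed
# layout, not necessarily the repo's actual file) could look like:
#   {
#     "train_micro_batch_size_per_gpu": CONFIG_MBSIZE,
#     "train_batch_size": CONFIG_GBSIZE,
#     "gradient_accumulation_steps": CONFIG_GAS,
#     "zero_optimization": { "stage": CONFIG_ZERO_STATE },
#     "fp16": { "enabled": CONFIG_FP16_ENABLED },
#     "bf16": { "enabled": CONFIG_BF16_ENABLED },
#     "optimizer": { "type": "AdamW", "params": { "lr": CONFIG_LR } }
#   }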
template_json="ds_config_TEMPLATE.json"
config_json="ds_config.json"
sed "s/CONFIG_MBSIZE/${MICRO_BATCH_SIZE}/" ${template_json} \
    | sed "s/CONFIG_ZERO_STATE/${ZERO_STAGE}/" \
    | sed "s/CONFIG_GBSIZE/${GLOBAL_BATCH_SIZE}/" \
    | sed "s/CONFIG_GAS/${GRADIENT_ACCUMULATION_STEPS}/" \
    | sed "s/CONFIG_FP16_ENABLED/${FP16}/" \
    | sed "s/CONFIG_BF16_ENABLED/${BF16}/" \
    | sed "s/CONFIG_LR/${LEARNING_RATE}/" \
    > ${config_json}

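# Echo the full torchrun command, then execute it.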
run_cmd="torchrun $DISTRIBUTED_ARGS ds_train_huggingface_finetune.py ${hf_options}"

echo ${run_cmd}
eval ${run_cmd}
set +x