# run_demo.sh - installs requirements.txt and launches opt_train_demo.py with torchrun.
# Shell options:
#   -e  abort as soon as any command fails (e.g. a failed dependency install)
#   -u  treat expansion of an unset variable as an error, so a misspelled
#       setting name cannot silently become an empty command-line argument
#   -x  echo each command before it runs
set -eux

# Install the Python packages listed in requirements.txt.
pip install -r requirements.txt

# Model / output / plugin settings.
# Each value may be overridden by setting the variable of the same name in the
# environment before running this script; when the variable is unset or empty
# the default written here is used.

# model name or path
MODEL="${MODEL:-facebook/opt-350m}"

# path for saving model
OUTPUT_PATH="${OUTPUT_PATH:-./output_model.bin}"

# plugin (training strategy)
# one of "torch_ddp"/"torch_ddp_fp16"/"low_level_zero"/"gemini"/"hybrid_parallel"
# NOTE(review): "hybrid_parallel" was added to this list because it is the
# configured default; confirm against the --plugin choices in opt_train_demo.py.
PLUGIN="${PLUGIN:-hybrid_parallel}"

# Training settings.
# Each value may be overridden by setting the variable of the same name in the
# environment before running this script; when the variable is unset or empty
# the default written here is used.

# number of gpus to use
GPUNUM="${GPUNUM:-4}"

# batch size per gpu
BS="${BS:-16}"

# learning rate
LR="${LR:-5e-5}"

# number of epochs
EPOCH="${EPOCH:-10}"

# weight decay
WEIGHT_DECAY="${WEIGHT_DECAY:-0.01}"

# ratio of warmup steps
WARMUP_RATIO="${WARMUP_RATIO:-0.1}"

# run the script for demo
# Launches opt_train_demo.py through torchrun in --standalone mode, with
# GPUNUM passed as --nproc_per_node and the remaining settings forwarded as
# script arguments. Every expansion is double-quoted so that a value containing
# spaces or glob characters (for example a local model or output path) reaches
# the program as exactly one argument.
torchrun \
  --standalone \
  --nproc_per_node "${GPUNUM}" \
  opt_train_demo.py \
  --model_name_or_path "${MODEL}" \
  --output_path "${OUTPUT_PATH}" \
  --plugin "${PLUGIN}" \
  --batch_size "${BS}" \
  --num_epoch "${EPOCH}" \
  --learning_rate "${LR}" \
  --weight_decay "${WEIGHT_DECAY}" \
  --warmup_ratio "${WARMUP_RATIO}"