train.sh 419 Bytes
Newer Older
Xuanlei Zhao's avatar
Xuanlei Zhao committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
NUM_GPU=8
MODEL="mistralai/Mixtral-8x7B-v0.1"
SEQ_LENGTH=2048
BATCH_SIZE=1
LR=0.00001

# hybrid
# torchrun --standalone --nproc_per_node $NUM_GPU \
colossalai run --nproc_per_node $NUM_GPU --hostfile "hostfile" \
    train.py \
    --num_epoch 1 \
    --model_name $MODEL \
    --plugin "hybrid" \
    --batch_size $BATCH_SIZE \
    --lr $LR \
    --zero_stage 1 \
    --pp_size 2 \
    --dp_size 1 \
    --ep_size 8 \