#!/bin/bash

# Pretrains a GPT model with a Switch MLP (mixture-of-experts feed-forward layer).
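#
# Set the four placeholder paths below to real locations before running.
# Note: this invokes a single-process run; for multi-GPU training you would
# typically wrap the python command with torchrun or torch.distributed.launch
# (an assumption here; adapt to your launcher and cluster setup).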

CHECKPOINT_DIR="<PATH TO CHECKPOINT DIR>"
TENSORBOARD_DIR="<PATH TO TENSORBOARD DIR>"
DATA_BLEND="<PATH TO DATA BLEND>"
BPE_DIR="<PATH TO BPE DIR>"

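# --num-experts replaces each transformer MLP block with 8 experts; in a
# Switch-style MoE, each token is typically routed to a single expert.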
python pretrain_gpt.py \
    --num-experts 8 \
    --tensor-model-parallel-size 1 \
    --pipeline-model-parallel-size 1 \
    --num-layers 24 \
    --hidden-size 1024 \
    --num-attention-heads 16 \
    --seq-length 2048 \
    --max-position-embeddings 2048 \
    --micro-batch-size 4 \
    --global-batch-size 256 \
    --train-samples 192000000 \
    --lr-decay-samples 166400000 \
    --lr-warmup-samples 162761 \
    --lr 3.0e-4 \
    --min-lr 3.0e-5 \
    --lr-decay-style cosine \
    --log-interval 100 \
    --eval-iters 50 \
    --eval-interval 2000 \
    --data-path ${DATA_BLEND} \
    --vocab-file ${BPE_DIR}/gpt2-vocab.json \
    --merge-file ${BPE_DIR}/gpt2-merges.txt \
    --save-interval 10000 \
    --save ${CHECKPOINT_DIR} \
    --load ${CHECKPOINT_DIR} \
    --split 98,2,0 \
    --clip-grad 1.0 \
    --weight-decay 0.1 \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --init-method-std 0.02 \
    --log-params-norm \
    --log-num-zeros-in-grad \
    --fp16 \
    --DDP-impl torch \
    --tensorboard-dir ${TENSORBOARD_DIR} \
    --checkpoint-activations