Commit d7bf1ab5 authored by rprenger

Don't need an example for Switch networks until it's more polished

parent 3237cc33
#!/bin/bash
# Pretrains a GPT model with switch-MLP (mixture-of-experts) layers.
CHECKPOINT_DIR="<PATH TO CHECKPOINT DIR>"
TENSORBOARD_DIR="<PATH TO TENSORBOARD DIR>"
DATA_BLEND="<PATH TO DATA BLEND>"
BPE_DIR="<PATH TO BPE DIR>"
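# --num-experts 8 enables switch-MLP layers: a router sends each token to one
# of 8 expert FFNs instead of a single dense MLP.
# LR schedule: linear warmup over 162,761 samples, then cosine decay from 3e-4
# to 3e-5 by 166.4M samples.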
python pretrain_gpt.py \
--num-experts 8 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 4 \
--global-batch-size 256 \
--train-samples 192000000 \
--lr-decay-samples 166400000 \
--lr-warmup-samples 162761 \
--lr 3.0e-4 \
--min-lr 3.0e-5 \
--lr-decay-style cosine \
--log-interval 100 \
--eval-iters 50 \
--eval-interval 2000 \
--data-path ${DATA_BLEND} \
--vocab-file ${BPE_DIR}/gpt2-vocab.json \
--merge-file ${BPE_DIR}/gpt2-merges.txt \
--save-interval 10000 \
--save ${CHECKPOINT_DIR} \
--load ${CHECKPOINT_DIR} \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.02 \
--log-params-norm \
--log-num-zeros-in-grad \
--fp16 \
--DDP-impl torch \
--tensorboard-dir ${TENSORBOARD_DIR} \
--checkpoint-activations
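For context on what --num-experts enables: in a switch MLP each token is routed to exactly one expert FFN (top-1 routing, as in Switch Transformers). The sketch below is not Megatron-LM's implementation; it is a minimal, hypothetical PyTorch illustration of the routing idea (the class name SwitchMLP and all variable names are made up), omitting capacity factors, the load-balancing auxiliary loss, and expert parallelism that real implementations need.

# Minimal sketch of top-1 ("switch") expert routing. NOT Megatron-LM's code;
# names are hypothetical and production concerns are omitted.
import torch
import torch.nn as nn
import torch.nn.functional as F

class SwitchMLP(nn.Module):
    def __init__(self, hidden_size, ffn_size, num_experts):
        super().__init__()
        # Per-token gating logits over the experts.
        self.router = nn.Linear(hidden_size, num_experts)
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Linear(hidden_size, ffn_size),
                          nn.GELU(),
                          nn.Linear(ffn_size, hidden_size))
            for _ in range(num_experts)
        )

    def forward(self, x):  # x: [tokens, hidden_size]
        probs = F.softmax(self.router(x), dim=-1)
        gate, idx = probs.max(dim=-1)  # top-1 expert per token
        out = torch.zeros_like(x)
        for e, expert in enumerate(self.experts):
            mask = idx == e  # tokens routed to expert e
            if mask.any():
                # Scale by the gate probability so the router gets gradient.
                out[mask] = gate[mask].unsqueeze(-1) * expert(x[mask])
        return out

With the settings above (hidden size 1024, 8 experts), each MoE layer holds 8 expert FFNs but each token only pays the compute cost of one, which is the point of switch routing.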