#!/bin/bash

# Runs a GPT model with switch MLP.

# Fill in these paths before running.
CHECKPOINT_DIR=""
TENSORBOARD_DIR=""
DATA_BLEND=""
BPE_DIR=""

python pretrain_gpt.py \
       --num-experts 8 \
       --tensor-model-parallel-size 1 \
       --pipeline-model-parallel-size 1 \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --seq-length 2048 \
       --max-position-embeddings 2048 \
       --micro-batch-size 4 \
       --global-batch-size 256 \
       --train-samples 192000000 \
       --lr-decay-samples 166400000 \
       --lr-warmup-samples 162761 \
       --lr 3.0e-4 \
       --min-lr 3.0e-5 \
       --lr-decay-style cosine \
       --log-interval 100 \
       --eval-iters 50 \
       --eval-interval 2000 \
       --data-path ${DATA_BLEND} \
       --vocab-file ${BPE_DIR}/gpt2-vocab.json \
       --merge-file ${BPE_DIR}/gpt2-merges.txt \
       --save-interval 10000 \
       --save ${CHECKPOINT_DIR} \
       --load ${CHECKPOINT_DIR} \
       --split 98,2,0 \
       --clip-grad 1.0 \
       --weight-decay 0.1 \
       --adam-beta1 0.9 \
       --adam-beta2 0.95 \
       --init-method-std 0.02 \
       --log-params-norm \
       --log-num-zeros-in-grad \
       --fp16 \
       --DDP-impl torch \
       --tensorboard-dir ${TENSORBOARD_DIR} \
       --checkpoint-activations
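
# The invocation above launches a single process. For multi-GPU data-parallel
# training, the same arguments are typically passed through a distributed
# launcher instead of plain `python`. A minimal sketch, assuming a single node
# with 8 GPUs (GPUS_PER_NODE is an illustrative variable, not part of the
# original script); on older PyTorch versions, `python -m torch.distributed.launch`
# plays the role of `torchrun`:
#
#   GPUS_PER_NODE=8
#   torchrun --nproc_per_node ${GPUS_PER_NODE} pretrain_gpt.py \
#          ... (same arguments as above)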