Commit d7bf1ab5 authored by rprenger

Don't need an example for Switch networks until it's more polished

parent 3237cc33
#!/bin/bash
# Pretrains a GPT model with switch-MLP (mixture-of-experts) layers.
CHECKPOINT_DIR="<PATH TO CHECKPOINT DIR>"
TENSORBOARD_DIR="<PATH TO TENSORBOARD DIR>"
DATA_BLEND="<PATH TO DATA BLEND>"
BPE_DIR="<PATH TO BPE DIR>"
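# --num-experts 8 enables switch-MLP layers: a router sends each token to one
# of 8 expert FFNs instead of a single dense MLP.
# LR schedule: linear warmup over 162,761 samples, then cosine decay from 3e-4
# to 3e-5 by 166.4M samples.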
python pretrain_gpt.py \
--num-experts 8 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 4 \
--global-batch-size 256 \
--train-samples 192000000 \
--lr-decay-samples 166400000 \
--lr-warmup-samples 162761 \
--lr 3.0e-4 \
--min-lr 3.0e-5 \
--lr-decay-style cosine \
--log-interval 100 \
--eval-iters 50 \
--eval-interval 2000 \
--data-path ${DATA_BLEND} \
--vocab-file ${BPE_DIR}/gpt2-vocab.json \
--merge-file ${BPE_DIR}/gpt2-merges.txt \
--save-interval 10000 \
--save ${CHECKPOINT_DIR} \
--load ${CHECKPOINT_DIR} \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.02 \
--log-params-norm \
--log-num-zeros-in-grad \
--fp16 \
--DDP-impl torch \
--tensorboard-dir ${TENSORBOARD_DIR} \
--checkpoint-activations
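For context on what --num-experts enables: in a switch MLP each token is routed to exactly one expert FFN (top-1 routing, as in Switch Transformers). The sketch below is not Megatron-LM's implementation; it is a minimal, hypothetical PyTorch illustration of the routing idea (the class name SwitchMLP and all variable names are made up), omitting capacity factors, the load-balancing auxiliary loss, and expert parallelism that real implementations need.

# Minimal sketch of top-1 ("switch") expert routing. NOT Megatron-LM's code;
# names are hypothetical and production concerns are omitted.
import torch
import torch.nn as nn
import torch.nn.functional as F

class SwitchMLP(nn.Module):
    def __init__(self, hidden_size, ffn_size, num_experts):
        super().__init__()
        # Per-token gating logits over the experts.
        self.router = nn.Linear(hidden_size, num_experts)
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Linear(hidden_size, ffn_size),
                          nn.GELU(),
                          nn.Linear(ffn_size, hidden_size))
            for _ in range(num_experts)
        )

    def forward(self, x):  # x: [tokens, hidden_size]
        probs = F.softmax(self.router(x), dim=-1)
        gate, idx = probs.max(dim=-1)  # top-1 expert per token
        out = torch.zeros_like(x)
        for e, expert in enumerate(self.experts):
            mask = idx == e  # tokens routed to expert e
            if mask.any():
                # Scale by the gate probability so the router gets gradient.
                out[mask] = gate[mask].unsqueeze(-1) * expert(x[mask])
        return out

With the settings above (hidden size 1024, 8 experts), each MoE layer holds 8 expert FFNs but each token only pays the compute cost of one, which is the point of switch routing.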