Merge branch 'maanug/1gpu-pretrain-examples' into 'main'

Fix pretraining examples

See merge request ADLR/megatron-lm!513

Merge branch 'maanug/1gpu-pretrain-examples' into 'main'
Fix pretraining examples. See merge request ADLR/megatron-lm!513
0b208deb · Jared Casper · 285068c8 · 7625a9d2 · 0b208deb · 0b208deb
Commit 0b208deb authored Feb 17, 2023 by Jared Casper
Showing 3 changed files with 134 additions and 99 deletions

examples/pretrain_bert.sh examples/pretrain_bert.sh +43 -30

examples/pretrain_gpt.sh examples/pretrain_gpt.sh +44 -34

examples/pretrain_t5.sh examples/pretrain_t5.sh +47 -35

No files found.
--- a/examples/pretrain_bert.sh
+++ b/examples/pretrain_bert.sh
 #!/bin/bash
-RANK=0
+export CUDA_DEVICE_MAX_CONNECTIONS=1
-WORLD_SIZE=1
-DATA_PATH=<Specify path and file prefix>_text_sentence
 CHECKPOINT_PATH=<Specify path>
+VOCAB_FILE=<Specify path to file>/bert-vocab.txt
+DATA_PATH=<Specify path and file prefix>_text_sentence
-python pretrain_bert.py \
+BERT_ARGS="
    --num-layers 24 \
    --hidden-size 1024 \
    --num-attention-heads 16 \
-       --micro-batch-size 4 \
-       --global-batch-size 8 \
    --seq-length 512 \
    --max-position-embeddings 512 \
+    --micro-batch-size 4 \
+    --global-batch-size 8 \
+    --lr 0.0001 \
    --train-iters 2000000 \
    --lr-decay-iters 990000 \
-       --save $CHECKPOINT_PATH \
-       --load $CHECKPOINT_PATH \
-       --data-path $DATA_PATH \
-       --vocab-file bert-vocab.txt \
-       --data-impl mmap \
-       --split 949,50,1 \
-       --lr 0.0001 \
-       --min-lr 0.00001 \
    --lr-decay-style linear \
-       --lr-warmup-fraction .01 \
+    --min-lr 0.00001 \
    --weight-decay 1e-2 \
+    --lr-warmup-fraction .01 \
    --clip-grad 1.0 \
+    --fp16
+"
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --vocab-file $VOCAB_FILE \
+    --data-impl mmap \
+    --split 949,50,1
+"
+OUTPUT_ARGS="
    --log-interval 100 \
    --save-interval 10000 \
    --eval-interval 1000 \
-       --eval-iters 10 \
+    --eval-iters 10
-       --fp16
+"
+torchrun pretrain_bert.py \
+    $BERT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH
--- a/examples/pretrain_gpt.sh
+++ b/examples/pretrain_gpt.sh
-#! /bin/bash
+#!/bin/bash
 # Runs the "345M" parameter model
-RANK=0
+export CUDA_DEVICE_MAX_CONNECTIONS=1
-WORLD_SIZE=1
-DATA_PATH=<Specify path and file prefix>_text_document
 CHECKPOINT_PATH=<Specify path>
+VOCAB_FILE=<Specify path to file>/gpt2-vocab.json
+MERGE_FILE=<Specify path to file>/gpt2-merges.txt
+DATA_PATH=<Specify path and file prefix>_text_document
+GPT_ARGS="
-python pretrain_gpt.py \
    --num-layers 24 \
    --hidden-size 1024 \
    --num-attention-heads 16 \
-       --micro-batch-size 4 \
-       --global-batch-size 8 \
    --seq-length 1024 \
    --max-position-embeddings 1024 \
+    --micro-batch-size 4 \
+    --global-batch-size 8 \
+    --lr 0.00015 \
    --train-iters 500000 \
    --lr-decay-iters 320000 \
-       --save $CHECKPOINT_PATH \
-       --load $CHECKPOINT_PATH \
-       --data-path $DATA_PATH \
-       --vocab-file gpt2-vocab.json \
-       --merge-file gpt2-merges.txt \
-       --data-impl mmap \
-       --split 949,50,1 \
-       --distributed-backend nccl \
-       --lr 0.00015 \
-       --min-lr 1.0e-5 \
    --lr-decay-style cosine \
+    --min-lr 1.0e-5 \
    --weight-decay 1e-2 \
-       --clip-grad 1.0 \
    --lr-warmup-fraction .01 \
-       --activations-checkpoint-method uniform \
+    --clip-grad 1.0 \
+    --fp16
+"
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --vocab-file $VOCAB_FILE \
+    --merge-file $MERGE_FILE \
+    --data-impl mmap \
+    --split 949,50,1
+"
+OUTPUT_ARGS="
    --log-interval 100 \
    --save-interval 10000 \
    --eval-interval 1000 \
-       --eval-iters 10 \
+    --eval-iters 10
-       --fp16
+"
+torchrun pretrain_gpt.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH
--- a/examples/pretrain_t5.sh
+++ b/examples/pretrain_t5.sh
 #!/bin/bash
-RANK=0
+export CUDA_DEVICE_MAX_CONNECTIONS=1
-WORLD_SIZE=1
-DATA_PATH=<Specify path and file prefix>
-VOCAB_FILE=<Specify path to vocab.txt>
 CHECKPOINT_PATH=<Specify path>
+VOCAB_FILE=<Specify path to file>/t5-vocab.txt
+DATA_PATH=<Specify path and file prefix>_text_sentence
-python pretrain_t5.py \
+T5_ARGS="
    --num-layers 12 \
    --hidden-size 768 \
    --num-attention-heads 12 \
@@ -14,26 +14,38 @@ python pretrain_t5.py \
    --ffn-hidden-size 3072 \
    --encoder-seq-length 512 \
    --decoder-seq-length 128 \
+    --max-position-embeddings 512 \
    --micro-batch-size 16 \
    --global-batch-size 16 \
-       --max-position-embeddings 512 \
+    --lr 0.0001 \
    --train-iters 1000000 \
    --lr-decay-iters 1000000 \
-       --save $CHECKPOINT_PATH \
-       --load $CHECKPOINT_PATH \
-       --data-path $DATA_PATH \
-       --vocab-file $VOCAB_FILE \
-       --data-impl mmap \
-       --split 949,50,1 \
-       --lr 0.0001 \
-       --min-lr 0.00001 \
    --lr-decay-style linear \
-       --lr-warmup-fraction .01 \
+    --min-lr 0.00001 \
    --weight-decay 1e-2 \
+    --lr-warmup-fraction .01 \
    --clip-grad 1.0 \
+    --fp16 \
+    --vocab-extra-ids 100
+"
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --vocab-file $VOCAB_FILE \
+    --data-impl mmap \
+    --split 949,50,1
+"
+OUTPUT_ARGS="
    --log-interval 100 \
    --save-interval 10000 \
    --eval-interval 1000 \
-       --eval-iters 10 \
+    --eval-iters 10
-       --fp16 \
+"
-       --vocab-extra-ids 100
+torchrun pretrain_t5.py \
+    $T5_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH