Commit e4575be9 authored by huaerkl

v1.0

#!/bin/bash
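# Fine-tunes a 345M-parameter BERT model on the GLUE MNLI task across 8 GPUs on a single node.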
WORLD_SIZE=8
DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
TRAIN_DATA="data/glue_data/MNLI/train.tsv"
VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \
data/glue_data/MNLI/dev_mismatched.tsv"
PRETRAINED_CHECKPOINT=checkpoints/bert_345m
VOCAB_FILE=bert-vocab.txt
CHECKPOINT_PATH=checkpoints/bert_345m_mnli
python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
--task MNLI \
--seed 1234 \
--train-data $TRAIN_DATA \
--valid-data $VALID_DATA \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file $VOCAB_FILE \
--epochs 5 \
--pretrained-checkpoint $PRETRAINED_CHECKPOINT \
--tensor-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 8 \
--checkpoint-activations \
--lr 5.0e-5 \
--lr-decay-style linear \
--lr-warmup-fraction 0.065 \
--seq-length 512 \
--max-position-embeddings 512 \
--save-interval 500000 \
--save $CHECKPOINT_PATH \
--log-interval 10 \
--eval-interval 100 \
--eval-iters 50 \
--weight-decay 1.0e-1 \
--fp16
#!/bin/bash
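# Fine-tunes a 345M-parameter BERT model on the RACE reading-comprehension task across 8 GPUs on a single node.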
WORLD_SIZE=8
DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
TRAIN_DATA="data/RACE/train/middle"
VALID_DATA="data/RACE/dev/middle \
data/RACE/dev/high"
VOCAB_FILE=bert-vocab.txt
PRETRAINED_CHECKPOINT=checkpoints/bert_345m
CHECKPOINT_PATH=checkpoints/bert_345m_race
python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
--task RACE \
--seed 1234 \
--train-data $TRAIN_DATA \
--valid-data $VALID_DATA \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file $VOCAB_FILE \
--epochs 3 \
--pretrained-checkpoint $PRETRAINED_CHECKPOINT \
--tensor-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 4 \
--checkpoint-activations \
--lr 1.0e-5 \
--lr-decay-style linear \
--lr-warmup-fraction 0.06 \
--seq-length 512 \
--max-position-embeddings 512 \
--save-interval 100000 \
--save $CHECKPOINT_PATH \
--log-interval 10 \
--eval-interval 100 \
--eval-iters 50 \
--weight-decay 1.0e-1 \
--clip-grad 1.0 \
--hidden-dropout 0.1 \
--attention-dropout 0.1 \
--fp16
#!/bin/bash
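# Generates unconditional text samples from a pretrained 345M-parameter GPT-2 checkpoint using top-p sampling.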
CHECKPOINT_PATH=checkpoints/gpt2
VOCAB_FILE=gpt2-vocab.json
MERGE_FILE=gpt2-merges.txt
python tools/generate_samples_gpt.py \
--tensor-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--load $CHECKPOINT_PATH \
--num-attention-heads 16 \
--max-position-embeddings 1024 \
--tokenizer-type GPT2BPETokenizer \
--fp16 \
--batch-size 2 \
--seq-length 1024 \
--out-seq-length 1024 \
--temperature 1.0 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--genfile unconditional_samples.json \
--num-samples 2 \
--top_p 0.9 \
--recompute
#!/bin/bash
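# Merges the tensor-model-parallel partitions of a 345M-parameter BERT checkpoint into a single checkpoint.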
TENSOR_MODEL_PARALLEL_SIZE=2
VOCAB_FILE=bert-vocab.txt
CHECKPOINT_PATH=checkpoints/bert_345m
WORLD_SIZE=$TENSOR_MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \
--model-type BERT \
--tensor-model-parallel-size $TENSOR_MODEL_PARALLEL_SIZE \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file $VOCAB_FILE \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 512 \
--max-position-embeddings 512 \
--load $CHECKPOINT_PATH
#!/bin/bash
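# Pretrains a 345M-parameter BERT model on a single GPU.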
RANK=0
WORLD_SIZE=1
DATA_PATH=<Specify path and file prefix>_text_sentence
CHECKPOINT_PATH=<Specify path>
python pretrain_bert.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 4 \
--global-batch-size 8 \
--seq-length 512 \
--max-position-embeddings 512 \
--train-iters 2000000 \
--lr-decay-iters 990000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file bert-vocab.txt \
--data-impl mmap \
--split 949,50,1 \
--lr 0.0001 \
--min-lr 0.00001 \
--lr-decay-style linear \
--lr-warmup-fraction .01 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#!/bin/bash
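# Pretrains a 345M-parameter BERT model with data parallelism across 8 GPUs on a single node.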
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DATA_PATH=<Specify path and file prefix>_text_sentence
CHECKPOINT_PATH=<Specify path>
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_bert.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 4 \
--global-batch-size 32 \
--seq-length 512 \
--max-position-embeddings 512 \
--train-iters 1000000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file bert-vocab.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.0001 \
--lr-decay-style linear \
--min-lr 1.0e-5 \
--lr-decay-iters 990000 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#!/bin/bash
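# Pretrains a 345M-parameter BERT model with 2-way tensor and 2-way pipeline model parallelism across 8 GPUs.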
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DATA_PATH=<Specify path and file prefix>_text_sentence
VOCAB_FILE=<Specify path to vocab.txt>
CHECKPOINT_PATH=<Specify path>
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_bert.py \
--tensor-model-parallel-size 2 \
--pipeline-model-parallel-size 2 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 2 \
--global-batch-size 16 \
--max-position-embeddings 512 \
--train-iters 1000000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.0001 \
--lr-decay-style linear \
--min-lr 1.0e-5 \
--lr-decay-iters 990000 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#! /bin/bash
# Runs the "345M" parameter model
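# on a single GPU using the deepspeed launcher.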
RANK=0
WORLD_SIZE=1
DATA_PATH=<Specify path and file prefix>_text_document
CHECKPOINT_PATH=<Specify path>
deepspeed --num_gpus 1 pretrain_gpt.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 4 \
--global-batch-size 8 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 500000 \
--lr-decay-iters 320000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file gpt2-vocab.json \
--merge-file gpt2-merges.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--min-lr 1.0e-5 \
--lr-decay-style cosine \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16 \
--rank ${RANK} \
--world_size ${WORLD_SIZE} \
--dist_url tcp://localhost:60000
#!/bin/bash
#SBATCH <SLURM OPTIONS> --nodes=128 --exclusive --ntasks-per-node=8 --job-name=megatron_gpt3_175b
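# Pretrains a 175B-parameter GPT-3-scale model (96 layers, hidden size 12288) on 128 nodes of 8 GPUs under SLURM.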
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p $DIR/logs
DATASET_1="<PATH TO THE FIRST DATASET>"
DATASET_2="<PATH TO THE SECOND DATASET>"
DATASET_3="<PATH TO THE THIRD DATASET>"
DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}"
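# Blended dataset: each path is preceded by its sampling weight (20% / 30% / 50% here).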
options=" \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 16 \
--num-layers 96 \
--hidden-size 12288 \
--num-attention-heads 96 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 1 \
--global-batch-size 1536 \
--rampup-batch-size 16 16 5859375 \
--train-samples 146484375 \
--lr-decay-samples 126953125 \
--lr-warmup-samples 183105 \
--lr 6.0e-5 \
--min-lr 6.0e-6 \
--lr-decay-style cosine \
--log-interval 10 \
--eval-iters 40 \
--eval-interval 1000 \
--data-path ${DATASET} \
--vocab-file <PATH TO gpt-vocab.json> \
--merge-file <PATH TO gpt-merges.txt> \
--save-interval 1000 \
--save <PATH TO CHECKPOINTS DIRECTORY> \
--load <PATH TO CHECKPOINTS DIRECTORY> \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.006 \
--tensorboard-dir <TENSORBOARD DIRECTORY> \
--fp16 \
--checkpoint-activations "
run_cmd="python -u ${DIR}/pretrain_gpt.py $@ ${options}"
srun -l \
--container-image "nvcr.io/nvidia/pytorch:20.12-py3" \
--container-mounts "<DIRECTORIES TO MOUNT>" \
--output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}"
set +x
#! /bin/bash
# Runs the "345M" parameter model
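# with data parallelism across 4 GPUs on a single node.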
GPUS_PER_NODE=4
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DATA_PATH=<Specify path and file prefix>_text_document
CHECKPOINT_PATH=<Specify path>
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_gpt.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 8 \
--global-batch-size 64 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 500000 \
--lr-decay-iters 320000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file gpt2-vocab.json \
--merge-file gpt2-merges.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16 \
--rank ${NODE_RANK} \
--world_size ${WORLD_SIZE} \
--local_rank $NODE_RANK
#! /bin/bash
# Runs the "345M" parameter model
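# with 2-way tensor and 2-way pipeline model parallelism across 4 GPUs.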
GPUS_PER_NODE=4
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DATA_PATH=<Specify path and file prefix>_text_document
CHECKPOINT_PATH=<Specify path>
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_gpt.py \
--tensor-model-parallel-size 2 \
--pipeline-model-parallel-size 2 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 4 \
--global-batch-size 16 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 500000 \
--lr-decay-iters 320000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file gpt2-vocab.json \
--merge-file gpt2-merges.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#! /bin/bash
# Runs the "345M" parameter model
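# on multilingual data (EN/AR/KR/JP) with weighted train/validation/test splits, using the deepspeed launcher on a single GPU.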
RANK=0
WORLD_SIZE=1
# paths to multilingual preprocessed datasets
DATA_EN=<Specify path and file prefix>_text_document
DATA_AR=<Specify path and file prefix>_text_document
DATA_KR=<Specify path and file prefix>_text_document
DATA_JP=<Specify path and file prefix>_text_document
CHECKPOINT_PATH=<Specify path>
deepspeed --num_gpus 1 pretrain_gpt.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 4 \
--global-batch-size 8 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 500000 \
--lr-decay-iters 320000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--train-weighted-split-paths "TRAIN: 0.3 0:0.6 $DATA_EN 1 0:0.6 $DATA_AR 1 0:0.6 $DATA_KR 1 0:0.6 $DATA_JP" \
--valid-weighted-split-paths \
"VALID_EN: 1 0.6:0.8 $DATA_EN" \
"VALID_AR: 1 0.6:0.8 $DATA_AR" \
"VALID_JP: 1 0.6:0.8 $DATA_KR" \
"VALID_KR: 1 0.6:0.8 $DATA_JP" \
"VALID_EN-AR-JP-KR_BALANCED: 1 0.6:0.8 $DATA_EN, 1 0.6:0.8 $DATA_AR, 1 0.6:0.8 $DATA_JP, 1 0.6:0.8 $DATA_KR" \
--test-weighted-split-paths \
"TEST_EN: 1 0.8:1 $DATA_EN" \
"TEST_AR: 1 0.8:1 $DATA_AR" \
"TEST_JP: 1 0.8:1 $DATA_JP" \
"TEST_KR: 1 0.8:1 $DATA_KR" \
"TEST_EN-AR-JP-KR_BALANCED: 1 0.8:1 $DATA_EN, 1 0.8:1 $DATA_AR, 1 0.8:1 $DATA_JP, 1 0.8:1 $DATA_KR" \
--vocab-file gpt2-vocab.json \
--merge-file gpt2-merges.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--min-lr 1.0e-5 \
--lr-decay-style cosine \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#!/bin/bash
# Adapted to use deepspeed on a single node
#
# Multi-node will require either a `hostfile` or switching to `torch.distributed.launch`
# adjust to the number of GPUs to use
N_GPUS=1
CHECKPOINT_PATH=checkpoints/gpt2
VOCAB_FILE=data/gpt2-vocab.json
MERGE_FILE=data/gpt2-merges.txt
DATA_PATH=data/my-gpt2_text_document
RANK=0
WORLD_SIZE=$N_GPUS
GPT_ARGS=" \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--micro-batch-size 4 \
--global-batch-size 8 \
--lr-decay-iters 320000 \
--lr 0.00015 \
--min-lr 1.0e-5 \
--lr-decay-style cosine \
--train-iters 5000 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--fp16 \
--rank ${RANK} \
--world_size ${WORLD_SIZE} \
--local_rank $RANK
"
OUTPUT_ARGS=" \
--log-interval 10 \
--save-interval 500 \
--eval-interval 100 \
--eval-iters 10 \
--checkpoint-activations \
"
DATA_ARGS=" \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
"
ALL_ARGS="$GPT_ARGS $OUTPUT_ARGS $DATA_ARGS"
LAUNCHER="deepspeed --num_gpus $N_GPUS"
CMD="$LAUNCHER pretrain_gpt.py $ALL_ARGS"
echo $CMD
$CMD
#! /bin/bash
# Runs the "345M" parameter model
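# on a single GPU, with DeepSpeed enabled through a JSON config file.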
RANK=0
WORLD_SIZE=1
# DATA_PATH=<Specify path and file prefix>_text_document
# CHECKPOINT_PATH=<Specify path>
DATA_PATH="../data"
CHECKPOINT_PATH="../checkpoint"
DS_CONFIG="./examples/ds_config.json"
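# A minimal ds_config.json sketch for reference (illustrative values, not the
# repository's actual file); the batch settings mirror the flags below
# (micro-batch 4, global batch 8, 1 GPU => 2 accumulation steps):
# {
#   "train_micro_batch_size_per_gpu": 4,
#   "gradient_accumulation_steps": 2,
#   "fp16": { "enabled": true }
# }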
deepspeed --num_gpus 1 pretrain_gpt.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 4 \
--global-batch-size 8 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 500000 \
--lr-decay-iters 320000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file gpt2-vocab.json \
--merge-file gpt2-merges.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--min-lr 1.0e-5 \
--lr-decay-style cosine \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16 \
--rank ${RANK} \
--world_size ${WORLD_SIZE} \
--deepspeed \
--deepspeed_config $DS_CONFIG
#! /bin/bash
# Runs a small 2-layer test model on a partial C4 (English) dataset with a HuggingFace tokenizer
RANK=0
WORLD_SIZE=1
DATA_PATH=GPT2/c4_en_partial_gpt2_text_document
CHECKPOINT_PATH=GPT2
deepspeed --num_gpus 1 pretrain_gpt.py \
--num-layers 2 \
--hidden-size 128 \
--num-attention-heads 4 \
--micro-batch-size 4 \
--global-batch-size 8 \
--seq-length 256 \
--max-position-embeddings 256 \
--train-iters 10000 \
--lr-decay-iters 5000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path t5-small \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--min-lr 1.0e-5 \
--lr-decay-style cosine \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16 \
--tensorboard-dir GPT2
# --vocab-file GPT2/gpt2-vocab.json \
# --merge-file GPT2/gpt2-merges.txt \
#! /bin/bash
# Runs the "217M" parameter biencoder model for ICT retriever
RANK=0
WORLD_SIZE=1
PRETRAINED_BERT_PATH=<Specify path of pretrained BERT model>
TEXT_DATA_PATH=<Specify path and file prefix of the text data>
TITLE_DATA_PATH=<Specify path and file prefix of the titles>
CHECKPOINT_PATH=<Specify path>
python pretrain_ict.py \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--tensor-model-parallel-size 1 \
--micro-batch-size 32 \
--seq-length 256 \
--max-position-embeddings 512 \
--train-iters 100000 \
--vocab-file bert-vocab.txt \
--tokenizer-type BertWordPieceLowerCase \
--DDP-impl torch \
--bert-load ${PRETRAINED_BERT_PATH} \
--log-interval 100 \
--eval-interval 1000 \
--eval-iters 10 \
--retriever-report-topk-accuracies 1 5 10 20 100 \
--retriever-score-scaling \
--load $CHECKPOINT_PATH \
--save $CHECKPOINT_PATH \
--data-path ${TEXT_DATA_PATH} \
--titles-data-path ${TITLE_DATA_PATH} \
--lr 0.0001 \
--lr-decay-style linear \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction 0.01 \
--save-interval 4000 \
--exit-interval 8000 \
--query-in-block-prob 0.1 \
--fp16
#!/bin/bash
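# Pretrains a 12-layer T5 model on a single GPU.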
RANK=0
WORLD_SIZE=1
DATA_PATH=<Specify path and file prefix>
VOCAB_FILE=<Specify path to vocab.txt>
CHECKPOINT_PATH=<Specify path>
python pretrain_t5.py \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--kv-channels 64 \
--ffn-hidden-size 3072 \
--encoder-seq-length 512 \
--decoder-seq-length 128 \
--micro-batch-size 16 \
--global-batch-size 2048 \
--max-position-embeddings 512 \
--train-iters 1000000 \
--lr-decay-iters 1000000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--data-impl mmap \
--split 949,50,1 \
--lr 0.0001 \
--min-lr 0.00001 \
--lr-decay-style linear \
--lr-warmup-fraction .01 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#!/bin/bash
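# Pretrains a 12-layer T5 model with data parallelism across 8 GPUs on a single node.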
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DATA_PATH=<Specify path and file prefix>
VOCAB_FILE=<Specify path to vocab.txt>
CHECKPOINT_PATH=<Specify path>
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_t5.py \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--kv-channels 64 \
--ffn-hidden-size 3072 \
--encoder-seq-length 512 \
--decoder-seq-length 128 \
--micro-batch-size 16 \
--global-batch-size 2048 \
--max-position-embeddings 512 \
--train-iters 1000000 \
--lr-decay-iters 1000000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--data-impl mmap \
--split 949,50,1 \
--lr 0.0001 \
--min-lr 0.00001 \
--lr-decay-style linear \
--lr-warmup-fraction .01 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#!/bin/bash
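# Pretrains a 12-layer T5 model with 2-way tensor model parallelism across 8 GPUs.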
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DATA_PATH=<Specify path and file prefix>
CHECKPOINT_PATH=<Specify path>
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_t5.py \
--tensor-model-parallel-size 2 \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--kv-channels 64 \
--ffn-hidden-size 3072 \
--encoder-seq-length 512 \
--decoder-seq-length 128 \
--micro-batch-size 16 \
--global-batch-size 2048 \
--seq-length 512 \
--max-position-embeddings 512 \
--train-iters 1000000 \
--lr-decay-iters 1000000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file t5-vocab.txt \
--data-impl mmap \
--split 949,50,1 \
--lr 0.0001 \
--min-lr 0.00001 \
--lr-decay-style linear \
--lr-warmup-fraction .01 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#!/bin/bash
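# Pretrains a ViT model, launched once per local rank under Open MPI on a ROCm/HIP system;
# each local rank is bound to a NUMA node and InfiniBand device via numactl and UCX settings.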
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
DATA_PATH="./data"
CHECKPOINT_PATH="./checkpoint"
MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=8
APP="python3 -u pretrain_vit.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size ${MICRO_BATCH_SIZE} \
--global-batch-size ${GLOBAL_BATCH_SIZE} \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 500000 \
--lr-decay-iters 320000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--min-lr 1.0e-5 \
--lr-decay-style cosine \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16 \
--padded_vocab_size 224 \
--rank $RANK \
--world_size $WORLD_SIZE \
"
# --eval-only True \
# --do_test True \
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac