Commit bc5c7fa7 by wxj: First test commit (parent 70fddd0f)
#!/bin/bash
# Runs the "345M" parameter model
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/gpt2-vocab.json
MERGE_FILE=<Specify path to file>/gpt2-merges.txt
DATA_PATH=<Specify path and file prefix>_text_document
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
GPT_ARGS="
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--micro-batch-size 8 \
--global-batch-size 64 \
--lr 0.00015 \
--train-iters 500000 \
--lr-decay-iters 320000 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--lr-warmup-fraction .01 \
--clip-grad 1.0 \
--fp16
"
DATA_ARGS="
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--split 949,50,1
"
OUTPUT_ARGS="
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10
"
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
#!/bin/bash
# Runs the "345M" parameter model
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/gpt2-vocab.json
MERGE_FILE=<Specify path to file>/gpt2-merges.txt
DATA_PATH=<Specify path and file prefix>_text_document
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
GPT_ARGS="
--tensor-model-parallel-size 2 \
--pipeline-model-parallel-size 2 \
--sequence-parallel \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--micro-batch-size 4 \
--global-batch-size 16 \
--lr 0.00015 \
--train-iters 500000 \
--lr-decay-iters 320000 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--lr-warmup-fraction .01 \
--clip-grad 1.0 \
--fp16
"
DATA_ARGS="
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--split 949,50,1
"
OUTPUT_ARGS="
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10
"
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
#! /bin/bash
# Runs the "217M" parameter biencoder model for ICT retriever
RANK=0
WORLD_SIZE=1
PRETRAINED_BERT_PATH=<Specify path of pretrained BERT model>
TEXT_DATA_PATH=<Specify path and file prefix of the text data>
TITLE_DATA_PATH=<Specify path and file prefix of the titles>
CHECKPOINT_PATH=<Specify path>
python pretrain_ict.py \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--tensor-model-parallel-size 1 \
--micro-batch-size 32 \
--seq-length 256 \
--max-position-embeddings 512 \
--train-iters 100000 \
--vocab-file bert-vocab.txt \
--tokenizer-type BertWordPieceLowerCase \
--DDP-impl torch \
--bert-load ${PRETRAINED_BERT_PATH} \
--log-interval 100 \
--eval-interval 1000 \
--eval-iters 10 \
--retriever-report-topk-accuracies 1 5 10 20 100 \
--retriever-score-scaling \
--load $CHECKPOINT_PATH \
--save $CHECKPOINT_PATH \
--data-path ${TEXT_DATA_PATH} \
--titles-data-path ${TITLE_DATA_PATH} \
--lr 0.0001 \
--lr-decay-style linear \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction 0.01 \
--save-interval 4000 \
--exit-interval 8000 \
--query-in-block-prob 0.1 \
--fp16
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/t5-vocab.txt
DATA_PATH=<Specify path and file prefix>_text_sentence
T5_ARGS="
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--kv-channels 64 \
--ffn-hidden-size 3072 \
--encoder-seq-length 512 \
--decoder-seq-length 128 \
--max-position-embeddings 512 \
--micro-batch-size 16 \
--global-batch-size 16 \
--lr 0.0001 \
--train-iters 1000000 \
--lr-decay-iters 1000000 \
--lr-decay-style linear \
--min-lr 0.00001 \
--weight-decay 1e-2 \
--lr-warmup-fraction .01 \
--clip-grad 1.0 \
--fp16 \
--vocab-extra-ids 100
"
DATA_ARGS="
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--split 949,50,1
"
OUTPUT_ARGS="
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10
"
torchrun pretrain_t5.py \
$T5_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/t5-vocab.txt
DATA_PATH=<Specify path and file prefix>_text_sentence
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
T5_ARGS="
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--kv-channels 64 \
--ffn-hidden-size 3072 \
--encoder-seq-length 512 \
--decoder-seq-length 128 \
--max-position-embeddings 512 \
--micro-batch-size 16 \
--global-batch-size 128 \
--lr 0.0001 \
--train-iters 1000000 \
--lr-decay-iters 1000000 \
--lr-decay-style linear \
--min-lr 0.00001 \
--weight-decay 1e-2 \
--lr-warmup-fraction .01 \
--clip-grad 1.0 \
--fp16 \
--vocab-extra-ids 100
"
DATA_ARGS="
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--split 949,50,1
"
OUTPUT_ARGS="
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10
"
torchrun $DISTRIBUTED_ARGS pretrain_t5_core.py \
$T5_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/t5-vocab.txt
DATA_PATH=<Specify path and file prefix>_text_sentence
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
T5_ARGS="
--tensor-model-parallel-size 2 \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--kv-channels 64 \
--ffn-hidden-size 3072 \
--encoder-seq-length 512 \
--decoder-seq-length 128 \
--max-position-embeddings 512 \
--micro-batch-size 16 \
--global-batch-size 128 \
--lr 0.0001 \
--train-iters 1000000 \
--lr-decay-iters 1000000 \
--lr-decay-style linear \
--min-lr 0.00001 \
--weight-decay 1e-2 \
--lr-warmup-fraction .01 \
--clip-grad 1.0 \
--fp16 \
--vocab-extra-ids 100
"
DATA_ARGS="
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--split 949,50,1
"
OUTPUT_ARGS="
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10
"
torchrun $DISTRIBUTED_ARGS pretrain_t5.py \
$T5_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
#! /bin/bash
# Pre-trains a ViT-based image classification model
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_SL=1
# Training and validation paths should each point to a folder where each
# sub-folder contains a collection of images in jpg or png format
# e.g., if using ImageNet, one training image might be train_data/n01688243/n01688243_11301.JPEG
DATA_PATH_TRAIN=<Specify train data path>
DATA_PATH_VAL=<Specify validation data path>
CHECKPOINT_PATH=<Specify path>
CLASSIFIER_ARGS="
--tensor-model-parallel-size 1 \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--patch-dim 4 \
--seq-length 3136 \
--max-position-embeddings 3136 \
--img-h 224 \
--img-w 224 \
--mask-factor 1.0 \
--fp16 \
--train-iters 750000 \
--lr-decay-style cosine \
--micro-batch-size 4 \
--global-batch-size 1024 \
--lr 0.0005 \
--min-lr 0.00001 \
--attention-dropout 0.0 \
--weight-decay 0.05 \
--lr-warmup-iters 12500 \
--clip-grad 1.0 \
--no-gradient-accumulation-fusion \
--num-workers 4 \
--DDP-impl torch "
DATA_ARGS="
--tokenizer-type NullTokenizer \
--vocab-size 0 \
--data-path $DATA_PATH_TRAIN $DATA_PATH_VAL \
--no-data-sharding \
--split 949,50,1 \
"
OUTPUT_ARGS="
--log-interval 32 \
--save-interval 10000 \
--eval-interval 2500 \
--eval-iters 100 \
--tensorboard-dir ${CHECKPOINT_PATH} \
"
torchrun pretrain_vision_classification.py \
$CLASSIFIER_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
#! /bin/bash
# Pre-trains a DINO v1 model
# For model details: https://arxiv.org/abs/2104.14294
# For the original authors' implementation: https://github.com/facebookresearch/dino/tree/main
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_SL=1
# Training and validation paths should each point to a folder where each
# sub-folder contains a collection of images in jpg or png format
# e.g., if using ImageNet, one training image might be train_data/n01688243/n01688243_11301.JPEG
DATA_PATH_TRAIN=<Specify train data path>
DATA_PATH_VAL=<Specify validation data path>
CHECKPOINT_PATH=<Specify path>
DINO_ARGS="
--vision-pretraining-type dino \
--tensor-model-parallel-size 1 \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--patch-dim 4 \
--seq-length 3136 \
--max-position-embeddings 3136 \
--img-h 224 \
--img-w 224 \
--mask-factor 1.0 \
--fp16 \
--train-iters 750000 \
--lr-decay-style cosine \
--micro-batch-size 4 \
--global-batch-size 1024 \
--lr 0.0005 \
--min-lr 0.00001 \
--attention-dropout 0.0 \
--weight-decay 0.05 \
--lr-warmup-iters 12500 \
--clip-grad 1.0 \
--no-gradient-accumulation-fusion \
--num-workers 4 \
--DDP-impl torch "
DATA_ARGS="
--tokenizer-type NullTokenizer \
--vocab-size 0 \
--data-path $DATA_PATH_TRAIN $DATA_PATH_VAL \
--no-data-sharding \
--split 949,50,1 \
"
OUTPUT_ARGS="
--log-interval 32 \
--save-interval 10000 \
--eval-interval 2500 \
--eval-iters 100 \
--tensorboard-dir ${CHECKPOINT_PATH} \
"
torchrun pretrain_vision_dino.py \
$DINO_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
#! /bin/bash
# Pre-trains a ViT-based image inpainting model
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_SL=1
# Training and validation paths should each point to a folder where each
# sub-folder contains a collection of images in jpg or png format
# e.g., if using ImageNet, one training image might be train_data/n01688243/n01688243_11301.JPEG
DATA_PATH_TRAIN=<Specify train data path>
DATA_PATH_VAL=<Specify validation data path>
CHECKPOINT_PATH=<Specify path>
INPAINT_ARGS="
--vision-pretraining-type inpaint \
--tensor-model-parallel-size 1 \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--patch-dim 4 \
--seq-length 3136 \
--max-position-embeddings 3136 \
--img-h 224 \
--img-w 224 \
--mask-factor 1.0 \
--fp16 \
--train-iters 750000 \
--lr-decay-style cosine \
--micro-batch-size 4 \
--global-batch-size 1024 \
--lr 0.0005 \
--min-lr 0.00001 \
--attention-dropout 0.0 \
--weight-decay 0.05 \
--lr-warmup-iters 12500 \
--clip-grad 1.0 \
--no-gradient-accumulation-fusion \
--num-workers 4 \
--DDP-impl torch "
DATA_ARGS="
--tokenizer-type NullTokenizer \
--vocab-size 0 \
--data-path $DATA_PATH_TRAIN $DATA_PATH_VAL \
--no-data-sharding \
--split 949,50,1 \
"
OUTPUT_ARGS="
--log-interval 32 \
--save-interval 10000 \
--eval-interval 2500 \
--eval-iters 100 \
--tensorboard-dir ${CHECKPOINT_PATH} \
"
torchrun pretrain_vision_inpaint.py \
$INPAINT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
#!/bin/bash
# Train a vision language model.
# Default arguments here use a mock dataset. Please edit the arguments to your liking.
export CUDA_DEVICE_MAX_CONNECTIONS=1
# Check that the user has set an output path for model checkpoints.
if [[ -z $CHECKPOINT_PATH ]]; then
echo "Please set CHECKPOINT_PATH for storing your model checkpoints."
exit 1
fi
DISTRIBUTED_ARGS="
--nproc_per_node 8 \
"
# Note: the learning rate and other hyperparameters used here are just examples and not optimized in any way.
GPT_ARGS="
--num-layers 24 \
--hidden-size 512 \
--num-attention-heads 16 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--micro-batch-size 2 \
--global-batch-size 16 \
--lr 0.00015 \
--train-iters 10000 \
--lr-decay-iters 3200 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--lr-warmup-fraction .01 \
--clip-grad 1.0 \
--fp16
"
IMG_ARGS="
--img-h 336 \
--img-w 336 \
--patch-dim 14
"
DATA_ARGS="
--split 949,50,1
--tokenizer-type NullTokenizer
--vocab-size=8192
"
OUTPUT_ARGS="
--log-interval 100 \
--save-interval 5000 \
--eval-interval 1000 \
--eval-iters 10
"
# Select one of the cases below.
# Multi GPU
# torchrun $DISTRIBUTED_ARGS \
# Single GPU
# CUDA_VISIBLE_DEVICES=0 python -u \
# Single GPU with a debugger
# CUDA_VISIBLE_DEVICES=0 python -u -m debugpy --listen 0.0.0.0:5678 --wait-for-client \
torchrun $DISTRIBUTED_ARGS \
pretrain_vlm.py \
$GPT_ARGS \
$IMG_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
# RETRO MODEL
## Table of contents
- [1. Training Setup](#1-training-setup)
- [2. Data Preprocessing](#2-data-preprocessing)
- [3. Configurations](#3-configurations)
## 1. Training setup
<a id="markdown-training-setup" name="training-setup"></a>
To run the model using a Docker container, run it as follows:
```
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:23.09-py3
CHECKPOINT_PATH="" #<Specify path>
TENSORBOARD_LOGS_PATH=""#<Specify path>
docker run \
--gpus=all \
--ipc=host \
--workdir /workspace/megatron-lm \
-v /path/to/data:/path/to/data \
-v /path/to/megatron-lm:/workspace/megatron-lm \
$PYTORCH_IMAGE \
bash examples/retro/train_retro_2b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH
```
NOTE: Depending on the environment you are running in, the above command might look slightly different.
NOTE: Because Retro preprocesses and caches elements of the pretraining dataset before training begins, some arguments are auto-loaded from the Retro preprocessing configuration. These loaded arguments include:
- `--data-path`
- `--data-cache-path`
- `--eval-interval`
- `--eval-iters`
- `--global-batch-size`
- `--tokenizer-type`
- `--tokenizer-model`
- `--vocab-file`
- `--merge-file`
- `--seed`
- `--seq-length`
- `--train-samples`
## 2. Data Preprocessing
<a id="markdown-data-preprocessing" name="data-preprocessing"></a>
Retro preprocesses and caches data prior to pretraining, to greatly speed up pretraining. During data preprocessing, the retrieval database is built, and neighbor IDs are queried for each sample within the pretraining dataset. Please see `preprocess_data.sh` for an example script to preprocess data for Retro. The reference documentation for data preprocessing can be found [here](tools/retro/README.md).
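As a rough sketch (assuming the `preprocess_data.sh` script included in this folder, which takes the task name as its single argument), the preprocessing stages are typically run one task at a time:
```
# Build the retrieval database, train and populate the index,
# then query neighbors for the pretraining samples.
bash preprocess_data.sh db-build
bash preprocess_data.sh index-train
bash preprocess_data.sh index-add
bash preprocess_data.sh query-neighbors
```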
## 3. Configurations
<a id="markdown-configurations" name="configurations"></a>
The example in this folder shows you how to run a 2B model. Below are a few other example configurations.
### 857M
```
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
### 4B
```
--num-layers 48 \
--hidden-size 2560 \
--num-attention-heads 32 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
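To try one of these variants, one option (a sketch only, not a verified recipe) is to substitute the corresponding values into the `RETRO_MODEL_ARGS` and `MODEL_PARALLEL_ARGS` arrays of `train_retro_2b_distributed.sh`, e.g. for the 857M model:
```
# Hypothetical 857M setup using the argument arrays from train_retro_2b_distributed.sh.
# --seq-length is auto-loaded from the Retro project directory, so it is not set here.
RETRO_MODEL_ARGS=(
    --num-layers 24
    --hidden-size 1024
    --num-attention-heads 16
)
MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 1
    --pipeline-model-parallel-size 1
)
```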
#!/bin/bash
set -u
unset NCCL_DEBUG
######## Megatron, Retro dirs. ########
REPO_DIR="<path/to/megatron/repo>"
RETRO_PROJECT_DIR="<path/to/retro/project/directory>"
######## Task (e.g., db, index, query). ########
# This script takes a single argument, which specifies the retro task to be
# performed. The available tasks are: db-build, index-train, index-add, and
# query-neighbors.
# ~~ Examples ~~
# RETRO_TASKS="db-build" # Build the retrieval database
# RETRO_TASKS="index-train" # Train the index
# RETRO_TASKS="index-add" # Add data to the index
# RETRO_TASKS="query-neighbors" # Perform query pretraining for neighbors
# You can also provide the task as a command-line argument when executing the
# script. Example: ./preprocess_data.sh index-add
RETRO_TASKS=$1
######## Data. ########
DATA_BLEND="<see --data-path in arguments.py>"
######## Index. ########
RETRO_INDEX_STR="OPQ32_64,IVF65536_HNSW8,PQ32"
RETRO_INDEX_NTRAIN=66625331
RETRO_INDEX_TRAIN_LOAD_FRACTION=0.97
RETRO_INDEX_ADD_LOAD_FRACTION=0.95
######## GPT. ########
RETRO_GPT_SEED=1234
RETRO_GPT_SPLIT="98,2,0"
RETRO_GPT_DATA_PATH=${DATA_BLEND}
RETRO_GPT_TRAIN_SAMPLES=200000
RETRO_GPT_EVAL_INTERVAL=2000
RETRO_GPT_EVAL_ITERS=50
RETRO_GPT_LR_DECAY_SAMPLES=175000
RETRO_GPT_LR_WARMUP_SAMPLES=10000
RETRO_GPT_SEQ_LENGTH=2048
RETRO_GPT_GLOBAL_BATCH_SIZE=256
RETRO_GPT_CHUNK_LENGTH=64
######## Query. ########
RETRO_QUERY_NUM_NEIGHBORS_QUERY=200
RETRO_QUERY_NUM_NEIGHBORS_SAVE=20
RETRO_QUERY_EF_SEARCH=32
RETRO_QUERY_NPROBE=4096
######## Args. ########
ARGS=" \
--distributed-timeout-minutes 600 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 1 \
--global-batch-size ${RETRO_GPT_GLOBAL_BATCH_SIZE} \
--seq-length 512 \
--max-position-embeddings 512 \
--load ${RETRO_PROJECT_DIR}/checkpoints/bert \
--exit-on-missing-checkpoint \
--no-load-optim \
--data-path [null] \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file ${RETRO_PROJECT_DIR}/tokenizer/bert-large-uncased-vocab.txt \
--split ${RETRO_GPT_SPLIT} \
--distributed-backend nccl \
--lr 0.0001 \
--lr-decay-style linear \
--min-lr 1.0e-5 \
--train-samples ${RETRO_GPT_TRAIN_SAMPLES} \
--lr-decay-samples ${RETRO_GPT_LR_DECAY_SAMPLES} \
--lr-warmup-samples ${RETRO_GPT_LR_WARMUP_SAMPLES} \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--eval-interval ${RETRO_GPT_EVAL_INTERVAL} \
--eval-iters ${RETRO_GPT_EVAL_ITERS} \
--bf16 \
--no-data-sharding \
--no-gradient-accumulation-fusion \
--no-async-tensor-model-parallel-allreduce \
--bert-embedder-type megatron \
--output-bert-embeddings \
\
--retro-project-dir ${RETRO_PROJECT_DIR} \
--retro-tasks ${RETRO_TASKS} \
--retro-bert-vocab-file tokenizer/bert-large-uncased-vocab.txt \
--retro-bert-tokenizer-type BertWordPieceLowerCase \
\
--retro-gpt-seed ${RETRO_GPT_SEED} \
--retro-gpt-tokenizer-type GPTSentencePieceTokenizer \
--retro-gpt-tokenizer-model /path/to/tokenizer/model \
--retro-gpt-seq-length ${RETRO_GPT_SEQ_LENGTH} \
--retro-gpt-chunk-length ${RETRO_GPT_CHUNK_LENGTH} \
--retro-gpt-global-batch-size ${RETRO_GPT_GLOBAL_BATCH_SIZE} \
--retro-gpt-eval-interval ${RETRO_GPT_EVAL_INTERVAL} \
--retro-gpt-eval-iters ${RETRO_GPT_EVAL_ITERS} \
--retro-gpt-split ${RETRO_GPT_SPLIT} \
--retro-gpt-data-path ${RETRO_GPT_DATA_PATH} \
--retro-gpt-train-samples ${RETRO_GPT_TRAIN_SAMPLES} \
\
--retro-index-str ${RETRO_INDEX_STR} \
--retro-index-ntrain ${RETRO_INDEX_NTRAIN} \
--retro-index-train-load-fraction ${RETRO_INDEX_TRAIN_LOAD_FRACTION} \
--retro-index-add-load-fraction ${RETRO_INDEX_ADD_LOAD_FRACTION} \
--no-retro-index-delete-training-embeddings \
--no-retro-index-delete-added-codes \
\
--retro-query-num-neighbors-query ${RETRO_QUERY_NUM_NEIGHBORS_QUERY} \
--retro-query-num-neighbors-save ${RETRO_QUERY_NUM_NEIGHBORS_SAVE} \
--retro-query-ef-search ${RETRO_QUERY_EF_SEARCH} \
--retro-query-nprobe ${RETRO_QUERY_NPROBE} \
"
######## Command. ########
NPROCS=8 # Number of GPUs.
CMD="\
cd ${REPO_DIR} && pwd && \
export PYTHONPATH=$PYTHONPATH:${REPO_DIR} && \
python -m torch.distributed.run \
--nproc_per_node ${NPROCS} \
--nnodes 1 \
--node_rank ${NODE_RANK} \
--master_addr ${MASTER_ADDR} \
--master_port 6000 \
tools/retro/preprocess_data.py ${ARGS} \
"
echo "~~~~~~~~~~~~~~~~~~~~~~~~~~"
echo "CMD = '$CMD'."
echo "~~~~~~~~~~~~~~~~~~~~~~~~~~"
eval $CMD
#!/bin/bash
# Runs the "307M" parameter Retro model.
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NUM_NODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
CHECKPOINT_PATH=$1 #<Specify path>
TENSORBOARD_LOGS_PATH=$2 #<Specify path>
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
######## GPT or Retro? ########
# 0 : GPT.
# 1 : Retro
ADD_RETRIEVER=1
######## Megatron, Retro dirs. ########
RETRO_PROJECT_DIR="<path/to/retro/project/directory>"
######## Model, training args. ########
# ** Note: --seq-length auto loaded from Retro project dir.
RETRO_MODEL_ARGS=(
--num-layers 32
--hidden-size 2048
--num-attention-heads 32
)
# ** Note: --data-path, --tokenizer-type, and --tokenizer-model auto loaded from Retro project dir.
DATA_ARGS=(
--split 98,2,0
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 8
--pipeline-model-parallel-size 1
)
# ** Note: --eval-interval, --eval-iters auto loaded from Retro project dir.
EVAL_AND_LOGGING_ARGS=(
--log-interval 100
--save-interval 10000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
TRAINING_ARGS=" \
--retro-project-dir ${RETRO_PROJECT_DIR} \
--use-mcore-models \
--transformer-impl transformer_engine \
--num-workers 8 \
--micro-batch-size 4 \
--lr-decay-samples 166400000 \
--lr-warmup-samples 162761 \
--lr 6.0e-4 \
--min-lr 6.0e-5 \
--lr-decay-style cosine \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.023 \
--log-params-norm \
--log-num-zeros-in-grad \
--bf16 \
--no-data-sharding \
"
if [ "$ADD_RETRIEVER" = "1" ]; then
TRAINING_ARGS+=" --retro-add-retriever"
fi
######## Command. ########
torchrun ${DISTRIBUTED_ARGS[@]} pretrain_retro.py \
${RETRO_MODEL_ARGS[@]} \
${TRAINING_ARGS} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]}
import os
import torch
from torch.optim import Adam
from torch.utils.data import DataLoader
from functools import partial
from pathlib import Path
from megatron.core import parallel_state
from megatron.core import dist_checkpointing
from megatron.core.pipeline_parallel.schedules import get_forward_backward_func
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.models.gpt.gpt_model import GPTModel
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
from megatron.core.datasets.utils import Split
from megatron.core.datasets.gpt_dataset import GPTDatasetConfig, MockGPTDataset
def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1):
    parallel_state.destroy_model_parallel()

    # Torch setup for distributed training
    rank = int(os.environ['LOCAL_RANK'])
    world_size = torch.cuda.device_count()
    torch.cuda.set_device(rank)
    torch.distributed.init_process_group(world_size=world_size, rank=rank)

    # Megatron core distributed training initialization
    parallel_state.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size)

def model_provider():
    """Build the model."""
    transformer_config = TransformerConfig(
        num_layers=2,
        hidden_size=12,
        num_attention_heads=4,
        use_cpu_initialization=True,
        pipeline_dtype=torch.float32)

    gpt_model = GPTModel(
        config=transformer_config,
        transformer_layer_spec=get_gpt_layer_local_spec(),
        vocab_size=100,
        max_sequence_length=64)

    return gpt_model

def get_train_data_iterator():
    config = GPTDatasetConfig(
        random_seed=0,
        sequence_length=64,
        blend=[],
        mock=True,
        reset_position_ids=False,
        reset_attention_mask=False,
        eod_mask_loss=False,
        tokenizer="dummy")

    training_data = MockGPTDataset(Split.train, config)
    train_dataloader = DataLoader(training_data, batch_size=8, shuffle=True)
    train_iterator = iter(train_dataloader)
    return train_iterator

def forward_step_func(data_iterator, model):

    def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor):
        losses = output_tensor.float()
        loss_mask = loss_mask.view(-1).float()
        loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
        # If you have data parallel reduce loss across data parallel groups.
        # If pipeline parallel, loss computation is done only in last stage.
        return loss, {'lm loss': loss}

    data = next(data_iterator)
    tokens = data['tokens'].to(device)
    attention_mask = data['attention_mask'].to(device)
    position_ids = data['position_ids'].to(device)
    labels = data['labels'].to(device)
    loss_mask = data['loss_mask'].to(device)

    output_tensor = model(tokens, position_ids, attention_mask, labels=labels)

    return output_tensor, partial(loss_func, loss_mask)

def save_distributed_checkpoint(checkpoint_path, gpt_model):
    sharded_state_dict = gpt_model.sharded_state_dict(prefix='')
    dist_checkpointing.save(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)

def load_distributed_checkpoint(checkpoint_path, gpt_model):
    sharded_state_dict = gpt_model.sharded_state_dict(prefix='')
    checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
    gpt_model.load_state_dict(checkpoint)
    return gpt_model

if __name__ == "__main__":
    initialize_distributed(tensor_model_parallel_size=2, pipeline_model_parallel_size=1)
    model_parallel_cuda_manual_seed(123)

    gpt_model = model_provider()
    device = torch.device("cuda")
    gpt_model.to(device)

    optim = Adam(gpt_model.parameters())

    train_iterator = get_train_data_iterator()

    forward_backward_func = get_forward_backward_func()

    # Running the model for 5 iterations
    for _ in range(5):
        optim.zero_grad()

        losses_reduced = forward_backward_func(
            forward_step_func=forward_step_func,
            data_iterator=train_iterator,
            model=gpt_model,
            num_microbatches=1,
            seq_length=64,
            micro_batch_size=8,
            decoder_seq_length=64,
            forward_only=False)

        optim.step()

        print(f'Losses reduced : {losses_reduced}')

    # Saving the model
    ckpt_path = os.getcwd() + '/ckpt'
    Path(ckpt_path).mkdir(exist_ok=True)
    save_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path)

    # Loading the model
    gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path)
    gpt_model.to(device)

    print('Successfully loaded the model')
#!/bin/bash
# This example will start serving the 345M model.
DISTRIBUTED_ARGS="--nproc_per_node 1 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT=<Path to checkpoint (e.g /345m)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
export CUDA_DEVICE_MAX_CONNECTIONS=1
pip install flask-restful
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--load ${CHECKPOINT} \
--num-attention-heads 16 \
--max-position-embeddings 1024 \
--tokenizer-type GPT2BPETokenizer \
--fp16 \
--micro-batch-size 1 \
--seq-length 1024 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--seed 42
#!/bin/bash
# This example will start serving the 345M model partitioned with 8-way tensor parallelism.
DISTRIBUTED_ARGS="--nproc_per_node 8 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT=<Path to checkpoint (e.g /345m)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
pip install flask-restful
python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--load ${CHECKPOINT} \
--num-attention-heads 16 \
--max-position-embeddings 1024 \
--tokenizer-type GPT2BPETokenizer \
--fp16 \
--micro-batch-size 1 \
--seq-length 1024 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--seed 42
#!/bin/bash
# SLURM options.
export SLURM_PARTITION=<slurm partition, used to feed -p option in slurm>
export SLURM_ACCOUNT=<slurm account, used to feed -A option in slurm>
# Source code.
export MEGATRON_CODE_DIR=<megatron source code directory>
# This variable is used to mount the relevant part of the filesystem
# inside the docker container. Note that the `MEGATRON_CODE_DIR` and the
# launch directory already get mounted; this variable should be used to
# mount the directories that contain the data and tokenizer files.
export DOCKER_MOUNT_DIR=<megatron dataset and bpe tokenizer vocab path>
# Data and tokenizer files.
MEGATRON_DATA=<path to megatron processed data>
BPE_VOCAB_FILE=<path to bpe vocab file>
BPE_MERGE_FILE=<path to bpe merges file>
# Megatron input parameters.
# `MEGATRON_EXTRA_PARAMS` can be used to provide any extra parameters
# that are not listed here.
export MEGATRON_PARAMS=" ${MEGATRON_EXTRA_PARAMS} \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--micro-batch-size ${MBS} \
--global-batch-size ${GBS} \
--num-layers ${NLS} \
--hidden-size ${HS} \
--num-attention-heads ${NAH} \
--DDP-impl ${DDP} \
--data-path ${MEGATRON_DATA} \
--vocab-file ${BPE_VOCAB_FILE} \
--merge-file ${BPE_MERGE_FILE} \
--log-interval 5 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--train-iters 500 \
--lr-decay-iters 320 \
--lr 0.0001 \
--min-lr 0.00001 \
--lr-decay-style cosine \
--lr-warmup-fraction 0.01 \
--split 969,30,1 \
--eval-iters 100 \
--eval-interval 1000 \
--clip-grad 1.0 \
--fp16 \
--loss-scale 8192 "
# Reproducing Figures in SC21 Paper
This directory contains some of the scripts that were used to produce the
results in the [Megatron paper](https://arxiv.org/pdf/2104.04473.pdf) that is
to appear at [SuperComputing 2021](https://sc21.supercomputing.org/). These
scripts use [Slurm](https://slurm.schedmd.com/documentation.html) with the
[pyxis plugin](https://github.com/NVIDIA/pyxis), but can be modified for other
schedulers as well.
## Git commit
To replicate these results, use Megatron-LM commit 6985e58938d40ad91ac07b0fddcfad8132e1447e.
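For example (assuming the public NVIDIA/Megatron-LM repository on GitHub):
```
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout 6985e58938d40ad91ac07b0fddcfad8132e1447e
```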
## Setup
All the cluster-dependent variables are in [`CONFIG.sh`](./CONFIG.sh). Please
update the unspecified values (in angle brackets `<...>`) before launching any
scripts.
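For illustration, a filled-in `CONFIG.sh` might begin as follows; all values below are placeholders, not recommended defaults:
```
# Placeholder values; replace with your cluster's actual partition, account, and paths.
export SLURM_PARTITION=batch
export SLURM_ACCOUNT=my_account
export MEGATRON_CODE_DIR=/path/to/Megatron-LM
export DOCKER_MOUNT_DIR=/path/to/datasets-and-tokenizer
```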
## Scripts
Below is a list of scripts that can be used to reproduce various figures in our
[paper](https://arxiv.org/pdf/2104.04473.pdf):
* [run_table_1.sh](./run_table_1.sh): Table 1 showing weak-scaling throughput
for GPT models ranging from 1 billion to 1 trillion parameters.
* [run_figure_11.sh](./run_figure_11.sh): Figure 11 showing the weak-scaling
performance of pipeline parallelism.
* [run_figure_12.sh](./run_figure_12.sh): Figure 12 showing the effect of
the interleaved schedule on a 175B GPT model.
* [run_figure_13.sh](./run_figure_13.sh): Figure 13 showing the effect of
different degrees of pipeline and tensor model parallelism on a model with
162.2 billion parameters.
* [run_figure_14.sh](./run_figure_14.sh): Figure 14 showing the effect of
different degrees of data and pipeline model parallelism on a model with
5.9 billion parameters.
* [run_figure_15.sh](./run_figure_15.sh): Figure 15 showing the effect of
different degrees of data and tensor model parallelism on a model with
5.9 billion parameters.
* [run_figure_16.sh](./run_figure_16.sh): Figure 16 showing the effect of
microbatch size.
* [run_figure_17.sh](./run_figure_17.sh): Figure 17 showing the effect of
activation recomputation.
* [run_figure_18.sh](./run_figure_18.sh): Figure 18 showing the effect of
the scatter-gather communication optimization.
#!/bin/bash
sbatch -p ${SLURM_PARTITION} \
-A ${SLURM_ACCOUNT} \
--job-name=${JOB_NAME} \
--nodes=${NNODES} \
--export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh
exit 0
#!/bin/bash
#SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8
THIS_DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p ${THIS_DIR}/logs
CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}"
srun -l \
--container-image "nvcr.io#nvidia/pytorch:20.12-py3" \
--container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \
--output=${THIS_DIR}/logs/%x_%j_$DATETIME.log sh -c "${CMD}"