Initial commit

deb8370c · hepj · deb8370c · deb8370c · deb8370c · deb8370c
Commit deb8370c authored Jan 09, 2025 by hepj
20 changed files
--- a/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/CONFIG.sh
+++ b/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/CONFIG.sh
+#!/bin/bash
+# Shared settings for the run_*.sh launchers in this directory, which source
+# this file after setting their per-experiment variables.
+# Every value written as <...> is a placeholder that must be replaced before
+# this file is sourced.
+# SLURM options.
+export SLURM_PARTITION=<slurm partition, used to feed -p option in slurm>
+export SLURM_ACCOUNT=<slurm account, used to feed -A option in slurm>
+# Source code.
+export MEGATRON_CODE_DIR=<megatron source code directory>
+# This variable is used to mount the relevant part of the filesystem
+# inside the docker container. Note that the `MEGATRON_CODE_DIR` and the
+# launch directory already get mounted; this variable should be used to
+# mount the directories that contain the data and tokenizer files.
+export DOCKER_MOUNT_DIR=<megatron dataset and bpe tokenizer vocab path>
+# Data and tokenizer files.
+# These three are not exported; they are only interpolated into
+# MEGATRON_PARAMS below, at the moment this file is sourced.
+MEGATRON_DATA=<path to megatron processed data>
+BPE_VOCAB_FILE=<path to bpe vocab file>
+BPE_MERGE_FILE=<path to bpe merges file>
+# Megatron input parameters.
+# `MEGATRON_EXTRA_PARAMS` can be used to provide any extra parameters
+# that are not listed here. 
+# The string below is expanded immediately, so TP, PP, MBS, GBS, NLS, HS, NAH,
+# DDP and MEGATRON_EXTRA_PARAMS must already be set by the sourcing script;
+# any that is unset expands to an empty string.
+export MEGATRON_PARAMS=" ${MEGATRON_EXTRA_PARAMS} \
+	--tensor-model-parallel-size ${TP} \
+	--pipeline-model-parallel-size ${PP} \
+	--micro-batch-size ${MBS} \
+	--global-batch-size ${GBS} \
+        --num-layers ${NLS} \
+        --hidden-size ${HS} \
+        --num-attention-heads ${NAH} \
+	--DDP-impl ${DDP} \
+	--data-path ${MEGATRON_DATA} \
+	--vocab-file ${BPE_VOCAB_FILE} \
+	--merge-file ${BPE_MERGE_FILE} \
+        --log-interval 5 \
+        --seq-length 2048 \
+        --max-position-embeddings 2048 \
+        --train-iters 500 \
+        --lr-decay-iters 320 \
+        --lr 0.0001 \
+	--min-lr 0.00001 \
+        --lr-decay-style cosine \
+        --lr-warmup-fraction 0.01 \
+        --split 969,30,1 \
+        --eval-iters 100 \
+        --eval-interval 1000 \
+        --clip-grad 1.0 \
+        --fp16 \
+	--loss-scale 8192 "
--- a/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/README.md
+++ b/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/README.md
+# Reproducing Figures in SC21 Paper
+This directory contains some of the scripts that were used to produce the
+results in the [Megatron paper](https://arxiv.org/pdf/2104.04473.pdf) that is
+to appear at [SuperComputing 2021](https://sc21.supercomputing.org/). These
+scripts use [Slurm](https://slurm.schedmd.com/documentation.html) with the
+[pyxis plugin](https://github.com/NVIDIA/pyxis), but can be modified for other
+schedulers as well.
+## Git commit
+To replicate these results use Megatron-LM commit: 6985e58938d40ad91ac07b0fddcfad8132e1447e
+## Setup
+All the cluster-dependent variables are in [`CONFIG.sh`](./CONFIG.sh). Please
+update the unspecified values (in angle brackets `<...>`) before launching any
+scripts.
+## Scripts
+Below is a list of scripts that can be used to reproduce various figures in our
+[paper](https://arxiv.org/pdf/2104.04473.pdf):
+* [run_table_1.sh](./run_table_1.sh): Table 1 showing weak-scaling throughput
+for GPT models ranging from 1 billion to 1 trillion parameters.
+* [run_figure_11.sh](./run_figure_11.sh): Figure 11 showing the weak-scaling
+performance of pipeline parallelism.
+* [run_figure_12.sh](./run_figure_12.sh): Figure 12 showing the effect of
+the interleaved schedule on a 175B GPT model.
+* [run_figure_13.sh](./run_figure_13.sh): Figure 13 showing the effect of
+different degrees of pipeline and tensor model parallelism on a model with
+162.2 billion parameters.
+* [run_figure_14.sh](./run_figure_14.sh): Figure 14 showing the effect of
+different degrees of data and pipeline model parallelism on a model with
+5.9 billion parameters.
+* [run_figure_15.sh](./run_figure_15.sh): Figure 15 showing the effect of
+different degrees of data and tensor model parallelism on a model with
+5.9 billion parameters.
+* [run_figure_16.sh](./run_figure_16.sh): Figure 16 showing the effect of
+microbatch size.
+* [run_figure_17.sh](./run_figure_17.sh): Figure 17 showing the effect of
+activation recomputation.
+* [run_figure_18.sh](./run_figure_18.sh): Figure 18 showing the effect of
+the scatter-gather communication optimization.
--- a/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/SBATCH.sh
+++ b/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/SBATCH.sh
+#!/bin/bash
+sbatch -p ${SLURM_PARTITION} \
+       -A ${SLURM_ACCOUNT} \
+       --job-name=${JOB_NAME} \
+       --nodes=${NNODES} \
+       --export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh
+exit 0
--- a/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/SRUN.sh
+++ b/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/SRUN.sh
+#!/bin/bash
+#SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8
+# Batch-job body: runs pretrain_gpt.py inside the container through srun.
+#
+# Reads MEGATRON_CODE_DIR, MEGATRON_PARAMS and DOCKER_MOUNT_DIR from the
+# environment. Logs are written to ./logs under the directory the job starts in.
+THIS_DIR=$(pwd)
+DATETIME=$(date +'date_%y-%m-%d_time_%H-%M-%S')
+mkdir -p "${THIS_DIR}/logs"
+# MEGATRON_PARAMS is a flat option string; it is handed to `sh -c` as one
+# command line so the shell inside the container splits it into arguments.
+CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}"
+srun -l \
+     --container-image "nvcr.io#nvidia/pytorch:20.12-py3" \
+     --container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \
+     --output="${THIS_DIR}/logs/%x_%j_${DATETIME}.log" sh -c "${CMD}"
--- a/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/run_figure_11.sh
+++ b/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/run_figure_11.sh
+#!/bin/bash
+# Launcher for one configuration of the Figure 11 experiment.
+# Sets the per-run variables, then sources CONFIG.sh (builds MEGATRON_PARAMS)
+# and SBATCH.sh (submits the job) from the current working directory.
+# ================================
+# Choose the case to run.
+# ================================
+# Pipeline-parallel size options = [1, 2, 4, 8].
+PP=1
+# Batch size (global batch size) options = [8, 128].
+GBS=8
+# Set pipeline-parallel size options.
+# Both the layer count and the node count are derived from PP.
+NLS=$((3*PP))
+NNODES=${PP}
+# Other params.
+TP=8
+MBS=1
+HS=20480
+NAH=128
+DDP=local
+MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
+# Name of the job.
+export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS}
+# Import the configs.
+# Quoted so that a working directory containing spaces still resolves.
+. "$(pwd)/CONFIG.sh"
+# Submit the job.
+. "$(pwd)/SBATCH.sh"
+exit 0
--- a/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/run_figure_12.sh
+++ b/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/run_figure_12.sh
+#!/bin/bash
+# Launcher for one configuration of the Figure 12 experiment.
+# Sets the per-run variables, then sources CONFIG.sh (builds MEGATRON_PARAMS)
+# and SBATCH.sh (submits the job) from the current working directory.
+# ================================
+# Choose the case to run.
+# ================================
+# Interleaved schedule options = [YES, NO].
+INTERLEAVED=YES
+# Batch size (global batch size) options = [12, 24, 36, ..., 60].
+GBS=12
+# Set interleaved schedule options.
+# The only difference between the two cases is the virtual-pipeline option.
+if [[ "${INTERLEAVED}" == "YES" ]]; then
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
+elif [[ "${INTERLEAVED}" == "NO" ]]; then
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
+else
+    echo "Invalid configuration" >&2
+    exit 1
+fi
+# Other params.
+TP=8
+PP=12
+MBS=1
+NLS=96
+HS=12288
+NAH=96
+DDP=local
+NNODES=12
+# Name of the job.
+export JOB_NAME=results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS}
+# Import the configs.
+# Quoted so that a working directory containing spaces still resolves.
+. "$(pwd)/CONFIG.sh"
+# Submit the job.
+. "$(pwd)/SBATCH.sh"
+exit 0
--- a/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/run_figure_13.sh
+++ b/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/run_figure_13.sh
+#!/bin/bash
+# Launcher for one configuration of the Figure 13 experiment.
+# Sets the per-run variables, then sources CONFIG.sh (builds MEGATRON_PARAMS)
+# and SBATCH.sh (submits the job) from the current working directory.
+# ================================
+# Choose the case to run.
+# ================================
+# Pipeline-parallel size options = [2, 4, 8, 16, 32].
+PP=2
+# Batch size (global batch size) options = [32, 128].
+GBS=32
+# Set pipeline-parallel and tensor-parallel size options.
+# TP is derived so that PP * TP is always 64.
+TP=$((64/PP))
+# Other params.
+MBS=1
+NLS=32
+HS=20480
+NAH=128
+DDP=local
+MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
+NNODES=8
+# Name of the job.
+export JOB_NAME=results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS}
+# Import the configs.
+# Quoted so that a working directory containing spaces still resolves.
+. "$(pwd)/CONFIG.sh"
+# Submit the job.
+. "$(pwd)/SBATCH.sh"
+exit 0
--- a/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/run_figure_14.sh
+++ b/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/run_figure_14.sh
+#!/bin/bash
+# Launcher for one configuration of the Figure 14 experiment.
+# Sets the per-run variables, then sources CONFIG.sh (builds MEGATRON_PARAMS)
+# and SBATCH.sh (submits the job) from the current working directory.
+# ================================
+# Choose the case to run.
+# ================================
+# Pipeline-parallel size options = [2, 4, 8, 16, 32].
+PP=2
+# Batch size (global batch size) options = [32, 512].
+GBS=32
+# Set pipeline-parallel and data-parallel size options.
+# DP is used in this script only to build the job name.
+DP=$((64/PP))
+# Other params.
+TP=1
+MBS=1
+NLS=32
+HS=3840
+NAH=32
+DDP=local
+MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
+NNODES=8
+# Name of the job.
+export JOB_NAME=results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS}
+# Import the configs.
+# Quoted so that a working directory containing spaces still resolves.
+. "$(pwd)/CONFIG.sh"
+# Submit the job.
+. "$(pwd)/SBATCH.sh"
+exit 0
--- a/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/run_figure_15.sh
+++ b/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/run_figure_15.sh
+#!/bin/bash
+# Launcher for one configuration of the Figure 15 experiment.
+# Sets the per-run variables, then sources CONFIG.sh (builds MEGATRON_PARAMS)
+# and SBATCH.sh (submits the job) from the current working directory.
+# ================================
+# Choose the case to run.
+# ================================
+# Tensor-parallel size options = [2, 4, 8, 16, 32].
+TP=2
+# Batch size (global batch size) options = [32, 128, 512].
+GBS=32
+# Set tensor-parallel and data-parallel size options.
+# DP is used in this script only to build the job name.
+DP=$((64/TP))
+# Other params.
+PP=1
+MBS=1
+NLS=32
+HS=3840
+NAH=32
+DDP=local
+MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
+NNODES=8
+# Name of the job.
+export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS}
+# Import the configs.
+# Quoted so that a working directory containing spaces still resolves.
+. "$(pwd)/CONFIG.sh"
+# Submit the job.
+. "$(pwd)/SBATCH.sh"
+exit 0
--- a/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/run_figure_16.sh
+++ b/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/run_figure_16.sh
+#!/bin/bash
+# Launcher for one configuration of the Figure 16 experiment.
+# Sets the per-run variables, then sources CONFIG.sh (builds MEGATRON_PARAMS)
+# and SBATCH.sh (submits the job) from the current working directory.
+# ================================
+# Choose the case to run.
+# ================================
+# Microbatch size options = [1, 2, 4, 8].
+MBS=1
+# Batch size (global batch size) options = [128, 512].
+GBS=128
+# Other params.
+TP=8
+PP=8
+NLS=32
+HS=15360
+NAH=128
+DDP=local
+MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
+NNODES=8
+# Name of the job.
+export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS}
+# Import the configs.
+# Quoted so that a working directory containing spaces still resolves.
+. "$(pwd)/CONFIG.sh"
+# Submit the job.
+. "$(pwd)/SBATCH.sh"
+exit 0
--- a/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/run_figure_17.sh
+++ b/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/run_figure_17.sh
+#!/bin/bash
+# Launcher for one configuration of the Figure 17 experiment.
+# Sets the per-run variables, then sources CONFIG.sh (builds MEGATRON_PARAMS)
+# and SBATCH.sh (submits the job) from the current working directory.
+# ================================
+# Choose the case to run.
+# ================================
+# Activation recomputation options = [YES, NO].
+ACTIVATION_RECOMPUTATION=YES
+# Batch size (global batch size) options = [1, 2, 4, ..., 256].
+GBS=1
+# Set activation recomputation.
+# "NO" simply passes no extra Megatron options.
+if [[ "${ACTIVATION_RECOMPUTATION}" == "YES" ]]; then
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
+elif [[ "${ACTIVATION_RECOMPUTATION}" == "NO" ]]; then
+    MEGATRON_EXTRA_PARAMS=""
+else
+    echo "Invalid configuration" >&2
+    exit 1
+fi
+# Other params.
+TP=8
+PP=16
+MBS=1
+NLS=80
+HS=12288
+NAH=96
+DDP=local
+NNODES=16
+# Name of the job.
+export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS}
+# Import the configs.
+# Quoted so that a working directory containing spaces still resolves.
+. "$(pwd)/CONFIG.sh"
+# Submit the job.
+. "$(pwd)/SBATCH.sh"
+exit 0
--- a/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/run_figure_18.sh
+++ b/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/run_figure_18.sh
+#!/bin/bash
+# Launcher for one configuration of the Figure 18 experiment.
+# Sets the per-run variables, then sources CONFIG.sh (builds MEGATRON_PARAMS)
+# and SBATCH.sh (submits the job) from the current working directory.
+# ================================
+# Choose the case to run.
+# ================================
+# Scatter-gather communication optimization options = [YES, NO].
+SCATTER_GATHER=YES
+# Batch size (global batch size) options = [12, 24, 36, ..., 60].
+GBS=12
+# Set scatter-gather communication optimization options.
+# "NO" adds the flag that switches the optimization off; the rest is identical.
+if [[ "${SCATTER_GATHER}" == "YES" ]]; then
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
+elif [[ "${SCATTER_GATHER}" == "NO" ]]; then
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline "
+else
+    echo "Invalid configuration" >&2
+    exit 1
+fi
+# Other params.
+TP=8
+PP=12
+MBS=1
+NLS=96
+HS=12288
+NAH=96
+DDP=local
+NNODES=12
+# Name of the job.
+export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS}
+# Import the configs.
+# Quoted so that a working directory containing spaces still resolves.
+. "$(pwd)/CONFIG.sh"
+# Submit the job.
+. "$(pwd)/SBATCH.sh"
+exit 0
--- a/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/run_table_1.sh
+++ b/PAI-Megatron-LM-240718/examples/academic_paper_scripts/sc21/run_table_1.sh
+#!/bin/bash
+# ================================
+# Choose the case to run.
+# ================================
+# model size options = [1.7B, 3.6B, 7.5B, 18B, 39B, 76B, 145B, 310B, 530B, 1T]
+MODEL_SIZE=1.7B
+if [ ${MODEL_SIZE} == "1.7B" ]; then
+    TP=1
+    PP=1
+    MBS=16
+    GBS=512
+    NLS=24
+    HS=2304
+    NAH=24
+    DDP=torch
+    NNODES=4
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
+elif [ ${MODEL_SIZE} == "3.6B" ]; then
+    TP=2
+    PP=1
+    MBS=16
+    GBS=512
+    NLS=30
+    HS=3072
+    NAH=32
+    DDP=torch
+    NNODES=8
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
+elif [ ${MODEL_SIZE} == "7.5B" ]; then
+    TP=4
+    PP=1
+    MBS=16
+    GBS=512
+    NLS=36
+    HS=4096
+    NAH=32
+    DDP=torch
+    NNODES=16
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
+elif [ ${MODEL_SIZE} == "18B" ]; then
+    TP=8
+    PP=1
+    MBS=8
+    GBS=1024
+    NLS=40
+    HS=6144
+    NAH=48
+    DDP=torch
+    NNODES=32
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
+elif [ ${MODEL_SIZE} == "39B" ]; then
+    TP=8
+    PP=2
+    MBS=4
+    GBS=1536
+    NLS=48
+    HS=8192
+    NAH=64
+    DDP=local
+    NNODES=64
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
+elif [ ${MODEL_SIZE} == "76B" ]; then
+    TP=8
+    PP=4
+    MBS=2
+    GBS=1792
+    NLS=60
+    HS=10240
+    NAH=80
+    DDP=local
+    NNODES=128
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5"
+elif [ ${MODEL_SIZE} == "145B" ]; then
+    TP=8
+    PP=8
+    MBS=2
+    GBS=2304
+    NLS=80
+    HS=12288
+    NAH=96
+    DDP=local
+    NNODES=192
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5 "
+elif [ ${MODEL_SIZE} == "310B" ]; then
+    TP=8
+    PP=16
+    MBS=1
+    GBS=2160
+    NLS=96
+    HS=16384
+    NAH=128
+    DDP=local
+    NNODES=240
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 3 "
+elif [ ${MODEL_SIZE} == "530B" ]; then
+    TP=8
+    PP=35
+    MBS=1
+    GBS=2520
+    NLS=105
+    HS=20480
+    NAH=128
+    DDP=local
+    NNODES=315
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 1 "
+elif [ ${MODEL_SIZE} == "1T" ]; then
+    TP=8
+    PP=64
+    MBS=1
+    GBS=3072
+    NLS=128
+    HS=25600
+    NAH=160
+    DDP=local
+    NNODES=384
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
+else
+    echo "Invalid configuration"
+    exit 1
+fi
+# Name of the job
+export JOB_NAME=results_table_1_model_size_${MODEL_SIZE}
+# Import the configs.
+. `pwd`/CONFIG.sh
+# Submit the job.
+. `pwd`/SBATCH.sh
+exit 0
--- a/PAI-Megatron-LM-240718/examples/bert/README.md
+++ b/PAI-Megatron-LM-240718/examples/bert/README.md
+# BERT MODEL
+## Table of contents
+- [1. Training Setup](#1-training-setup)
+- [2. Configurations](#2-configurations)
+## 1. Training setup
+<a id="markdown-training-setup" name="training-setup"></a>
+To run the model using a docker container run it as follows
+```
+PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
+CHECKPOINT_PATH="" #<Specify path>
+TENSORBOARD_LOGS_PATH="" #<Specify path>
+VOCAB_FILE="" #<Specify path to file>/bert-vocab.txt
+DATA_PATH="" #<Specify path and file prefix>_text_document
+docker run \
+  --gpus=all \
+  --ipc=host \
+  --workdir /workspace/megatron-lm \
+  -v /path/to/data:/path/to/data \
+  -v /path/to/megatron-lm:/workspace/megatron-lm \
+  $PYTORCH_IMAGE \
+  bash examples/bert/train_bert_340m_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $DATA_PATH
+```
+NOTE: Depending on the environment you are running it in, the above command might look slightly different.
+## 2. Configurations
+<a id="markdown-configurations" name="configurations"></a>
+The example in this folder shows you how to run the 340M (BERT-Large) model. There are other configs you could run as well:
+### 4B
+```
+       --num-layers 48 \
+       --hidden-size 2560 \
+       --num-attention-heads 32 \
+       --tensor-model-parallel-size 1 \
+       --pipeline-model-parallel-size 1 \
+```
+### 20B
+```
+       --num-layers 48 \
+       --hidden-size 6144 \
+       --num-attention-heads 96 \
+       --tensor-model-parallel-size 4 \
+       --pipeline-model-parallel-size 4 \
+```
\ No newline at end of file
--- a/PAI-Megatron-LM-240718/examples/bert/train_bert_340m_distributed.sh
+++ b/PAI-Megatron-LM-240718/examples/bert/train_bert_340m_distributed.sh
+#!/bin/bash
+# Runs the "340M" parameter model (Bert - Large)
+#
+# Usage:
+#   train_bert_340m_distributed.sh CHECKPOINT_PATH TENSORBOARD_LOGS_PATH \
+#       VOCAB_FILE DATA_PATH
+#
+# Launches pretrain_bert.py through torchrun from the current directory.
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NUM_NODES=1
+NODE_RANK=0
+# Not referenced below; kept for anyone who extends this script.
+WORLD_SIZE=$((GPUS_PER_NODE * NUM_NODES))
+CHECKPOINT_PATH=$1 #<Specify path>
+TENSORBOARD_LOGS_PATH=$2 #<Specify path>
+VOCAB_FILE=$3 #<Specify path to file>/bert-vocab.txt
+DATA_PATH=$4 #<Specify path and file prefix>_text_document
+# Every expansion is quoted so paths containing spaces stay one argument.
+DISTRIBUTED_ARGS=(
+    --nproc_per_node "$GPUS_PER_NODE"
+    --nnodes "$NUM_NODES"
+    # Passed explicitly so that editing NODE_RANK above takes effect on
+    # multi-node runs.
+    --node_rank "$NODE_RANK"
+    --master_addr "$MASTER_ADDR"
+    --master_port "$MASTER_PORT"
+)
+BERT_MODEL_ARGS=(
+    --num-layers 24
+    --hidden-size 1024
+    --num-attention-heads 16
+    --seq-length 512
+    --max-position-embeddings 512
+)
+# --weight-decay and --clip-grad each appear once; the repeated entries that
+# carried the same values have been dropped.
+TRAINING_ARGS=(
+    --micro-batch-size 4
+    --global-batch-size 32
+    --train-iters 1000000
+    --weight-decay 1e-2
+    --clip-grad 1.0
+    --fp16
+    --lr 0.0001
+    --lr-decay-iters 990000
+    --lr-decay-style linear
+    --min-lr 1.0e-5
+    --lr-warmup-fraction .01
+)
+# NOTE(review): 8 x 16 = 128 model-parallel ranks, but the defaults above
+# start only GPUS_PER_NODE * NUM_NODES = 8 processes. Confirm the intended
+# sizes (or raise NUM_NODES) before running.
+MODEL_PARALLEL_ARGS=(
+    --tensor-model-parallel-size 8
+    --pipeline-model-parallel-size 16
+)
+DATA_ARGS=(
+    --data-path "$DATA_PATH"
+    --vocab-file "$VOCAB_FILE"
+    --split 949,50,1
+)
+EVAL_AND_LOGGING_ARGS=(
+    --log-interval 100
+    --save-interval 10000
+    --eval-interval 1000
+    --save "$CHECKPOINT_PATH"
+    --load "$CHECKPOINT_PATH"
+    --eval-iters 10
+    --tensorboard-dir "$TENSORBOARD_LOGS_PATH"
+)
+torchrun "${DISTRIBUTED_ARGS[@]}" pretrain_bert.py \
+    "${BERT_MODEL_ARGS[@]}" \
+    "${TRAINING_ARGS[@]}" \
+    "${MODEL_PARALLEL_ARGS[@]}" \
+    "${DATA_ARGS[@]}" \
+    "${EVAL_AND_LOGGING_ARGS[@]}"
--- a/PAI-Megatron-LM-240718/examples/gpt3/README.md
+++ b/PAI-Megatron-LM-240718/examples/gpt3/README.md
+# GPT3 MODEL
+## Table of contents
+- [1. Training Setup](#1-training-setup)
+- [2. Configurations](#2-configurations)
+- [3. Training Results](#3-training-results)
+## 1. Training setup
+<a id="markdown-training-setup" name="training-setup"></a>
+To run the model using a docker container run it as follows
+```
+PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
+CHECKPOINT_PATH="" #<Specify path>
+TENSORBOARD_LOGS_PATH="" #<Specify path>
+VOCAB_FILE="" #<Specify path to file>/gpt2-vocab.json
+MERGE_FILE="" #<Specify path to file>/gpt2-merges.txt
+DATA_PATH="" #<Specify path and file prefix>_text_document
+docker run \
+  --gpus=all \
+  --ipc=host \
+  --workdir /workspace/megatron-lm \
+  -v /path/to/data:/path/to/data \
+  -v /path/to/megatron-lm:/workspace/megatron-lm \
+  $PYTORCH_IMAGE \
+  bash examples/gpt3/train_gpt3_175b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $MERGE_FILE $DATA_PATH
+```
+NOTE: Depending on the environment you are running it in, the above command might look slightly different.
+## 2. Configurations
+<a id="markdown-configurations" name="configurations"></a>
+The example in this folder shows you how to run the 175B model. There are other configs you could run as well:
+### 345M
+```
+       --num-layers 12 \
+       --hidden-size 512 \
+       --num-attention-heads 8 \
+       --seq-length 1024 \
+       --tensor-model-parallel-size 1 \
+       --pipeline-model-parallel-size 1 \
+```
+### 857M
+```
+       --num-layers 24 \
+       --hidden-size 1024 \
+       --num-attention-heads 16 \
+       --seq-length 2048 \
+       --tensor-model-parallel-size 1 \
+       --pipeline-model-parallel-size 1 \
+```
--- a/PAI-Megatron-LM-240718/examples/gpt3/gpt_config.yaml
+++ b/PAI-Megatron-LM-240718/examples/gpt3/gpt_config.yaml
+# WARNING: Yaml configs is currently an experimental feature
+# Settings grouped under `language_model`: architecture, initialization,
+# precision, fusion, recomputation, fp8 and MoE options.
+language_model:
+  # model architecture
+  num_layers: 24
+  hidden_size: 1024
+  num_attention_heads: 16
+  num_query_groups: null
+  ffn_hidden_size: null
+  kv_channels: null
+  hidden_dropout: 0.0
+  attention_dropout: 0.0
+  fp32_residual_connection: False
+  apply_residual_connection_post_layernorm: False
+  layernorm_epsilon: 1.e-5
+  layernorm_zero_centered_gamma: True
+  add_bias_linear: False
+  bias_activation_fusion: False
+  add_qkv_bias: False
+  # NOTE(review): gated_linear_unit is False while activation_func is swiglu
+  # (and the top-level `swiglu` key is True) - confirm this combination is
+  # intended.
+  gated_linear_unit: False
+  activation_func: swiglu
+  num_moe_experts: null
+  rotary_interleaved: False
+  window_size: null
+  # initialization
+  init_method: null
+  init_method_std: 0.02
+  output_layer_init_method: null
+  # mixed-precision
+  apply_query_key_layer_scaling: False
+  attention_softmax_in_fp32: False
+  # fusion
+  bias_swiglu_fusion: True
+  masked_softmax_fusion: True
+  persist_layer_norm: False
+  memory_efficient_layer_norm: False
+  bias_dropout_fusion: True
+  apply_rope_fusion: True
+  # activation recomputation
+  # All four keys are null in this example.
+  recompute_granularity: null
+  recompute_method: null
+  recompute_num_layers: null
+  distribute_saved_activations: null
+  # fp8 related
+  fp8: null
+  fp8_margin: 0
+  fp8_interval: 1
+  fp8_amax_history_len: 1
+  fp8_amax_compute_algo: "most_recent"
+  fp8_wgrad: True
+  # miscellaneous
+  clone_scatter_output_in_embedding: True
+  normalization: "LayerNorm"  # alt value supported by TE: "RMSNorm"
+  # MoE related
+  # presumably unused while num_moe_experts above is null - TODO confirm
+  moe_router_load_balancing_type: "aux_loss"
+  moe_router_topk: 2
+  moe_grouped_gemm: False
+  moe_aux_loss_coeff: 0  # 1e-2 would be a good start value for load balance loss.
+  moe_z_loss_coeff: null  # 1e-3 would be a good start value for z-loss
+  moe_input_jitter_eps: null
+  moe_token_dropping: False
+# Settings grouped under `model_parallel`: parallel sizes, precision,
+# communication options, pipeline options and CPU offloading.
+model_parallel:
+  # Model parallelism
+  # All parallel sizes are 1 in this example.
+  tensor_model_parallel_size: 1
+  context_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  virtual_pipeline_model_parallel_size: null
+  sequence_parallel: True
+  expert_model_parallel_size: 1
+  # Initialization
+  perform_initialization: True
+  use_cpu_initialization: null
+  # Training
+  # bf16 is the precision switched on here; fp16 is off.
+  fp16: False
+  bf16: True
+  params_dtype: null # Set from above arguments for core
+  timers: null
+  # Optimizations
+  gradient_accumulation_fusion: True
+  async_tensor_model_parallel_allreduce: True
+  tp_comm_overlap: False
+  # Debug Options
+  # NOTE(review): these tp_comm_* keys are all True while tp_comm_overlap
+  # above is False - presumably they only matter when it is enabled; confirm.
+  tp_comm_split_ag: True
+  tp_comm_atomic_ag: True
+  tp_comm_split_rs: True
+  tp_comm_atomic_rs: True
+  tp_comm_bulk_wgrad: True
+  tp_comm_bulk_dgrad: True
+  # Parallelism
+  finalize_model_grads_func: null
+  # Pipeline Parallel
+  pipeline_dtype: null
+  grad_scale_func: null
+  enable_autocast: False
+  autocast_dtype: null
+  variable_seq_lengths: False
+  num_microbatches_with_partial_activation_checkpoints: null
+  overlap_p2p_comm: False
+  batch_p2p_comm: True
+  batch_p2p_sync: True
+  use_ring_exchange_p2p: False
+  deallocate_pipeline_outputs: False
+  no_sync_func: null
+  grad_sync_func: null
+  param_sync_func: null
+  pipeline_model_parallel_split_rank: null
+  # CPU Offloading
+  # Offloading is disabled (cpu_offloading is False, zero layers).
+  cpu_offloading: False
+  cpu_offloading_num_layers: 0
+  _cpu_offloading_context: null
+  cpu_offloading_weights: False
+  cpu_offloading_activations: True
+  # Timing
+  barrier_with_L1_time: True
+# training:
+# The `training:` header above is commented out, so every key from here on is
+# a top-level key of the document rather than a child of a `training` map.
+use_legacy_models: False
+spec: null
+micro_batch_size: 2
+global_batch_size: 128
+rampup_batch_size: [32, 32, 65324160] 
+check_for_nan_in_loss_and_grad: True
+num_layers_per_virtual_pipeline_stage: null
+encoder_num_layers: null
+decoder_num_layers: null
+rotary_seq_len_interpolation_factor: null
+add_position_embedding: False
+make_vocab_size_divisible_by: 128
+group_query_attention: False
+exit_signal_handler: False
+exit_duration_in_mins: null
+exit_interval: null
+untie_embeddings_and_output_weights: True
+position_embedding_type: rope
+rotary_percent: 0.5
+openai_gelu: False
+squared_relu: False
+swiglu: True
+onnx_safe: null
+bert_binary_head: True
+max_position_embeddings: 4096
+transformer_impl: local
+use_flash_attn: False
+seed: 1234
+data_parallel_random_init: False
+# Optimizer
+# The schedule is given in samples here: lr_decay_samples and
+# lr_warmup_samples are set while lr_decay_iters is null.
+optimizer: adam
+lr: 2.5e-4
+lr_decay_style: cosine
+lr_decay_iters: null
+lr_decay_samples: 255126953
+lr_warmup_fraction: null
+lr_warmup_iters: 0
+lr_warmup_samples: 81381
+lr_warmup_init: 0.0
+min_lr: 2.5e-5
+weight_decay: 0.1
+start_weight_decay: null
+end_weight_decay: null
+weight_decay_incr_style: constant
+clip_grad: 1.0
+adam_beta1: 0.9
+adam_beta2: 0.95
+adam_eps: 1.e-08
+sgd_momentum: 0.9
+override_opt_param_scheduler: False
+use_checkpoint_opt_param_scheduler: False
+# checkpointing arguments
+# `save` and `load` are both null in this example.
+save: null
+save_interval: 20000
+no_save_optim: null
+no_save_rng: null
+load: null
+no_load_optim: null
+no_load_rng: null
+finetune: False
+use_checkpoint_args: False
+exit_on_missing_checkpoint: False
+# loss arguments
+loss_scale: null
+initial_loss_scale: 4294967296
+min_loss_scale: 1.0
+loss_scale_window: 1000 
+hysteresis: 2
+accumulate_allreduce_grads_in_fp32: False
+fp16_lm_cross_entropy: False
+# distributed arguments
+distributed_backend: nccl
+distributed_timeout_minutes: 10
+overlap_grad_reduce: False
+delay_grad_reduce: True
+overlap_param_gather: False
+delay_param_gather: False
+scatter_gather_tensors_in_pipeline: True
+local_rank: null
+lazy_mpu_init: null
+empty_unused_memory_level: 0
+standalone_embedding_stage: False
+use_distributed_optimizer: False
+nccl_communicator_config_path: null
+# train_iters is null; the run length is given by train_samples further down.
+train_iters: null
+eval_iters: 32
+eval_interval: 2000
+skip_train: False
+adlr_autoresume: False
+adlr_autoresume_interval: 1000
+# garbage collection
+manual_gc: False
+manual_gc_interval: 0
+manual_gc_eval: True
+tp_comm_overlap_cfg: null
+#data
+# All dataset, vocab and tokenizer paths are null and must be supplied.
+data_path: null
+split: '99,1,0'
+train_data_path: null
+valid_data_path: null
+test_data_path: null
+data_cache_path: null
+mock_data: False
+vocab_size: null
+vocab_file: null
+merge_file: null
+vocab_extra_ids: 0
+seq_length: 4096
+encoder_seq_length: null
+decoder_seq_length: null
+retriever_seq_length: 256
+sample_rate: 1.0
+mask_prob: 0.15
+short_seq_prob: 0.1
+num_workers: 2
+tokenizer_type: GPTSentencePieceTokenizer
+tokenizer_model: null
+reset_position_ids: False
+reset_attention_mask: False
+eod_mask_loss: False
+train_samples: 268554688
+dataloader_type: null
+#profile:
+# Profiling is off (profile is False).
+profile: False
+profile_ranks: [0]
+profile_step_end: 12
+profile_step_start: 10
+#logging:
+# Top-level logging keys. Each key appears exactly once; a repeated
+# `log_learning_rate_to_tensorboard` entry was removed because duplicate
+# mapping keys are invalid YAML and are rejected or silently overwritten,
+# depending on the loader.
+log_params_norm: True
+log_num_zeros_in_grad: True
+log_throughput: False
+log_progress: False
+timing_log_level: 0
+timing_log_option: minmax
+tensorboard_log_interval: 1
+tensorboard_queue_size: 1000
+log_timers_to_tensorboard: False
+log_batch_size_to_tensorboard: False
+log_learning_rate_to_tensorboard: True
+log_validation_ppl_to_tensorboard: False
+log_memory_to_tensorboard: False
+log_world_size_to_tensorboard: False
+log_loss_scale_to_tensorboard: True
+wandb_project: ''
+wandb_exp_name: ''
+wandb_save_dir: ''
+enable_one_logger: True
+one_logger_project: megatron-lm
+one_logger_run_name: null
+log_interval: 100
+tensorboard_dir: null
--- a/PAI-Megatron-LM-240718/examples/gpt3/train_gpt3_175b_distributed.sh
+++ b/PAI-Megatron-LM-240718/examples/gpt3/train_gpt3_175b_distributed.sh
+#!/bin/bash
+# Runs the "175B" parameter model
+#
+# Usage:
+#   train_gpt3_175b_distributed.sh CHECKPOINT_PATH TENSORBOARD_LOGS_PATH \
+#       VOCAB_FILE MERGE_FILE DATA_PATH
+#
+# Launches pretrain_gpt.py through torchrun from the current directory.
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NUM_NODES=1
+NODE_RANK=0
+# Not referenced below; kept for anyone who extends this script.
+WORLD_SIZE=$((GPUS_PER_NODE * NUM_NODES))
+CHECKPOINT_PATH=$1 #<Specify path>
+TENSORBOARD_LOGS_PATH=$2 #<Specify path>
+VOCAB_FILE=$3 #<Specify path to file>/gpt2-vocab.json
+MERGE_FILE=$4 #<Specify path to file>/gpt2-merges.txt
+DATA_PATH=$5 #<Specify path and file prefix>_text_document
+# Every expansion is quoted so paths containing spaces stay one argument.
+DISTRIBUTED_ARGS=(
+    --nproc_per_node "$GPUS_PER_NODE"
+    --nnodes "$NUM_NODES"
+    # Passed explicitly so that editing NODE_RANK above takes effect on
+    # multi-node runs.
+    --node_rank "$NODE_RANK"
+    --master_addr "$MASTER_ADDR"
+    --master_port "$MASTER_PORT"
+)
+GPT_MODEL_ARGS=(
+    --num-layers 96
+    --hidden-size 12288
+    --num-attention-heads 96
+    --seq-length 2048
+    --max-position-embeddings 2048
+)
+TRAINING_ARGS=(
+    --micro-batch-size 1
+    --global-batch-size 1536
+    --rampup-batch-size 16 16 5859375
+    --train-iters 500000
+    --weight-decay 0.1
+    --adam-beta1 0.9
+    --adam-beta2 0.95
+    --init-method-std 0.006
+    --clip-grad 1.0
+    --fp16
+    --lr 6.0e-5
+    --lr-decay-style cosine
+    --min-lr 6.0e-6
+    --lr-warmup-fraction .001
+    --lr-decay-iters 430000
+)
+# NOTE(review): 8 x 16 = 128 model-parallel ranks, but the defaults above
+# start only GPUS_PER_NODE * NUM_NODES = 8 processes. Raise NUM_NODES (see
+# "Change for multinode config") or adjust these sizes before running.
+MODEL_PARALLEL_ARGS=(
+    --tensor-model-parallel-size 8
+    --pipeline-model-parallel-size 16
+)
+DATA_ARGS=(
+    --data-path "$DATA_PATH"
+    --vocab-file "$VOCAB_FILE"
+    --merge-file "$MERGE_FILE"
+    --split 949,50,1
+)
+EVAL_AND_LOGGING_ARGS=(
+    --log-interval 100
+    --save-interval 10000
+    --eval-interval 1000
+    --save "$CHECKPOINT_PATH"
+    --load "$CHECKPOINT_PATH"
+    --eval-iters 10
+    --tensorboard-dir "$TENSORBOARD_LOGS_PATH"
+)
+torchrun "${DISTRIBUTED_ARGS[@]}" pretrain_gpt.py \
+    "${GPT_MODEL_ARGS[@]}" \
+    "${TRAINING_ARGS[@]}" \
+    "${MODEL_PARALLEL_ARGS[@]}" \
+    "${DATA_ARGS[@]}" \
+    "${EVAL_AND_LOGGING_ARGS[@]}"
--- a/PAI-Megatron-LM-240718/examples/inference/README.md
+++ b/PAI-Megatron-LM-240718/examples/inference/README.md
+### Megatron Core Inference Documentation
+This guide will walk you through how you can use megatron core for inference on your models. 
+### Contents
+- [Megatron Core Inference Documentation](#megatron-core-inference-documentation)
+- [Contents](#contents)
+  - [1. Quick Start](#1-quick-start)
+    - [1.1 Understanding The Code](#11-understanding-the-code)
+    - [1.2 Running The Code](#12-running-the-code)
+  - [2. Flow of Control In MCore Backend](#2-flow-of-control-in-mcore-backend)
+  - [3. Customizing The Inference Pipeline](#3-customizing-the-inference-pipeline)
+    - [3.1. Create Your Own Inference Backend](#31-create-your-own-inference-backend)
+    - [3.2. Create Your Own Text Generation Controller](#32-create-your-own-text-generation-controller)
+    - [3.3. Support Other Models](#33-support-other-models)
+    - [3.3. Modify Inference Parameters](#33-modify-inference-parameters)
+  - [4. Future work](#4-future-work)
+<br>
+#### 1. Quick Start
+This will walk you through the flow of running batch inference on a GPT model trained using megatron core. The file can be found at [simple_gpt_batch_inference.py](./gpt/simple_gpt_batch_inference.py)
+<br>
+##### 1.1 Understanding The Code
+***STEP 1 - We initialize model parallel and other default arguments***
+We can default micro batch size to be 1, since for TP models it is not used, and for PP models it is calculated during runtime. 
+```python
+    initialize_megatron(
+        args_defaults={'no_load_rng': True, 'no_load_optim': True, 'micro_batch_size': 1}
+    )
+```
+***STEP 2 - We load the model using the model_provider_function***
+NOTE: The model provider function in the script supports MCore and Legacy models. 
+```python
+    model = get_model(model_provider, wrap_with_ddp=False)
+    load_checkpoint(model, None, None)
+    model = model[0]
+```
+***STEP 3 - Choose an engine***
+One of the important elements of the generate function is an inference engine. In this example we will be choosing the [megatron core engine](../../megatron/core/inference/engines/mcore_engine.py) with a [simple text generation controller](../../megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py), the default engine. Support for other engines, such as TRTLLMEngine, is planned for the future.
+```python
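+    inference_wrapper_config = InferenceWrapperConfig(
+        hidden_size=args.hidden_size,
+        inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold,
+        fp32_residual_connection=args.fp32_residual_connection,
+        params_dtype=args.params_dtype,
+        padded_vocab_size=args.padded_vocab_size
+    )
+    inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config)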
+    text_generation_controller = SimpleTextGenerationController(
+        inference_wrapped_model=inference_wrapped_model, 
+        tokenizer=tokenizer
+    )
+    inference_engine = MCoreEngine(
+        text_generation_controller=text_generation_controller, max_batch_size=args.max_batch_size
+    )
+```
+***STEP 4 - Run the generate function and display results***
+We use default values for the [common inference params](../../megatron/core/inference/common_inference_params.py). Customize this if you want to change top_p, top_k, number of tokens to generate etc. 
+*Note that the result is returned as a list of [InferenceRequests](../../megatron/core/inference/inference_request.py)*
+```python
+    results: List[InferenceRequest] = inference_engine.generate(
+        prompts=args.prompts, common_inference_params=common_inference_params
+    )
+    if torch.distributed.get_rank() == 0:
+        for idx, result in enumerate(results):
+            print(f' ------------- RESULT FOR PROMPT {idx} --------------- ')
+            result = {
+                'id': result.request_id,
+                'input_prompt': result.prompt, 
+                'generated_text': result.generated_text,
+                'generated_tokens' : result.generated_tokens
+                }
+            print(result)
+```
+<br>
+##### 1.2 Running The Code
+An example run script is shown below. Change the tokenizer paths, inference params, and other settings for your model. 
+For a quick recap on inference params refer to [this blog](https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-temperature-parameters-ed6a31313910) 
+```
+#In a slurm cluster (You could also use docker)
+ACCOUNT=<account>
+MLM_PATH=/path/to/megatron-lm
+GPT_CKPT=/path/to/gpt/ckpt
+VOCAB_MERGE_FILE_PATH=/path/to/vocab/and/merge/file
+CONTAINER_IMAGE=nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.11
+srun --account $ACCOUNT \
+--job-name=$ACCOUNT:inference \
+--partition=batch \
+--time=01:00:00 \
+--container-image $CONTAINER_IMAGE \
+--container-mounts $MLM_PATH:/workspace/megatron-lm/,$GPT_CKPT:/workspace/mcore_gpt_ckpt,$VOCAB_MERGE_FILE_PATH:/workspace/tokenizer \
+--no-container-mount-home \
+--pty /bin/bash \
+# Inside the container run the following. 
+cd megatron-lm/
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+TOKENIZER_ARGS=(
+    --vocab-file /workspace/tokenizer/gpt2-vocab.json
+    --merge-file /workspace/tokenizer/gpt2-merges.txt
+    --tokenizer-type GPT2BPETokenizer
+)
+MODEL_ARGS=(
+    --use-checkpoint-args
+    --use-mcore-models
+    --load /workspace/mcore_gpt_ckpt
+)
+INFERENCE_SPECIFIC_ARGS=(
+    --attention-dropout 0.0
+    --hidden-dropout 0.0
+    --num-tokens-to-generate 20
+    --max-batch-size 4
+)
+torchrun --nproc-per-node=4 examples/inference/gpt/simple_gpt_batch_inference.py \
+    ${TOKENIZER_ARGS[@]} \
+    ${MODEL_ARGS[@]} \
+    ${INFERENCE_SPECIFIC_ARGS[@]} \
+    --prompts "prompt one " "sample prompt two" "sample prompt 3"
+NOTE: Other parameters which can be customized for inference are :-
+--temperature (Sampling temperature)
+--top_k (top_k sampling)
+--top_p (top_p sampling)
+--num-tokens-to-generate (Number of tokens to generate for each prompt)
+--inference-batch-times-seqlen-threshold (During inference, if batch-size times sequence-length is smaller than this threshold then we will not use pipelining, otherwise we will.)
+--use-dist-ckpt (If you are using dist checkpoint format for the model)
+--use-legacy-models (If you are using legacy gpt model instead of mcore gpt model)
+```
+<br>
+#### 2. Flow of Control In MCore Backend
+The following is what happens in the [simple_gpt_batch_inference.py](./gpt/simple_gpt_batch_inference.py).
+* We call the **generate()** function of [mcore_engine](../../megatron/core/inference/engines/mcore_engine.py) with all our input prompts.
+* The scheduler in the engine will add these prompts to the [active requests pool](../../megatron/core/inference/inference_request.py) until we hit the max batch size, and then it will put the rest in the waiting requests pool.
+* The engine will then run until all requests (waiting + active) are completed 
+    * The active requests are passed into **generate_all_output_tokens_static_batch()** of the text generation controller.
+    * This function calls **prep_model_for_inference()** of the [model_inference_wrappers](../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py), and then runs an auto regressive loop.
+    * In the auto regressive loop, the **get_batch_for_context_window()** method of the inference wrapper is called to get the required input, which is then passed into the **run_one_forward_step()** method; this calls the appropriate (PP, TP) model `.forward()` methods to get the output logits.
+    * The output logits are synchronized across all pipeline parallel ranks
+    * The text generation controller obtains the log probabilities and samples tokens based on the strategy defined in the common inference parameters.
+    * The sampled tokens are then appended to the input prompt tokens for the next iteration 
+    * The **update_generation_status()** method of the text generation controller checks which prompts have finished generating or hit a stop condition
+    * After the inference loop, the result is detokenized and stored as an attribute of the InferenceRequest. These requests are marked as completed. 
+    * The **update_requests_pool()** method of the scheduler moves completed requests into the completed request pool and waiting requests into the active request pool
+<br>
+#### 3. Customizing The Inference Pipeline
+The following guide will walk you through how you can customize different parts of the inference pipeline. There are four levels at which you can customize the pipeline.
+* **Inference engine** - Highest level of customization. Currently we support the MCore Engine. Change this to add a new engine.
+* **Text generation controller** - Extend this to customize tokenization, detokenization, or implement a new sampling strategy.
+* **Inference Wrapped Model** - Change this to support a new model.
+* **Modify Inference Parameters** - Change this to update top_p, top_k, number of tokens to be generated, temperature, or other sampling parameters.
+<br>
+##### 3.1. Create Your Own Inference Backend 
+This is the highest level of customization. The [abstract_engine.py](./../../megatron/core/inference/engines/abstract_engine.py) file has a generate method that can be extended to support a new backend.
+```python
+class AbstractEngine(ABC):
+    @staticmethod
+    def generate(self) -> dict:
+        """The abstract backend's generate function. 
+        To define your own backend, make sure you implement this and return the outputs as a dictionary.
+        """
+```
+<br>
+##### 3.2. Create Your Own Text Generation Controller
+If you want to use the megatron core backend but would like to override the tokenization, text generation, or detokenization, extend the [simple_text_generation_controller.py](../../megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py). The class has the following methods:
+``` python
+class SimpleTextGenerationController:
+    def tokenize_prompt(self, prompt: str) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Utility to tokenize the input prompts"""
+    def sample_from_logits(
+        self,
+        last_token_logits: torch.Tensor,
+        common_inference_params: CommonInferenceParams,
+        vocab_size: int,
+    ) -> torch.Tensor:
+        """Samples the logits to generate outputs
+        Given the logits of the last token, this function samples it according to the parameters defined in common_inference_params and returns the samples
+        """
+    def update_generation_status(
+        self,
+        updated_prompts_tokens: torch.Tensor,
+        generation_started: torch.Tensor,
+        current_context_end_position: int,
+        is_generation_done_tensor: torch.Tensor,
+        generated_sequence_lengths: torch.Tensor,
+    ) -> torch.Tensor:
+        """Function to check which prompts have reached an end condition
+        We check which prompts have reached an end condition and set the corresponding flags of the is_generation_done_tensor to True . The generated sequence lengths increases as we keep generating, until that prompts hits an eod condition. The generation started status tensor helps us determine which prompts have started generating
+        """
+    def generate_all_output_tokens_static_batch(
+        self, active_requests: OrderedDict[int, InferenceRequest],
+    ) -> OrderedDict[int, InferenceRequest]:
+        """Utility to generate all the output tokens and probabilities for the prompts .
+        This utility generates the output tokens for a static batch. It runs the forward steps till all prompts complete generation, updates the status of these requests to completed, adds the generated result and returns these requests
+        """
+    def detokenize_generations(self, prompt_tokens_with_generated_tokens: torch.Tensor) -> str:
+        """Detokenize the output generations"""
+```
+<br>
+##### 3.3. Support Other Models
+In order to support other models please extend the [abstract_model_inference_wrapper.py](./../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) file. The abstract wrapper already supports the following :
+* Forward method which automatically calls the appropriate forward method (PP or TP etc) depending on model parallel settings
+* Initializes the model and puts it in eval mode
+* Obtains the input parameters (batch size, max seq length) and has an instance of the input 
+The main methods to change for your model might be the following: 
+```python
+class AbstractModelInferenceWrapper:
+    def prep_model_for_inference(self, prompts_tokens: torch.Tensor):
+        """A utility function for preparing model for inference
+        The function gets called once before the auto regressive inference loop. It puts the model in eval mode , and gets some model and inference data parameters. Extend this to build position ids ,attention mask etc, so that required slices can be extracted during the forward pass
+        """
+    @abc.abstractclassmethod
+    def get_batch_for_context_window(self) -> List:
+        """Returns the input data for inference 
+        This function gets called iteratively in the inference loop. It can be used to extract relevant input from the prompt tokens, attention mask etc. required for each step in inference.
+        """
+```
+Refer to [gpt_inference_wrapper.py](../../megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py) for an example of extending this for GPTModel.
+<br>
+##### 3.3. Modify Inference Parameters
+We use  [common inference params](../../megatron/core/inference/common_inference_params.py) for text generation. Customize this if you want to change top_p, top_k, number of tokens to generate etc. If you want to add other attributes that you would use in the inference loop, you can do that as shown below
+```
+from megatron.core.inference.common_inference_params import CommonInferenceParams
+c = CommonInferenceParams(temperature=0.5)
+c.add_attributes({'min_length':4, 'eod_id':153})
+```
+<br>
+#### 4. Future work
+The following features are planned for future releases.
+* Dynamic batching 
+* Paged Attention
+* TRTLLM Engine support
+* Support for Multimodal model inference
\ No newline at end of file
--- a/PAI-Megatron-LM-240718/examples/inference/gpt/simple_gpt_batch_inference.py
+++ b/PAI-Megatron-LM-240718/examples/inference/gpt/simple_gpt_batch_inference.py
+import os
+from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig
+from pretrain_gpt import model_provider
+import torch
+import sys
+from argparse import Namespace
+from megatron.core.inference.engines.abstract_engine import AbstractEngine
+from megatron.core.inference.engines.mcore_engine import MCoreEngine
+from megatron.core.inference.common_inference_params import CommonInferenceParams
+from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import GPTInferenceWrapper
+from megatron.core.inference.inference_request import InferenceRequest
+from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import SimpleTextGenerationController
+from megatron.core.transformer.module import MegatronModule
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
+                                             os.path.pardir, os.path.pardir)))
+from megatron.training import get_args
+from megatron.training import get_tokenizer
+from megatron.training.checkpointing import load_checkpoint
+from megatron.core import mpu
+from megatron.training.initialize import initialize_megatron
+from megatron.training import get_model
+from typing import List
+def add_text_generate_args(parser):
+    """Register the text-generation command-line options on ``parser``.
+    Creates an argument group titled 'text generation' that holds the sampling
+    options, the prompt list and the batch-size limit defined below.
+    Args:
+        parser: Object exposing ``add_argument_group`` (presumably an
+            ``argparse.ArgumentParser`` -- confirm against the caller).
+    Returns:
+        The same ``parser`` object that was passed in.
+    """
+    group = parser.add_argument_group(title='text generation')
+    group.add_argument("--temperature", type=float, default=1.0,
+                       help='Sampling temperature.')
+    # NOTE: --top_k and --top_p are spelled with underscores, unlike the
+    # hyphenated flags registered after them.
+    group.add_argument("--top_k", type=int, default=1,
+                       help='Top k sampling.')
+    group.add_argument("--top_p", type=float, default=0.0,
+                       help='Top p sampling.')
+    group.add_argument("--return-log-probs", action='store_true', default=False,
+                       help='Return the log probabilities of the final output tokens')
+    group.add_argument("--num-tokens-to-generate", type=int, default=30,
+                       help='Number of tokens to generate for each prompt')
+    # Accepts one or more prompts (nargs='+'). No default is given and the flag
+    # is not marked required, so under argparse semantics the value is None
+    # when the flag is omitted.
+    group.add_argument("--prompts", metavar='N', type=str, nargs='+',
+                       help='Input prompts with each prompt within quotes and seperated by space')
+    group.add_argument("--max-batch-size", type=int, default=1,
+                       help='Max number of prompts to process at once')
+    return parser
+def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngine:
+    """Utility to get the relevant backend for running inference
+    As written, this function always builds and returns an MCoreEngine. The TRT LLM backend is not implemented yet, so no backend selection takes place here.
+    Args:
+        args (Namespace): The user arguments parsed from command line
+        model (MegatronModule): The megatron model.
+    Returns:
+        AbstractEngine: The chosen backend (an MCoreEngine driving a SimpleTextGenerationController)
+    """
+    tokenizer = get_tokenizer()
+    # Copy only the fields the inference wrapper is given from the parsed arguments.
+    inference_wrapper_config = InferenceWrapperConfig(
+        hidden_size=args.hidden_size,
+        inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold,
+        fp32_residual_connection=args.fp32_residual_connection,
+        params_dtype=args.params_dtype,
+        padded_vocab_size=args.padded_vocab_size
+    )
+    # Layering: model -> GPTInferenceWrapper -> SimpleTextGenerationController -> MCoreEngine.
+    inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config)
+    text_generation_controller = SimpleTextGenerationController(inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer)
+    return MCoreEngine(text_generation_controller=text_generation_controller, max_batch_size=args.max_batch_size)
+def main():
+    """Main program.
+    Initializes Megatron, builds the model and loads its checkpoint, creates the
+    inference engine, runs generation on ``args.prompts`` and prints one result
+    dictionary per prompt on distributed rank 0.
+    """
+    # Note: The default args passed here can be overwritten by using appropriate params (check arguments.py file)
+    # Micro batch size is not needed to be set by user. (It is calculated based on inference-batch-times-seqlen-threshold argument)
+    initialize_megatron(extra_args_provider=add_text_generate_args,
+                        args_defaults={'no_load_rng': True,
+                                       'no_load_optim': True,
+                                       'micro_batch_size': 1, 
+                                       'exit_on_missing_checkpoint': True})
+    # Set up model and load checkpoint
+    model = get_model(model_provider, wrap_with_ddp=False)
+    # The two None arguments are presumably the optimizer and scheduler slots,
+    # which are not needed for inference -- TODO confirm against load_checkpoint.
+    load_checkpoint(model, None, None)
+    # Only the first element of the value returned by get_model is used.
+    model = model[0]
+    args = get_args()
+    inference_engine = get_inference_engine(args, model)
+    # Sampling settings are taken directly from the command-line arguments.
+    common_inference_params = CommonInferenceParams(
+        temperature=args.temperature, 
+        top_k=args.top_k, 
+        top_p=args.top_p, 
+        return_log_probs=args.return_log_probs, 
+        num_tokens_to_generate=args.num_tokens_to_generate)
+    results: List[InferenceRequest] = inference_engine.generate(
+        prompts=args.prompts, common_inference_params=common_inference_params
+    )
+    # Print from rank 0 only so each result appears once.
+    if torch.distributed.get_rank() == 0:
+        for idx, result in enumerate(results):
+            print(f' \n------------- RESULT FOR PROMPT {idx} --------------- ')
+            # Rebind `result` to a plain dict holding just the fields to display.
+            result = {
+                'id': result.request_id,
+                'input_prompt': result.prompt, 
+                'generated_text': result.generated_text,
+                'generated_tokens' : result.generated_tokens
+                }
+            print(result)
+# Run the batch-inference example only when this file is executed as a script.
+if __name__ == "__main__":
+    main()