#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Tensor-parallel size options = [2, 4, 8, 16, 32].
TP=2
# Batch size (global batch size) options = [32, 128, 512].
GBS=32
# Derive the data-parallel size from the tensor-parallel size (DP = 64 / TP).
DP=$((64/TP))
# Other params.
PP=1
MBS=1
NLS=32
HS=3840
NAH=32
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Microbatch size options = [1, 2, 4, 8].
MBS=1
# Batch size (global batch size) options = [128, 512].
GBS=128
# Other params.
TP=8
PP=8
NLS=32
HS=15360
NAH=128
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Activation recomputation options = [YES, NO].
ACTIVATION_RECOMPUTATION=YES
# Batch size (global batch size) options = [1, 2, 4, ..., 256].
GBS=1
# Set activation recomputation.
if [ ${ACTIVATION_RECOMPUTATION} == "YES" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${ACTIVATION_RECOMPUTATION} == "NO" ]; then
MEGATRON_EXTRA_PARAMS=""
else
echo "Invalid configuration"
exit 1
fi
# Other params.
TP=8
PP=16
MBS=1
NLS=80
HS=12288
NAH=96
DDP=local
NNODES=16
# Name of the job.
export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Scatter-gather communication optimization options = [YES, NO].
SCATTER_GATHER=YES
# Batch size (global batch size) options = [12, 24, 36, ..., 60].
GBS=12
# Set scatter-gather communication optimization options.
if [ ${SCATTER_GATHER} == "YES" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
elif [ ${SCATTER_GATHER} == "NO" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline "
else
echo "Invalid configuration"
exit 1
fi
# Other params.
TP=8
PP=12
MBS=1
NLS=96
HS=12288
NAH=96
DDP=local
NNODES=12
# Name of the job.
export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# model size options = [1.7B, 3.6B, 7.5B, 18B, 39B, 76B, 145B, 310B, 530B, 1T]
MODEL_SIZE=1.7B
if [ ${MODEL_SIZE} == "1.7B" ]; then
TP=1
PP=1
MBS=16
GBS=512
NLS=24
HS=2304
NAH=24
DDP=torch
NNODES=4
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "3.6B" ]; then
TP=2
PP=1
MBS=16
GBS=512
NLS=30
HS=3072
NAH=32
DDP=torch
NNODES=8
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "7.5B" ]; then
TP=4
PP=1
MBS=16
GBS=512
NLS=36
HS=4096
NAH=32
DDP=torch
NNODES=16
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "18B" ]; then
TP=8
PP=1
MBS=8
GBS=1024
NLS=40
HS=6144
NAH=48
DDP=torch
NNODES=32
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "39B" ]; then
TP=8
PP=2
MBS=4
GBS=1536
NLS=48
HS=8192
NAH=64
DDP=local
NNODES=64
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "76B" ]; then
TP=8
PP=4
MBS=2
GBS=1792
NLS=60
HS=10240
NAH=80
DDP=local
NNODES=128
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5"
elif [ ${MODEL_SIZE} == "145B" ]; then
TP=8
PP=8
MBS=2
GBS=2304
NLS=80
HS=12288
NAH=96
DDP=local
NNODES=192
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5 "
elif [ ${MODEL_SIZE} == "310B" ]; then
TP=8
PP=16
MBS=1
GBS=2160
NLS=96
HS=16384
NAH=128
DDP=local
NNODES=240
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 3 "
elif [ ${MODEL_SIZE} == "530B" ]; then
TP=8
PP=35
MBS=1
GBS=2520
NLS=105
HS=20480
NAH=128
DDP=local
NNODES=315
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 1 "
elif [ ${MODEL_SIZE} == "1T" ]; then
TP=8
PP=64
MBS=1
GBS=3072
NLS=128
HS=25600
NAH=160
DDP=local
NNODES=384
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
else
echo "Invalid configuration"
exit 1
fi
# Name of the job
export JOB_NAME=results_table_1_model_size_${MODEL_SIZE}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
# BERT MODEL
## Table of contents
- [1. Training Setup](#1-training-setup)
- [2. Configurations](#2-configurations)
## 1. Training setup
<a id="markdown-training-setup" name="training-setup"></a>
To run the model using a Docker container, run it as follows:
```
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
CHECKPOINT_PATH="" #<Specify path>
TENSORBOARD_LOGS_PATH="" #<Specify path>
VOCAB_FILE="" #<Specify path to file>/bert-vocab.txt
DATA_PATH="" #<Specify path and file prefix>_text_document
docker run \
--gpus=all \
--ipc=host \
--workdir /workspace/megatron-lm \
-v /path/to/data:/path/to/data \
-v /path/to/megatron-lm:/workspace/megatron-lm \
megatron-lm nvcr.io/nvidia/pytorch:24.01-py3 \
bash examples/bert/train_bert_340m_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $DATA_PATH
```
NOTE: Depending on the environment you are running it in, the above command might look slightly different.
## 2. Configurations
<a id="markdown-configurations" name="configurations"></a>
The example in this folder shows you how to run the 340M (BERT Large) model. There are other configurations you could run as well:
### 4B
```
--num-layers 48 \
--hidden-size 2560 \
--num-attention-heads 32 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
### 20B
```
--num-layers 48 \
--hidden-size 6144 \
--num-attention-heads 96 \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 4 \
```
#!/bin/bash
# Runs the "340M" parameter model (BERT Large)
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NUM_NODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
CHECKPOINT_PATH=$1 #<Specify path>
TENSORBOARD_LOGS_PATH=$2 #<Specify path>
VOCAB_FILE=$3 #<Specify path to file>/bert-vocab.json
DATA_PATH=$4 #<Specify path and file prefix>_text_document
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
BERT_MODEL_ARGS=(
--num-layers 24
--hidden-size 1024
--num-attention-heads 16
--seq-length 512
--max-position-embeddings 512
--attention-backend auto # Can use (flash/fused/unfused/local)
)
TRAINING_ARGS=(
--micro-batch-size 4
--global-batch-size 32
--train-iters 1000000
--weight-decay 1e-2
--clip-grad 1.0
--fp16
--lr 0.0001
--lr-decay-iters 990000
--lr-decay-style linear
--min-lr 1.0e-5
--lr-warmup-fraction .01
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 8
--pipeline-model-parallel-size 16
)
DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--split 949,50,1
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 100
--save-interval 10000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
torchrun ${DISTRIBUTED_ARGS[@]} pretrain_bert.py \
${BERT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]}
# Megatron Core Export
This module is used to export megatron core models to different inference frameworks.
Currently we support TRTLLM export. In the future we will add support for vLLM and other frameworks.
## PTQ AND EXPORT
Follow the examples of [TensorRT Model Optimizer](../post_training/modelopt) to perform post training quantization, followed by an export to a HF-like checkpoint for TensorRT-LLM, vLLM, and SGLang deployment.
# TRTLLM EXPORT
Follow the instructions in [trtllm_export](./trtllm_export/) to export to the TRTLLM checkpoint format alone.
# Megatron Core To TRTLLM Export Documentation
This guide walks you through using the Megatron Core export module to export models to the TRTLLM format.
### Contents
- [Megatron Core To TRTLLM Export Documentation](#megatron-core-to-trtllm-export-documentation)
- [Contents](#contents)
- [1. Quick Start](#1-quick-start)
- [1.1 Understanding The Code](#11-understanding-the-code)
- [1.2 Running The Code](#12-running-the-code)
- [2. GPU Export](#2-gpu-export)
- [3. Future work](#3-future-work)
#### 1. Quick Start
This section walks you through converting an MCore GPT model to the TRTLLM format using single-device mode. The file can be found at [gpt_single_device_cpu_export.py](./single_device_export/gpt_single_device_cpu_export.py).
NOTE: For faster performance, if your entire model fits into GPU memory, transfer the model state dict to the GPU first and then call the get_trtllm_pretrained_config_and_model_weights function, as sketched below.
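A minimal sketch of this, reusing the `gpt_model`, `trtllm_helper`, `DataType`, and `ExportConfig` names from the steps below and assuming a CUDA device is available:
```python
# Hedged sketch: pre-transfer the state dict to GPU before conversion
# (assumes the entire model fits in GPU memory).
gpu_state_dict = {
    key: val.cuda() if torch.is_tensor(val) else val
    for key, val in gpt_model.state_dict().items()
    if val is not None  # _extra_state entries can be None; filter them out as in STEP 4.
}
weight_list, config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
    model_state_dict=gpu_state_dict,
    dtype=DataType.bfloat16,
    export_config=ExportConfig(inference_tp_size=2),
)
```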
<br>
##### 1.1 Understanding The Code
***STEP 1 - We initialize model parallel and other default arguments***
We initialize TP and PP to 1 so that we can get the full model state dict on the CPU.
```python
initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
```
***STEP 2 - We load the model using the model_provider_function***
NOTE: We create a simple GPT model.
```python
transformer_config = TransformerConfig(
num_layers=2,
hidden_size=64, # Needs to be at least 32 times num_attn_heads
num_attention_heads=2,
use_cpu_initialization=True,
pipeline_dtype=torch.float32,
)
gpt_model = GPTModel(
config=transformer_config,
transformer_layer_spec=get_gpt_layer_local_spec(),
vocab_size=100,
max_sequence_length=_SEQUENCE_LENGTH,
)
# Optionally you can also load a model using this code
# sharded_state_dict=gpt_model.sharded_state_dict(prefix='')
# checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
# gpt_model.load_state_dict(checkpoint)
```
***STEP 3 - Instantiate the TRTLLM Helper***
We instantiate the [TRTLLM Helper](../../../megatron/core/export/trtllm/trtllm_helper.py). For the GPT model, we instantiate trtllm_helper as shown below.
```python
if hasattr(gpt_model, "rotary_pos_emb"):
seq_len_interpolation_factor = gpt_model.rotary_pos_emb.seq_len_interpolation_factor
trtllm_helper = TRTLLMHelper(
transformer_config=gpt_model.config,
model_type=ModelType.gpt,
position_embedding_type = gpt_model.position_embedding_type,
max_position_embeddings = gpt_model.max_position_embeddings,
rotary_percentage = gpt_model.rotary_percent,
rotary_base = gpt_model.rotary_base,
moe_tp_mode = 2,
multi_query_mode = False,
activation = "gelu",
seq_len_interpolation_factor = seq_len_interpolation_factor,
share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights
)
```
***STEP 4 - Get the TRTLLM Weights and configs***
To convert model weights to TRTLLM weights and configs, we use the [single_device_converter](../../../megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py). We pass the model state dict and the export config as inputs. In this example we use an inference TP size of 2 for the export.
```python
model_state_dict = {}
for key, val in gpt_model.state_dict().items():
    # val is None for _extra_state layers. We filter those out.
    if val is not None:
        model_state_dict[key] = val
export_config = ExportConfig(inference_tp_size = 2)
weight_list, config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
model_state_dict= model_state_dict,
dtype = DataType.bfloat16,
export_config=export_config
)
```
***STEP 5 - Build the TRTLLM Engine***
The following code is used to build the TRTLLM engine.
```python
for trtllm_model_weights, trtllm_model_config in zip(weight_list, config_list):
trtllm_helper.build_and_save_engine(
max_input_len=256,
max_output_len=256,
max_batch_size=8,
engine_dir='/opt/megatron-lm/engine',
trtllm_model_weights=trtllm_model_weights,
trtllm_model_config=trtllm_model_config,
lora_ckpt_list=None,
use_lora_plugin=None,
max_lora_rank=64,
lora_target_modules=None,
max_prompt_embedding_table_size=0,
paged_kv_cache=True,
remove_input_padding=True,
paged_context_fmha=False,
use_refit=False,
max_num_tokens=None,
max_seq_len=512,
opt_num_tokens=None,
max_beam_width=1,
tokens_per_block=128,
multiple_profiles=False,
gpt_attention_plugin="auto",
gemm_plugin="auto",
)
```
<br>
##### 1.2 Running The Code
An example run script is shown below.
```
# In a workstation
MLM_PATH=/path/to/megatron-lm
CONTAINER_IMAGE=gitlab-master.nvidia.com:5005/dl/joc/nemo-ci/trtllm_0.12/train:pipe.17669124-x86
docker run -it --gpus=all --ipc=host -v $MLM_PATH/:/opt/megatron-lm $CONTAINER_IMAGE bash
# Inside the container run the following.
cd /opt/megatron-lm/
CUDA_VISIBLE_DEVICES=0 torchrun --nproc-per-node 1 examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py
```
<br>
#### 2. GPU Export
You can use [gpt_distributed_gpu_export.py](./distributed_export/gpt_distributed_gpu_export.py) to run a more optimized, on-device distributed version of TRTLLM export. Internally this uses the [distributed_converter](../../../megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py) to convert model weights on device.
In the single-device version you collect all the model weights on CPU/GPU, convert them to the TRTLLM format, and then store the engine on disk. In the GPU version you load each individual state dict on the GPUs, convert it on the device itself, and store the engine on disk.
To run the GPU version:
```
CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc-per-node 2 examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py
```
<br>
#### 3. Future work
The following are planned for future releases.
* Pipeline parallelism for export (Work in progress)
* GPU Export for more models (Work in progress for some models)
* Refit functionality
* VLLM Support
import os
import torch
from megatron.core import parallel_state
from megatron.core import dist_checkpointing
from megatron.core.export.model_type import ModelType
from megatron.core.export.data_type import DataType
from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.models.gpt.gpt_model import GPTModel
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
_SEQUENCE_LENGTH = 64
_VOCAB_SIZE = 256
def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1):
parallel_state.destroy_model_parallel()
# Torch setup for distributed training
rank = int(os.environ['LOCAL_RANK'])
world_size = torch.cuda.device_count()
torch.cuda.set_device(rank)
torch.distributed.init_process_group(world_size=world_size, rank=rank)
# Megatron core distributed training initialization
parallel_state.initialize_model_parallel(tensor_model_parallel_size = tensor_model_parallel_size, pipeline_model_parallel_size=pipeline_model_parallel_size)
def model_provider():
"""Build the model."""
transformer_config = TransformerConfig(
num_layers=2,
hidden_size=64,
num_attention_heads=2,
use_cpu_initialization=True,
pipeline_dtype=torch.float32
)
gpt_model = GPTModel(
config=transformer_config,
transformer_layer_spec=get_gpt_layer_local_spec(),
vocab_size=_VOCAB_SIZE,
max_sequence_length=_SEQUENCE_LENGTH,
)
return gpt_model
def load_distributed_checkpoint(checkpoint_path, gpt_model):
sharded_state_dict=gpt_model.sharded_state_dict(prefix='')
checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
gpt_model.load_state_dict(checkpoint)
return gpt_model
if __name__ == "__main__":
initialize_distributed(tensor_model_parallel_size=2, pipeline_model_parallel_size=1)
model_parallel_cuda_manual_seed(123)
gpt_model = model_provider()
device = torch.device("cuda")
gpt_model.to(device)
# Optionally you can also load a gpt model from ckpt_path using this code below
# gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path)
seq_len_interpolation_factor = None
if hasattr(gpt_model, "rotary_pos_emb"):
seq_len_interpolation_factor = gpt_model.rotary_pos_emb.seq_len_interpolation_factor
trtllm_helper = TRTLLMHelper(
transformer_config=gpt_model.config,
model_type=ModelType.gpt,
position_embedding_type = gpt_model.position_embedding_type,
max_position_embeddings = gpt_model.max_position_embeddings,
rotary_percentage = gpt_model.rotary_percent,
rotary_base = gpt_model.rotary_base,
moe_tp_mode = 2,
multi_query_mode = False,
activation = "gelu",
seq_len_interpolation_factor = seq_len_interpolation_factor,
share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights
)
trtllm_model_weights, trtllm_model_config = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
model_state_dict= gpt_model.state_dict(),
dtype = DataType.bfloat16,
on_device_distributed_conversion=True,
vocab_size=_VOCAB_SIZE,
gpus_per_node=2,
)
trtllm_helper.build_and_save_engine(
max_input_len=256,
max_output_len=256,
max_batch_size=8,
engine_dir='/opt/megatron-lm/engine',
trtllm_model_weights=trtllm_model_weights[0],
trtllm_model_config=trtllm_model_config[0],
lora_ckpt_list=None,
use_lora_plugin=None,
max_lora_rank=64,
lora_target_modules=None,
max_prompt_embedding_table_size=0,
paged_kv_cache=True,
remove_input_padding=True,
paged_context_fmha=False,
use_refit=False,
max_num_tokens=None,
max_seq_len=512,
opt_num_tokens=None,
max_beam_width=1,
tokens_per_block=128,
multiple_profiles=False,
gpt_attention_plugin="auto",
gemm_plugin="auto",
)
import os
import torch
from megatron.core import parallel_state
from megatron.core import dist_checkpointing
from megatron.core.export.model_type import ModelType
from megatron.core.export.data_type import DataType
from megatron.core.export.export_config import ExportConfig
from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.models.gpt.gpt_model import GPTModel
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
_SEQUENCE_LENGTH = 64
def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1):
parallel_state.destroy_model_parallel()
# Torch setup for distributed training
rank = int(os.environ['LOCAL_RANK'])
world_size = torch.cuda.device_count()
torch.cuda.set_device(rank)
torch.distributed.init_process_group(world_size=world_size, rank=rank)
# Megatron core distributed training initialization
parallel_state.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size)
def model_provider():
"""Build the model."""
transformer_config = TransformerConfig(
num_layers=2,
hidden_size=64, # Needs to be at least 32 times num_attn_heads
num_attention_heads=2,
use_cpu_initialization=True,
pipeline_dtype=torch.float32,
)
gpt_model = GPTModel(
config=transformer_config,
transformer_layer_spec=get_gpt_layer_local_spec(),
vocab_size=100,
max_sequence_length=_SEQUENCE_LENGTH,
)
return gpt_model
def load_distributed_checkpoint(checkpoint_path, gpt_model):
sharded_state_dict=gpt_model.sharded_state_dict(prefix='')
checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
gpt_model.load_state_dict(checkpoint)
return gpt_model
if __name__ == "__main__":
# Need to use TP1 PP1 for export on single device
initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
model_parallel_cuda_manual_seed(123)
gpt_model = model_provider()
# Optionally you can also load a gpt model from ckpt_path using this code below
# gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path)
seq_len_interpolation_factor = None
if hasattr(gpt_model, "rotary_pos_emb"):
seq_len_interpolation_factor = gpt_model.rotary_pos_emb.seq_len_interpolation_factor
trtllm_helper = TRTLLMHelper(
transformer_config=gpt_model.config,
model_type=ModelType.gpt,
position_embedding_type = gpt_model.position_embedding_type,
max_position_embeddings = gpt_model.max_position_embeddings,
rotary_percentage = gpt_model.rotary_percent,
rotary_base = gpt_model.rotary_base,
moe_tp_mode = 2,
multi_query_mode = False,
activation = "gelu",
seq_len_interpolation_factor = seq_len_interpolation_factor,
share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights
)
export_config = ExportConfig(inference_tp_size = 2)
# NOTE: For faster performance, if your entire model fits in GPU memory, transfer the model state dict to the GPU and then call this API
weight_list, config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
model_state_dict= gpt_model.state_dict(),
dtype = DataType.bfloat16,
export_config=export_config
)
for trtllm_model_weights, trtllm_model_config in zip(weight_list, config_list):
trtllm_helper.build_and_save_engine(
max_input_len=256,
max_output_len=256,
max_batch_size=8,
engine_dir='/opt/megatron-lm/engine',
trtllm_model_weights=trtllm_model_weights,
trtllm_model_config=trtllm_model_config,
lora_ckpt_list=None,
use_lora_plugin=None,
max_lora_rank=64,
lora_target_modules=None,
max_prompt_embedding_table_size=0,
paged_kv_cache=True,
remove_input_padding=True,
paged_context_fmha=False,
use_refit=False,
max_num_tokens=None,
max_seq_len=512,
opt_num_tokens=None,
max_beam_width=1,
tokens_per_block=128,
multiple_profiles=False,
gpt_attention_plugin="auto",
gemm_plugin="auto",
)
# GPT3 MODEL
## Table of contents
- [1. Training Setup](#1-training-setup)
- [2. Configurations](#2-configurations)
- [3. Training Results](#3-training-results)
## 1. Training setup
<a id="markdown-training-setup" name="training-setup"></a>
To run the model using a Docker container, run it as follows:
```
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
CHECKPOINT_PATH="" #<Specify path>
TENSORBOARD_LOGS_PATH="" #<Specify path>
VOCAB_FILE="" #<Specify path to file>/gpt2-vocab.json
MERGE_FILE="" #<Specify path to file>/gpt2-merges.txt
DATA_PATH="" #<Specify path and file prefix>_text_document
docker run \
--gpus=all \
--ipc=host \
--workdir /workspace/megatron-lm \
-v /path/to/data:/path/to/data \
-v /path/to/megatron-lm:/workspace/megatron-lm \
megatron-lm nvcr.io/nvidia/pytorch:24.01-py3 \
bash examples/gpt3/train_gpt3_175b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $MERGE_FILE $DATA_PATH
```
NOTE: Depending on the environment you are running it in, the above command might look slightly different.
## 2. Configurations
<a id="markdown-configurations" name="configurations"></a>
The example in this folder shows you how to run the 175B model. There are other configurations you could run as well:
### 345M
```
--num-layers 12 \
--hidden-size 512 \
--num-attention-heads 8 \
--seq-length 1024 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
### 857M
```
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
# WARNING: YAML configs are currently an experimental feature
language_model:
# model architecture
num_layers: 24
hidden_size: 1024
num_attention_heads: 16
num_query_groups: null
ffn_hidden_size: null
kv_channels: null
hidden_dropout: 0.0
attention_dropout: 0.0
fp32_residual_connection: False
apply_residual_connection_post_layernorm: False
layernorm_epsilon: 1.e-5
layernorm_zero_centered_gamma: True
add_bias_linear: False
bias_activation_fusion: False
add_qkv_bias: False
gated_linear_unit: False
activation_func: swiglu
num_moe_experts: null
rotary_interleaved: False
window_size: null
# initialization
init_method: null
init_method_std: 0.02
output_layer_init_method: null
# mixed-precision
apply_query_key_layer_scaling: False
attention_softmax_in_fp32: False
# fusion
bias_swiglu_fusion: True
masked_softmax_fusion: True
persist_layer_norm: False
memory_efficient_layer_norm: False
bias_dropout_fusion: True
apply_rope_fusion: True
# activation recomputation
recompute_granularity: null
recompute_method: null
recompute_num_layers: null
distribute_saved_activations: null
# fp8 related
fp8: null
fp8_margin: 0
fp8_interval: 1
fp8_amax_history_len: 1
fp8_amax_compute_algo: "most_recent"
fp8_wgrad: True
# miscellaneous
clone_scatter_output_in_embedding: True
normalization: "LayerNorm" # alt value supported by TE: "RMSNorm"
# MoE related
moe_router_load_balancing_type: "aux_loss"
moe_router_topk: 2
moe_router_group_topk: null
moe_router_num_groups: null
moe_grouped_gemm: False
moe_aux_loss_coeff: 0 # 1e-2 would be a good start value for load balance loss.
moe_z_loss_coeff: null # 1e-3 would be a good start value for z-loss
moe_input_jitter_eps: null
moe_token_dropping: False
model_parallel:
# Model parallelism
tensor_model_parallel_size: 1
context_parallel_size: 1
pipeline_model_parallel_size: 1
virtual_pipeline_model_parallel_size: null
sequence_parallel: True
expert_model_parallel_size: 1
# Initialization
perform_initialization: True
use_cpu_initialization: null
# Training
fp16: False
bf16: True
params_dtype: null # Set from above arguments for core
timers: null
# Optimizations
gradient_accumulation_fusion: True
async_tensor_model_parallel_allreduce: True
tp_comm_overlap: False
# Debug Options
tp_comm_split_ag: True
tp_comm_atomic_ag: True
tp_comm_split_rs: True
tp_comm_atomic_rs: True
tp_comm_bulk_wgrad: True
tp_comm_bulk_dgrad: True
# Parallelism
finalize_model_grads_func: null
# Pipeline Parallel
pipeline_dtype: null
grad_scale_func: null
enable_autocast: False
autocast_dtype: null
variable_seq_lengths: False
num_microbatches_with_partial_activation_checkpoints: null
overlap_p2p_comm: False
batch_p2p_comm: True
batch_p2p_sync: True
use_ring_exchange_p2p: False
deallocate_pipeline_outputs: False
no_sync_func: null
grad_sync_func: null
param_sync_func: null
pipeline_model_parallel_split_rank: null
# CPU Offloading
cpu_offloading: False
cpu_offloading_num_layers: 0
_cpu_offloading_context: null
cpu_offloading_weights: False
cpu_offloading_activations: True
# Timing
barrier_with_L1_time: True
# training:
use_legacy_models: False
spec: null
micro_batch_size: 2
global_batch_size: 128
rampup_batch_size: [32, 32, 65324160]
check_for_nan_in_loss_and_grad: True
num_layers_per_virtual_pipeline_stage: null
encoder_num_layers: null
decoder_num_layers: null
rotary_seq_len_interpolation_factor: null
add_position_embedding: False
make_vocab_size_divisible_by: 128
group_query_attention: False
exit_signal_handler: False
exit_duration_in_mins: null
exit_interval: null
untie_embeddings_and_output_weights: True
position_embedding_type: rope
rotary_percent: 0.5
openai_gelu: False
squared_relu: False
swiglu: True
onnx_safe: null
bert_binary_head: True
max_position_embeddings: 4096
transformer_impl: local
use_flash_attn: False
seed: 1234
data_parallel_random_init: False
# Optimizer
optimizer: adam
lr: 2.5e-4
lr_decay_style: cosine
lr_decay_iters: null
lr_decay_samples: 255126953
lr_warmup_fraction: null
lr_warmup_iters: 0
lr_warmup_samples: 81381
lr_warmup_init: 0.0
min_lr: 2.5e-5
weight_decay: 0.1
start_weight_decay: null
end_weight_decay: null
weight_decay_incr_style: constant
clip_grad: 1.0
adam_beta1: 0.9
adam_beta2: 0.95
adam_eps: 1.e-08
sgd_momentum: 0.9
override_opt_param_scheduler: False
use_checkpoint_opt_param_scheduler: False
# checkpointing arguments
save: null
save_interval: 20000
no_save_optim: null
no_save_rng: null
load: null
no_load_optim: null
no_load_rng: null
finetune: False
use_checkpoint_args: False
exit_on_missing_checkpoint: False
# loss arguments
loss_scale: null
initial_loss_scale: 4294967296
min_loss_scale: 1.0
loss_scale_window: 1000
hysteresis: 2
accumulate_allreduce_grads_in_fp32: False
fp16_lm_cross_entropy: False
# distributed arguments
distributed_backend: nccl
distributed_timeout_minutes: 10
overlap_grad_reduce: False
align_grad_reduce: True
overlap_param_gather: False
align_param_gather: False
scatter_gather_tensors_in_pipeline: True
local_rank: null
lazy_mpu_init: null
empty_unused_memory_level: 0
standalone_embedding_stage: False
use_distributed_optimizer: False
nccl_communicator_config_path: null
train_iters: null
eval_iters: 32
eval_interval: 2000
skip_train: False
adlr_autoresume: False
adlr_autoresume_interval: 1000
# garbage collection
manual_gc: False
manual_gc_interval: 0
manual_gc_eval: True
tp_comm_overlap_cfg: null
#data
data_path: null
split: '99,1,0'
train_data_path: null
valid_data_path: null
test_data_path: null
data_cache_path: null
mock_data: False
vocab_size: null
vocab_file: null
merge_file: null
vocab_extra_ids: 0
seq_length: 4096
encoder_seq_length: null
decoder_seq_length: null
retriever_seq_length: 256
sample_rate: 1.0
mask_prob: 0.15
short_seq_prob: 0.1
num_workers: 2
tokenizer_type: GPTSentencePieceTokenizer
tokenizer_model: null
reset_position_ids: False
reset_attention_mask: False
eod_mask_loss: False
train_samples: 268554688
dataloader_type: null
#profile:
profile: False
profile_ranks: [0]
profile_step_end: 12
profile_step_start: 10
#logging:
log_params_norm: True
log_num_zeros_in_grad: True
log_throughput: False
log_progress: False
timing_log_level: 0
timing_log_option: minmax
tensorboard_log_interval: 1
tensorboard_queue_size: 1000
log_timers_to_tensorboard: False
log_validation_ppl_to_tensorboard: False
log_memory_to_tensorboard: False
log_world_size_to_tensorboard: False
log_loss_scale_to_tensorboard: True
wandb_project: ''
wandb_exp_name: ''
wandb_save_dir: ''
enable_one_logger: True
one_logger_project: megatron-lm
one_logger_run_name: null
log_interval: 100
tensorboard_dir: null
#!/bin/bash
# Runs the "175B" parameter model
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NUM_NODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
CHECKPOINT_PATH=$1 #<Specify path>
TENSORBOARD_LOGS_PATH=$2 #<Specify path>
VOCAB_FILE=$3 #<Specify path to file>/gpt2-vocab.json
MERGE_FILE=$4 #<Specify path to file>/gpt2-merges.txt
DATA_PATH=$5 #<Specify path and file prefix>_text_document
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
GPT_MODEL_ARGS=(
--num-layers 96
--hidden-size 12288
--num-attention-heads 96
--seq-length 2048
--max-position-embeddings 2048
--attention-backend auto # Can use (flash/fused/unfused/local)
)
TRAINING_ARGS=(
--micro-batch-size 1
--global-batch-size 1536
--rampup-batch-size 16 16 5859375
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--fp16
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 8
--pipeline-model-parallel-size 16
)
DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 100
--save-interval 10000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]}
### Megatron Core Inference Documentation
This guide provides an example of running model inference with Megatron Core.
### Contents
- [Megatron Core Inference Documentation](#megatron-core-inference-documentation)
- [Contents](#contents)
- [1. Quick Start](#1-quick-start)
- [1.1 Understanding The Code](#11-understanding-the-code)
- [1.2 Running The Code](#12-running-the-code)
- [2. Flow of Control In MCore Backend](#2-flow-of-control-in-mcore-backend)
- [3. Customizing The Inference Pipeline](#3-customizing-the-inference-pipeline)
- [3.1. Create Your Own Inference Backend](#31-create-your-own-inference-backend)
- [3.2. Create Your Own Text Generation Controller](#32-create-your-own-text-generation-controller)
- [3.3. Support Other Models](#33-support-other-models)
  - [3.4. Modify Inference Parameters](#34-modify-inference-parameters)
- [4. Future work](#4-future-work)
<br>
#### 1. Quick Start
This example runs statically-batched inference on a model trained using Megatron Core. The entrypoint is [gpt_static_inference.py](./gpt/gpt_static_inference.py). A similar workflow can be adapted for [gpt_dynamic_inference.py](./gpt/gpt_dynamic_inference.py).
<br>
##### 1.1 Understanding The Code
***STEP 1 - Initialize model parallel and other default arguments***
The micro batch size defaults to 1. It is not used in tensor-parallel-only mode, and for pipeline-parallel models it is calculated at runtime.
```python
# Initialize Megatron model using the same model provider from training.
initialize_megatron(
args_defaults={'no_load_rng': True, 'no_load_optim': True, 'micro_batch_size': 1}
)
```
***STEP 2 - Load the model using the model_provider_function***
The model provider function supports both MCore and Legacy models.
```python
# Load the model checkpoint
model = get_model(model_provider, wrap_with_ddp=False)
load_checkpoint(model, None, None)
model.eval()
model = model[0]
```
***STEP 3 - Choose an engine***
Text generation requires an inference engine, which includes a scheduler. The default engine is the [Megatron Core engine](../../megatron/core/inference/engines/mcore_engine.py) with a [text generation controller](../../megatron/core/inference/text_generation_controllers/text_generation_controller.py). TRTLLMEngine support is planned for the future.
```python
# Create an inference wrapper to setup the model.
inference_wrapped_model = GPTInferenceWrapper(model, args)
# Define a sampling loop.
text_generation_controller = TextGenerationController(
inference_wrapped_model=inference_wrapped_model,
tokenizer=tokenizer
)
# Create a static or dynamic inference engine.
inference_engine = StaticInferenceEngine(
text_generation_controller=text_generation_controller,
max_batch_size=args.max_batch_size
)
```
***STEP 4 - Run text generation***
The [SamplingParams](../../megatron/core/inference/sampling_params.py) class uses suggested defaults. Customize this to change top_p, top_k, number of tokens to generate, etc. The result is returned as a list of [InferenceRequests](../../megatron/core/inference/inference_request.py).
```python
results: List[InferenceRequest] = inference_engine.generate(
prompts=args.prompts, sampling_params=sampling_params
)
if torch.distributed.get_rank() == 0:
for idx, result in enumerate(results):
print(f' ------------- RESULT FOR PROMPT {idx} --------------- ')
result = {
'id': result.request_id,
'input_prompt': result.prompt,
'generated_text': result.generated_text,
'generated_tokens' : result.generated_tokens
}
print(result)
```
<br>
##### 1.2 Running The Code
An example Slurm script is shown below. Set the tokenizer paths, inference params, and other settings appropriately.
For a recap on sampling parameters, refer to [this blog](https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-temperature-parameters-ed6a31313910).
```
# Slurm cluster settings
ACCOUNT=<account>
MLM_PATH=/path/to/megatron-lm
GPT_CKPT=/path/to/gpt/ckpt
VOCAB_MERGE_FILE_PATH=/path/to/vocab/and/merge/file
CONTAINER_IMAGE=nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.11
srun --account $ACCOUNT \
--job-name=$ACCOUNT:inference \
--partition=batch \
--time=01:00:00 \
--container-image $CONTAINER_IMAGE \
--container-mounts $MLM_PATH:/workspace/megatron-lm/,$GPT_CKPT:/workspace/mcore_gpt_ckpt,$VOCAB_MERGE_FILE_PATH:/workspace/tokenizer \
--no-container-mount-home \
--pty /bin/bash \
# Inside the container run the following.
cd megatron-lm/
export CUDA_DEVICE_MAX_CONNECTIONS=1
TOKENIZER_ARGS=(
--vocab-file /workspace/tokenizer/gpt2-vocab.json
--merge-file /workspace/tokenizer/gpt2-merges.txt
--tokenizer-type GPT2BPETokenizer
)
MODEL_ARGS=(
--use-checkpoint-args
--use-mcore-models
--load /workspace/mcore_gpt_ckpt
)
INFERENCE_SPECIFIC_ARGS=(
--attention-dropout 0.0
--hidden-dropout 0.0
--num-tokens-to-generate 20
--max-batch-size 4
)
torchrun --nproc-per-node=4 examples/inference/gpt/gpt_static_inference.py \
${TOKENIZER_ARGS[@]} \
${MODEL_ARGS[@]} \
${INFERENCE_SPECIFIC_ARGS[@]} \
--prompts "prompt one " "sample prompt two" "sample prompt 3"
NOTE: Other parameters which can be customized for inference:
--temperature (Sampling temperature)
--top_k (top_k sampling)
--top_p (top_p sampling)
--num-tokens-to-generate (Number of tokens to generate for each prompt)
--inference-batch-times-seqlen-threshold (During inference, if batch-size times sequence-length is smaller than this threshold then we will not use microbatched pipelining.')
--use-dist-ckpt (If using dist checkpoint format for the model)
--use-legacy-models (If using legacy models instead of MCore models)
```
<br>
#### 2. Flow of Control In MCore Backend
An example of inference with static batching is provided in [gpt_static_inference.py](./gpt/gpt_static_inference.py).
* The [mcore_engine](../../megatron/core/inference/engines/mcore_engine.py) **generate()** function is called with the input prompts.
* The `Scheduler` in the engine adds these prompts to the [active requests pool](../../megatron/core/inference/inference_request.py) until the max batch size is hit. Remaining requests are added to the waiting requests pool.
* The engine will run until all requests (waiting + active) are completed.
* The active requests are passed into **generate_all_output_tokens_static_batch()** of the text generation controller.
* This function uses the **prep_model_for_inference()** method of the [model_inference_wrappers](../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) and runs an autoregressive sampling loop
* In the autoregressive loop, the **get_batch_for_context_window()** method of the inference wrapper is called to slice out the input tokens and masks
* Input tokens and masks are passed into the **run_one_forward_step()** method, which calls the model `.forward()` method to get the output logits
* Output logits are synchronized across all pipeline parallel ranks
* The text generation controller obtains the log probabilities and samples tokens based on the strategy defined in the sampling parameters.
* The sampled tokens are then appended to the input prompt tokens for the next iteration
* The **update_generation_status()** method of the text generation controller checks which prompts have finished generating or hit a stop condition
* After the inference loop, the result is detokenized and stored as an attribute of the InferenceRequest. These requests are marked as completed.
* The **update_requests_pool()** method of the scheduler moves completed requests into the completed request pool and waiting requests into the active request pool
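The sketch below summarizes this loop. It is illustrative only: the names follow the scheduler and controller methods described above, but the helper attributes (such as the request pools) are simplified assumptions rather than the exact MCore implementation.
```python
# Illustrative sketch of the static-batch control flow described above (not the actual MCore code).
def generate(engine, prompts, sampling_params):
    for prompt in prompts:
        # The scheduler fills the active pool up to max batch size; the rest go to the waiting pool.
        engine.scheduler.add_request(prompt)
    while engine.scheduler.has_unfinished_requests():
        active_requests = engine.scheduler.active_request_pool
        # Autoregressive loop: slice the context window, run a forward step,
        # sample new tokens, and check stop conditions for each request.
        engine.text_generation_controller.generate_all_output_tokens_static_batch(active_requests)
        # Move completed requests to the completed pool and promote waiting requests to active.
        engine.scheduler.update_requests_pool()
    return engine.scheduler.completed_request_pool
```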
<br>
#### 3. Customizing The Inference Pipeline
The inference pipeline supports four levels of customization:
* **Inference engine** - The MCore Engine supports static and dynamic batching. Modify this to add a new backend.
* **Text generation controller** - The main sampling loop. Customize this to support alternative tokenization or implement a new sampling strategy.
* **Inference Wrapped Model** - Change this to support a new model.
* **Modify Inference Parameters** - Change this to update top_p, top_k, number of tokens to be generated, temperature, and other sampling parameters.
<br>
##### 3.1. Create Your Own Inference Backend
The [abstract_engine.py](../../megatron/core/inference/engines/abstract_engine.py) file contains a `generate` method that can be extended to support a new backend.
```python
class AbstractEngine(ABC):
@staticmethod
def generate(self) -> dict:
"""The abstract backend's generate function.
To define a new backend, implement this method and return the outputs as a dictionary.
"""
```
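For example, a minimal custom backend might look like the sketch below. This is a hypothetical illustration (the class name, constructor, and output format are assumptions), not an existing engine.
```python
# Hypothetical sketch of a custom backend built on the AbstractEngine interface shown above.
class MyCustomEngine(AbstractEngine):
    def __init__(self, text_generation_controller, max_batch_size=8):
        self.controller = text_generation_controller
        self.max_batch_size = max_batch_size

    def generate(self, prompts) -> dict:
        # A real backend would schedule the prompts, run the sampling loop
        # (e.g. via the text generation controller), and collect the outputs.
        results = {}
        for prompt in prompts:
            results[prompt] = "<generated text goes here>"  # placeholder output
        return results
```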
<br>
##### 3.2. Create Your Own Text Generation Controller
The [TextGenerationController](../../megatron/core/inference/text_generation_controllers/text_generation_controller.py) contains the main sampling loop and can be modified to support new tokenization, detokenization, or sampling strategies.
``` python
class TextGenerationController:
def tokenize_prompt(self, prompt: str) -> Tuple[torch.Tensor, torch.Tensor]:
"""Utility to tokenize the input prompts"""
def sample_from_logits(
self,
last_token_logits: torch.Tensor,
sampling_params: SamplingParams,
vocab_size: int,
generation_started : Optional[torch.Tensor] = None,
top_n_logprobs_dict: Dict[int, List[Dict[str, float]]] = None,
) -> torch.Tensor:
"""Samples the logits to generate outputs
Given the logits of the last token, this function samples according to the parameters defined in sampling_params and returns the sampled tokens. If sampling_params.top_n_logprobs > 0, it also updates the top_n_logprobs_dict at each step.
"""
def update_generation_status(
self,
updated_prompts_tokens: torch.Tensor,
generation_started: torch.Tensor,
current_context_end_position: int,
is_generation_done_tensor: torch.Tensor,
generated_sequence_lengths: torch.Tensor,
) -> torch.Tensor:
"""Function to check which prompts have reached an end condition
We check which prompts have reached an end condition and set the corresponding flags of the is_generation_done_tensor to True. The generated sequence lengths increase as we keep generating, until a prompt hits an end-of-document condition. The generation-started status tensor helps us determine which prompts have started generating
"""
def generate_all_output_tokens_static_batch(
self, active_requests: OrderedDict[int, InferenceRequest],
) -> OrderedDict[int, InferenceRequest]:
"""Utility to generate all the output tokens and probabilities for the prompts .
This utility generates the output tokens for a static batch. It runs the forward steps till all prompts complete generation, updates the status of these requests to completed, adds the generated result and returns these requests
"""
def detokenize_generations(self, prompt_tokens_with_generated_tokens: torch.Tensor) -> str:
"""Detokenize the output generations"""
```
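As a concrete illustration, the hedged sketch below overrides `sample_from_logits` to force greedy decoding. The class name is hypothetical, and collapsing the optional arguments into `**kwargs` is a simplification of the signature shown above.
```python
import torch

# Hypothetical sketch: a controller that replaces the sampling strategy with greedy decoding.
# Assumes TextGenerationController has been imported as shown earlier in this guide.
class GreedyTextGenerationController(TextGenerationController):
    def sample_from_logits(self, last_token_logits, sampling_params, vocab_size, **kwargs):
        # Ignore top_k / top_p / temperature and always pick the most likely token.
        return torch.argmax(last_token_logits[:, :vocab_size], dim=-1)
```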
<br>
##### 3.3. Support Other Models
Extend [abstract_model_inference_wrapper.py](./../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) to support other models. The abstract model wrapper implements:
* A forward method that calls the model `forward` method according to the model parallel settings
* Initialization of the model, putting it in `.eval()` mode
* Setup of the input parameters (max batch size, max sequence length)
The following methods should be implemented:
```python
class AbstractModelInferenceWrapper:
def prep_model_for_inference(self, prompts_tokens: torch.Tensor):
"""A utility function for preparing model for inference
The function gets called once before the autoregressive inference loop. It puts the model in eval mode and gets some model and inference data parameters. Extend this to build position ids, attention masks, etc., so that the required slices can be extracted during the forward pass
"""
@abc.abstractclassmethod
def get_batch_for_context_window(self) -> List:
"""Returns the input data for inference
This function gets called iteratively in the inference loop. It can be used to extract the relevant inputs from the prompt tokens, attention mask, etc. required for each step of inference.
"""
```
Refer to [gpt_inference_wrapper.py](../../megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py) for an example of implementing this for GPTModel.
<br>
##### 3.4. Modify Inference Parameters
We use [sampling params](../../megatron/core/inference/sampling_params.py) for text generation. Customize this to change `top_p`, `top_k`, the number of tokens to generate, etc. Other attributes can be added for the inference loop as shown below.
```
from megatron.core.inference.sampling_params import SamplingParams
c = SamplingParams(temperature=0.5)
c.add_attributes({'min_length':4, 'eod_id':153})
```
<br>
#### 4. Future work
The following features are planned for future releases.
* TRTLLM Engine support
* Continuous batching optimizations
* Speculative decoding
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
import torch
from argparse import ArgumentParser
from collections import defaultdict
from tqdm import tqdm
from typing import List
from megatron.core.inference.contexts.dynamic_context import (
ContextOverflowError,
DynamicInferenceContext,
)
from megatron.core.inference.engines import DynamicInferenceEngine
from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import GPTInferenceWrapper
from megatron.core.inference.sampling_params import SamplingParams
from megatron.core.inference.text_generation_controllers.text_generation_controller import TextGenerationController
from megatron.core.transformer.module import MegatronModule
from megatron.training import (
get_args,
get_model as _get_model,
get_tokenizer,
initialize_megatron,
)
from megatron.training.checkpointing import load_checkpoint
from pretrain_gpt import model_provider
from .utils import add_common_inference_args, build_requests, get_curr_time, Request
def add_dynamic_inference_args(parser: ArgumentParser) -> ArgumentParser:
"""Dynamic inference arguments."""
add_common_inference_args(parser)
group = parser.add_argument_group(title='Dynamic inference')
group.add_argument("--inference-ckpt-non-strict", action="store_true",
help="Load checkpoint with `strict=False`.")
return parser
def get_model() -> MegatronModule:
"""Initialize model and load checkpoint."""
args = get_args()
# Build model.
model = _get_model(model_provider, wrap_with_ddp=False)
# Load checkpoint.
assert args.load is not None
args.exit_on_missing_checkpoint = True
load_checkpoint(
ddp_model=model,
optimizer=None,
opt_param_scheduler=None,
strict=not args.inference_ckpt_non_strict,
)
# No virtual PP.
assert len(model) == 1, "Above condition should have caught this"
model = model[0]
# Eval mode.
model.eval()
return model
def get_inference_context(
requests: List[Request],
sampling_params: SamplingParams,
):
"""The inference context manages the KV cache and other inference state."""
args = get_args()
# Max sequence length.
max_gen_length = sampling_params.num_tokens_to_generate
max_context_length = max(len(r.prompt_tokens) for r in requests)
max_sequence_length = max_context_length + max_gen_length
# Inference context.
context = DynamicInferenceContext(
params_dtype=args.params_dtype,
num_layers=args.num_layers,
kv_channels=args.kv_channels,
num_attention_heads=args.num_query_groups if args.group_query_attention else args.num_attention_heads,
max_sequence_length=max_sequence_length,
buffer_size_gb=args.inference_dynamic_batching_buffer_size_gb,
buffer_guaranteed_fraction=args.inference_dynamic_batching_buffer_guaranteed_fraction,
chunk_size_tokens=args.inference_dynamic_batching_chunk_size,
buffer_overflow_factor=args.inference_dynamic_batching_buffer_overflow_factor,
max_requests_override=args.inference_dynamic_batching_max_requests_override,
max_tokens_override=args.inference_dynamic_batching_max_tokens_override,
tensor_model_parallel_size=args.tensor_model_parallel_size,
)
return context
def get_inference_controller(
model: MegatronModule,
context: DynamicInferenceContext,
) -> TextGenerationController:
"""Buid text generation controller, which manages the model inference context.
Args:
model (MegatronModule): Megatron GPT model.
context (DynamicInferenceContext): Context for managing KV cache.
Return:
(TextGenerationController) Inference text generation controller.
"""
args = get_args()
tokenizer = get_tokenizer()
# Wrap model in inference wrapper.
model = GPTInferenceWrapper(model, args, context)
# Note: the following is taken from AbstractModelInferenceWrapper.prep_model_for_inference().
from megatron.core import parallel_state
model.model_is_pipeline_parallel = not (
parallel_state.is_pipeline_first_stage() and
parallel_state.is_pipeline_last_stage()
)
# Text generation controller.
controller = TextGenerationController(model, tokenizer)
return controller
def run_inference(
requests: List[Request],
sampling_params: SamplingParams,
engine: DynamicInferenceEngine,
) -> None:
"""Add requests to engine and generate tokens.
Args:
requests (List[Request]): Requests that are to be added and processed.
sampling_params (SamplingParams): Sampling params for the logits.
engine (DynamicInferenceEngine): Inference engine that manages generating tokens.
Return:
None.
"""
# Initialize request arrival times.
base_arrival_time = get_curr_time()
for request in requests:
request.time_arrival = request.time_offset + base_arrival_time
# Add and process requests.
num_requests_total = len(requests)
num_requests_added = 0
num_requests_finished = 0
step_id = 0
step_times = {"prefill": [], "decode": []}
add_times = []
output_times = []
tbar = tqdm(total=num_requests_total)
while True:
curr_time = get_curr_time()
# Add requests with 'earlier' arrival time.
add_start = get_curr_time()
while num_requests_added < num_requests_total:
request = requests[num_requests_added]
if request.time_arrival > curr_time:
break
try:
# Using `prompt_text` instead of `prompt_tokens` for fair comparison.
engine.add_request(num_requests_added, request.prompt_text)
request.time_start = get_curr_time()
request.state = "started"
num_requests_added += 1
tbar.update(1)
except ContextOverflowError:
break
add_times.append(get_curr_time() - add_start)
# Step inference engine (i.e., generate a token for each active request).
is_decode_only = engine.context.is_decode_only()
finished_requests, step_time = engine.step(sampling_params, verbose=True)
step_id += 1
if len(finished_requests) > 0:
output_start = get_curr_time()
if is_decode_only:
step_times["decode"].append(step_time)
else:
step_times["prefill"].append(step_time)
# Append output tokens.
for finished_request in finished_requests:
request = requests[finished_request.request_id]
request.output_tokens = finished_request.generated_tokens
request.time_end = get_curr_time()
request.output_text = finished_request.generated_text
request.state = "finished"
num_requests_finished += 1
output_times.append(get_curr_time() - output_start)
# Check if all requests are finished.
if not (engine.has_unfinished_requests() or
num_requests_added < num_requests_total):
break
return step_times, add_times, output_times
if __name__ == "__main__":
# Initialize Megatron.
initialize_megatron(
extra_args_provider=add_dynamic_inference_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True},
)
args = get_args()
tokenizer = get_tokenizer()
# Sampling params.
sampling_params = SamplingParams(
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p,
return_log_probs=args.return_log_probs,
num_tokens_to_generate=args.num_tokens_to_generate,
)
# Requests, context, controller.
model = get_model()
requests = build_requests(args, tokenizer)
context = get_inference_context(requests, sampling_params)
controller = get_inference_controller(model, context)
# Inference engine.
engine = DynamicInferenceEngine(controller,
context,
termination_id=tokenizer.eod,
enable_cuda_graph=args.enable_cuda_graph,
random_seed=args.seed)
# Print setup.
setup_prefix = "dynamic | cg %d | %s | bf %.0f, flw %.1f [r %d, t %d], gtd %.2f [r %d] ... reqs %d" % (
args.enable_cuda_graph,
(
f"<user prompts, n {len(args.prompts)}>"
if args.prompts else
"<auto prompts> %s, %d, %.1e, %.1e" % (
"(%s)" % " ".join(map(str, args.num_tokens_to_prompt)),
args.num_tokens_to_generate,
args.incoming_requests_duration,
args.incoming_requests_per_sec,
)
),
args.inference_dynamic_batching_buffer_size_gb,
args.inference_dynamic_batching_buffer_overflow_factor,
context.max_requests,
context.max_tokens,
args.inference_dynamic_batching_buffer_guaranteed_fraction,
context.gtd_request_count,
len(requests),
)
print("~~~")
print(setup_prefix)
print("~~~")
# Run and time test.
t = get_curr_time()
step_times, add_times, output_times = run_inference(requests, sampling_params, engine)
total_time = get_curr_time() - t
# Validate all requests finished.
for request in requests:
assert request.state == "finished"
# Print unique prompts + outputs.
if torch.distributed.get_rank() == 0:
print("~~~~ Unique prompts + outputs. ~~~~")
# Map requests by their prompt.
unique_prompt_map = defaultdict(list)
for request_idx, request in enumerate(requests):
unique_prompt_map[request.prompt_text].append(request_idx)
# Print unique prompts + outputs.
for unique_idx, (prompt_text, request_idxs) in enumerate(unique_prompt_map.items()):
request_idx = request_idxs[0]
request = requests[request_idx]
print(f"{unique_idx}/{len(unique_prompt_map)} [{len(request_idxs)}]. {prompt_text} ... %s" % request.output_text.replace("\n", "\\n"))
# Timing results.
stats = torch.cuda.memory_stats()
print("~~~")
print("%s ... mem %.1f/%.1f ... total time: %.3f ... step time: total %.3f [ p %.3f, d %.3f ], mean [ p %.3f, d %.3f ], count [ p %d, d %d ] ... add time: %.3f, output time: %.3f." % (
setup_prefix,
stats["allocated_bytes.all.peak"] / (1024**3),
stats["reserved_bytes.all.peak"] / (1024**3),
sum(step_times["prefill"]) + sum(step_times["decode"]) + sum(add_times),
sum(step_times["prefill"]) + sum(step_times["decode"]),
sum(step_times["prefill"]),
sum(step_times["decode"]),
sum(step_times["prefill"]) / len(step_times["prefill"]),
sum(step_times["decode"]) / len(step_times["decode"]),
len(step_times["prefill"]),
len(step_times["decode"]),
sum(add_times),
sum(output_times),
))
print("~~~")
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# Run dynamic batching inference on the 12B GPT model.
set -u
pip install simpy
pip install sentencepiece
pip install tiktoken
export CUDA_DEVICE_MAX_CONNECTIONS=1
: ${CHECKPOINT_DIR:?"CHECKPOINT_DIR is not set"}
: ${TOKENIZER_MODEL:?"TOKENIZER_MODEL is not set"}
: ${NUM_TOKENS_TO_PROMPT="8 32"}
: ${NUM_TOKENS_TO_GENERATE=256}
: ${INCOMING_REQUESTS_DURATION=10.}
: ${INCOMING_REQUESTS_PER_SEC=100.}
: ${INFERENCE_DYNAMIC_BATCHING_BUFFER_SIZE_GB=50.}
: ${INFERENCE_DYNAMIC_BATCHING_BUFFER_OVERFLOW_FACTOR=1.}
: ${INFERENCE_DYNAMIC_BATCHING_BUFFER_GUARANTEED_FRACTION=0.05}
: ${ENGINE=dynamic}
: ${EXTRA_ARGS=""}
# NSIGHT_PREFIX=/path/to/nsight/profile
# --inference-rng-tracker \ # ... re-add after bugfix.
ARGS=" \
--no-persist-layer-norm \
--apply-layernorm-1p \
--no-position-embedding \
--group-query-attention \
--num-query-groups 8 \
--load ${CHECKPOINT_DIR} \
--use-checkpoint-args \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--use-rotary-position-embeddings \
--position-embedding-type rope \
--rotary-base 1000000 \
--rotary-percent 1.0 \
--swiglu \
--normalization RMSNorm \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--exit-duration-in-mins 5740 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 40 \
--hidden-size 5120 \
--ffn-hidden-size 14336 \
--num-attention-heads 32 \
--kv-channels 128 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--micro-batch-size 64 \
--bf16 \
--tokenizer-type TikTokenizer \
--tiktoken-pattern v2 \
--tokenizer-model ${TOKENIZER_MODEL} \
--distributed-timeout-minutes 2400 \
--transformer-impl local \
--use-flash-attn \
\
--inference-dynamic-batching \
--inference-dynamic-batching-buffer-size-gb ${INFERENCE_DYNAMIC_BATCHING_BUFFER_SIZE_GB} \
--inference-dynamic-batching-buffer-overflow-factor ${INFERENCE_DYNAMIC_BATCHING_BUFFER_OVERFLOW_FACTOR} \
--inference-dynamic-batching-buffer-guaranteed-fraction ${INFERENCE_DYNAMIC_BATCHING_BUFFER_GUARANTEED_FRACTION} \
\
--enable-cuda-graph \
${EXTRA_ARGS} \
"
if [[ -v PROMPTS ]]; then
ARGS+=" --prompts ${PROMPTS}"
else
ARGS+=" \
--num-tokens-to-prompt ${NUM_TOKENS_TO_PROMPT} \
--num-tokens-to-generate ${NUM_TOKENS_TO_GENERATE} \
--incoming-requests-duration ${INCOMING_REQUESTS_DURATION} \
--incoming-requests-per-sec ${INCOMING_REQUESTS_PER_SEC} \
"
fi
CMD="python -m examples.inference.gpt.gpt_${ENGINE}_inference ${ARGS}"
if [[ -v NSIGHT_PREFIX ]]; then
CMD="nsys profile -t cuda,nvtx,mpi -s none --wait=primary --show-output=true --force-overwrite=true --export=sqlite -o ${NSIGHT_PREFIX} ${CMD}"
fi
eval ${CMD}
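#
# Example invocation (a sketch; the paths are hypothetical and the script
# filename is assumed, so adjust both to your environment):
#
#   CHECKPOINT_DIR=/path/to/gpt_12b_checkpoint \
#   TOKENIZER_MODEL=/path/to/tokenizer.model \
#   NUM_TOKENS_TO_GENERATE=128 \
#   bash ./gpt_dynamic_inference_12b.sh
#
# Any of the `: ${VAR=default}` variables defined above (e.g.
# INCOMING_REQUESTS_PER_SEC or ENGINE) can be overridden the same way.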
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# Run dynamic batching inference on the 357M GPT model.
set -u
pip install simpy
pip install sentencepiece
pip install tiktoken
export CUDA_DEVICE_MAX_CONNECTIONS=1
: ${CHECKPOINT_DIR:?"CHECKPOINT_DIR is not set"}
: ${VOCAB_FILE:?"VOCAB_FILE is not set"}
: ${MERGE_FILE:?"MERGE_FILE is not set"}
: ${NUM_TOKENS_TO_PROMPT="8 32"}
: ${NUM_TOKENS_TO_GENERATE=256}
: ${INCOMING_REQUESTS_DURATION=10.}
: ${INCOMING_REQUESTS_PER_SEC=100.}
: ${INFERENCE_DYNAMIC_BATCHING_BUFFER_SIZE_GB=50.}
: ${INFERENCE_DYNAMIC_BATCHING_BUFFER_OVERFLOW_FACTOR=1.}
: ${INFERENCE_DYNAMIC_BATCHING_BUFFER_GUARANTEED_FRACTION=0.05}
: ${ENGINE=dynamic}
: ${EXTRA_ARGS=""}
# NSIGHT_PREFIX=/path/to/nsight/profile
# --inference-rng-tracker \ # ... re-add after bugfix.
ARGS=" \
--exit-on-missing-checkpoint \
--transformer-impl local \
--load ${CHECKPOINT_DIR} \
--tokenizer-type GPT2BPETokenizer \
--vocab-file ${VOCAB_FILE} \
--merge-file ${MERGE_FILE} \
--max-position-embeddings 2048 \
--seq-length 2048 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--num-attention-heads 16 \
--hidden-size 1024 \
--bf16 \
--micro-batch-size 1 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--seed 42 \
--use-flash-attn \
\
--inference-dynamic-batching \
--inference-dynamic-batching-buffer-size-gb ${INFERENCE_DYNAMIC_BATCHING_BUFFER_SIZE_GB} \
--inference-dynamic-batching-buffer-overflow-factor ${INFERENCE_DYNAMIC_BATCHING_BUFFER_OVERFLOW_FACTOR} \
--inference-dynamic-batching-buffer-guaranteed-fraction ${INFERENCE_DYNAMIC_BATCHING_BUFFER_GUARANTEED_FRACTION} \
\
--enable-cuda-graph \
${EXTRA_ARGS} \
"
if [[ -v PROMPTS ]]; then
ARGS+=" --prompts ${PROMPTS}"
else
ARGS+=" \
--num-tokens-to-prompt ${NUM_TOKENS_TO_PROMPT} \
--num-tokens-to-generate ${NUM_TOKENS_TO_GENERATE} \
--incoming-requests-duration ${INCOMING_REQUESTS_DURATION} \
--incoming-requests-per-sec ${INCOMING_REQUESTS_PER_SEC} \
"
fi
CMD="python -m examples.inference.gpt.gpt_${ENGINE}_inference ${ARGS}"
if [[ -v NSIGHT_PREFIX ]]; then
CMD="nsys profile -t cuda,nvtx,mpi -s none --wait=primary --show-output=true --force-overwrite=true --export=sqlite -o ${NSIGHT_PREFIX} ${CMD}"
fi
eval ${CMD}
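#
# Example invocation (a sketch; paths are hypothetical, script filename assumed):
#
#   CHECKPOINT_DIR=/path/to/gpt_357m_checkpoint \
#   VOCAB_FILE=/path/to/gpt2-vocab.json \
#   MERGE_FILE=/path/to/gpt2-merges.txt \
#   bash ./gpt_dynamic_inference_357m.sh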
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
import os
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
InferenceWrapperConfig,
)
from pretrain_mamba import model_provider as mamba_model_provider
from pretrain_gpt import model_provider as gpt_model_provider
import torch
import sys
import time
import tqdm
import warnings
from argparse import Namespace
from megatron.core.inference.contexts import StaticInferenceContext
from megatron.core.inference.engines import StaticInferenceEngine
from megatron.core.inference.sampling_params import SamplingParams
from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
GPTInferenceWrapper,
)
from megatron.core.inference.inference_request import InferenceRequest
from megatron.core.inference.text_generation_controllers.text_generation_controller import (
TextGenerationController,
)
from megatron.core.transformer.module import MegatronModule
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
)
from megatron.training import get_args, get_tokenizer, print_rank_0
from megatron.training.checkpointing import load_checkpoint
from megatron.core import mpu
import json
from megatron.training.initialize import initialize_megatron
from megatron.training import get_model
import asyncio
from typing import AsyncIterator, List
from examples.inference.gpt.utils import add_common_inference_args, build_requests
def add_static_inference_args(parser):
"""Static inference arguments."""
add_common_inference_args(parser)
group = parser.add_argument_group(title='Static inference')
group.add_argument(
"--max-batch-size",
type=int,
default=None,
dest="max_batch_size",
help='Deprecated, use `--inference-max-requests` instead',
)
group.add_argument("--stream", action="store_true", default=False, help="Stream output tokens")
group.add_argument(
"--output-path", type=str, default=None, help="Path to save generations as JSON"
)
return parser
def get_inference_engine(args: Namespace, model: MegatronModule) -> StaticInferenceEngine:
"""Utility to get the relevant backend for running inference
This function will automatically choose the TRTLLMBackend when possible, and if not revert to Mcore backend if the user does not specify any backends. TRT LLM Backend is not implmented yet.
Args:
args (Namespace): The user arguments parsed from command line
model (MegatronModule): The megatron model .
Returns:
AbstractBackend: The chosen backend
"""
tokenizer = get_tokenizer()
inference_wrapper_config = InferenceWrapperConfig(
hidden_size=args.hidden_size,
inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold,
fp32_residual_connection=args.fp32_residual_connection,
params_dtype=args.params_dtype,
padded_vocab_size=args.padded_vocab_size,
inference_max_requests=args.inference_max_batch_size,
inference_max_seq_length=args.inference_max_seq_length,
nccl_all_reduce_for_prefill=args.nccl_all_reduce_for_prefill,
)
inference_context = StaticInferenceContext.from_config(inference_wrapper_config)
inference_wrapped_model = GPTInferenceWrapper(
model, inference_wrapper_config, inference_context
)
text_generation_controller = TextGenerationController(
inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer
)
return StaticInferenceEngine(text_generation_controller=text_generation_controller)
async def generate(
inference_engine: StaticInferenceEngine, sampling_params: SamplingParams, prompts: List[str]
) -> List[InferenceRequest]:
async def collect_stream(prompt, request_id, stream_generator):
print(f"Request {request_id}: {prompt}", end="", flush=True)
prev_idx = 0
async for output in stream_generator:
print(output.generated_text[prev_idx:], end="", flush=True)
prev_idx = len(output.generated_text)
print()
request_ids: List[str] = [
inference_engine.add_request(prompt=prompt, sampling_params=sampling_params, streaming=True)
for prompt in prompts
]
stream_generators = [
inference_engine.get_stream_generator(request_id) for request_id in request_ids
]
tasks = [
asyncio.create_task(collect_stream(prompt, request_id, stream_generator))
for (prompt, request_id, stream_generator) in zip(prompts, request_ids, stream_generators)
]
await inference_engine.run_engine_async()
await asyncio.gather(*tasks)
results: List[InferenceRequest] = [
inference_engine.scheduler.completed_request_pool[request_id] for request_id in request_ids
]
return results
def main():
"""Main program."""
# Note: the defaults passed via `args_defaults` below can be overridden with the corresponding command-line arguments (see arguments.py).
# The micro batch size does not need to be set by the user; it is derived from the --inference-batch-times-seqlen-threshold argument.
initialize_megatron(
extra_args_provider=add_static_inference_args,
args_defaults={
'no_load_rng': True,
'no_load_optim': True,
'micro_batch_size': 1,
'exit_on_missing_checkpoint': True,
},
)
args = get_args()
if args.max_batch_size is not None:
warnings.warn(
f"`--max-batch-size` has been deprecated in favor of `--inference-max-requests`."
)
args.inference_max_batch_size = max(args.max_batch_size, args.inference_max_batch_size)
# Set up model and load checkpoint
if args.model_provider == "gpt":
model_provider = gpt_model_provider
elif args.model_provider == "mamba":
model_provider = mamba_model_provider
else:
raise ValueError(f"Invalid model provider {args.model_provider}")
model = get_model(model_provider, wrap_with_ddp=False)
load_checkpoint(model, None, None, strict=False)
model = model[0]
inference_engine = get_inference_engine(args, model)
sampling_params = SamplingParams(
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p,
return_log_probs=args.return_log_probs,
num_tokens_to_generate=args.num_tokens_to_generate,
top_n_logprobs=args.top_n_logprobs,
)
requests = build_requests(args, get_tokenizer())
prompts = [r.prompt_text for r in requests]
if args.enable_cuda_graph:
print(f"Running warmup for CUDA graphs...")
inference_engine.generate(
prompts=["warmup"], sampling_params=SamplingParams(num_tokens_to_generate=10)
)
start_time = time.perf_counter()
if args.stream:
results: List[InferenceRequest] = asyncio.run(
generate(inference_engine, sampling_params, prompts)
)
else:
results: List[InferenceRequest] = inference_engine.generate(
prompts=prompts, sampling_params=sampling_params
)
end_time = time.perf_counter()
latency = end_time - start_time
if torch.distributed.get_rank() == 0:
for idx, result in enumerate(results):
print(f' \n------------- RESULT FOR PROMPT {idx} --------------- ')
result_dict = {
'id': result.request_id,
'input_prompt': result.prompt,
'generated_text': result.generated_text,
'generated_tokens': result.generated_tokens,
'latency': latency,
}
if sampling_params.top_n_logprobs > 0:
result_dict['generated_top_n_logprobs'] = result.generated_top_n_logprobs
if sampling_params.return_log_probs:
response_logprobs = result.prompt_log_probs + result.generated_log_probs
result_dict["logprobs"] = response_logprobs
# Write results to JSON. Primarily used for functional testing.
if args.output_path:
# Tensors cannot be serialized, so move them to the CPU first.
result_dict['generated_tokens'] = result_dict['generated_tokens'].cpu().numpy().tolist()
with open(args.output_path, 'w') as f:
json.dump(result_dict, f)
# Print unique prompts + outputs.
if torch.distributed.get_rank() == 0:
print("~~~~ Unique prompts + outputs. ~~~~")
# Map results by their prompt.
from collections import defaultdict
unique_prompt_map = defaultdict(list)
for result_idx, result in enumerate(results):
unique_prompt_map[result.prompt].append(result_idx)
# Print unique prompts + outputs.
for unique_idx, (prompt_text, result_idxs) in enumerate(unique_prompt_map.items()):
result_idx = result_idxs[0]
result = results[result_idx]
generated_text = result.generated_text.replace("\n", "\\n")
print(
f"{unique_idx}/{len(unique_prompt_map)} [{len(result_idxs)}]. {prompt_text} "
f"... {generated_text}"
)
stats = torch.cuda.memory_stats()
print_rank_0(
"static | cg %d | %s | reqs %d [ batch %d ] ... mem %.1f/%.1f ... time %.3f."
% (
args.enable_cuda_graph,
(
f"<user prompts>"
if args.prompts
else "<auto prompts> %s, %d, %.1e, %.1e"
% (
"(%s)" % " ".join(map(str, args.num_tokens_to_prompt)),
args.num_tokens_to_generate,
args.incoming_requests_duration,
args.incoming_requests_per_sec,
)
),
len(requests),
args.inference_max_batch_size,
stats["allocated_bytes.all.peak"] / (1024**3),
stats["reserved_bytes.all.peak"] / (1024**3),
latency,
)
)
torch.distributed.destroy_process_group()
if __name__ == "__main__":
main()
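# This module is what the shell scripts above invoke when ENGINE=static, i.e.
# `python -m examples.inference.gpt.gpt_static_inference <args>`; the dynamic
# scripts default to ENGINE=dynamic and use the corresponding dynamic module.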
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
import random
import time
import torch
from argparse import ArgumentParser, Namespace
from typing import Any, List
def add_common_inference_args(parser: ArgumentParser) -> ArgumentParser:
"""Common inference arguments."""
group = parser.add_argument_group(title='Common inference')
group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.')
group.add_argument("--top_k", type=int, default=1, help='Top k sampling.')
group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.')
group.add_argument(
"--return-log-probs",
action='store_true',
default=False,
help='Return the log probabilities of the final output tokens',
)
group.add_argument(
"--prompts",
metavar='N',
type=str,
nargs='+',
help='Input prompts, each enclosed in quotes and separated by spaces',
)
group.add_argument(
"--num-tokens-to-prompt",
type=int,
nargs="+",
default=[64, 1024],
help='Number of tokens to use for simulated prompts. This should be a '
'space-separated pair of integers, and the generated prompt lengths will '
'be uniformly sampled within this range.',
)
group.add_argument(
"--num-tokens-to-generate",
type=int,
default=30,
help='Number of tokens to generate for each prompt',
)
group.add_argument(
"--top-n-logprobs",
type=int,
default=0,
help='Return the top-n log probabilities and their corresponding tokens for each generated token, as a dictionary',
)
group.add_argument(
"--incoming-requests-per-sec",
type=float,
default=100.0,
help="Simulated number of requests per second.",
)
group.add_argument(
"--incoming-requests-duration",
type=float,
default=10.0,
help="Total amount of time to simulate that requests are "
"arriving. Multiply this value with "
"`--incoming-requests-per-sec` to get the approximate "
"total number of requests.",
)
group.add_argument(
"--model-provider", choices=["mamba", "gpt"], default="gpt", help="Model provider"
)
return parser
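# Worked example (hypothetical values): with --incoming-requests-per-sec 100 and
# --incoming-requests-duration 10, roughly 100 * 10 = 1000 simulated requests are
# generated, each with a prompt length drawn uniformly from the
# --num-tokens-to-prompt range (64 to 1024 tokens by default).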
def get_curr_time() -> float:
"""Get synchronized time across ranks."""
curr_time = torch.cuda.LongTensor([time.time_ns()])
if torch.distributed.is_initialized():
torch.distributed.broadcast(curr_time, src=0)
return curr_time.item() / 10**9
class Request:
"""Class to hold attributes for a single request.
A request is initialized with its prompt text. As it is added, processed,
and completed through the inference engine, the request is populated with its
start time, end time, and output tokens.
Args:
prompt_text (str): Prompt text.
time_offset (float): Artificial time offset for simulating incoming
requests. This value is later added to the `base_arrival_time` to
simulate the request's arrival time.
tokenizer (Any): Tokenizer for tokenizing the prompt.
"""
def __init__(self, prompt_text: str, time_offset: float, tokenizer: Any):
self.prompt_text = prompt_text
self.prompt_tokens = tokenizer.tokenize(prompt_text)
self.output_text = None
self.output_tokens = []
self.time_offset = time_offset
self.time_arrival = None
self.time_start = None
self.time_end = None
self.state = "not-started"
def __str__(self) -> str:
return "state '%s'; prompt len %d; output len %d; '%s'" % (
self.state,
len(self.prompt_tokens),
len(self.output_tokens),
self.prompt_text,
)
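# Illustration of the timing fields (values are hypothetical): if the engine
# records base_arrival_time = t0 when it starts accepting requests, a request
# constructed with time_offset = 0.37 is treated as arriving at t0 + 0.37
# seconds; time_start and time_end are filled in when processing begins and
# completes.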
def get_user_requests(args: Namespace, tokenizer: Any) -> List[Request]:
"""Build requests directly from the user-provided `--prompts`."""
requests = [Request(p, -1.0, tokenizer) for p in args.prompts]
return requests
def get_auto_requests(args: Namespace, tokenizer: Any) -> List[Request]:
"""Get example requests."""
import simpy  # Imported locally so tests that do not use auto requests can run without simpy.
random.seed(args.seed)
# Generate random time offsets.
def arrival(r):
while True:
yield env.timeout(random.expovariate(r))
time_offsets.append(env.now)
time_offsets = []
env = simpy.Environment()
env.process(arrival(args.incoming_requests_per_sec))
env.run(args.incoming_requests_duration)
# Ensure at least a single request.
if len(time_offsets) == 0:
time_offsets = [0.0]
# Initialize requests.
requests = [
Request("hi " * random.randint(*args.num_tokens_to_prompt), t, tokenizer)
for t in time_offsets
]
return requests
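# The simpy process above simulates a Poisson arrival stream: inter-arrival
# times are drawn from an exponential distribution with rate
# `incoming_requests_per_sec`. Below is a minimal equivalent sketch without
# simpy (the helper name `poisson_arrival_offsets` is illustrative only and is
# not used elsewhere in this module):
def poisson_arrival_offsets(rate_per_sec: float, duration_sec: float) -> List[float]:
    """Arrival-time offsets of a Poisson process, truncated at `duration_sec`."""
    offsets = []
    t = random.expovariate(rate_per_sec)
    while t < duration_sec:
        offsets.append(t)
        t += random.expovariate(rate_per_sec)
    # Ensure at least a single request, matching get_auto_requests above.
    return offsets or [0.0]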
def build_requests(args: Namespace, tokenizer: Any) -> List[Request]:
"""Return user-provided requests if `--prompts` is given, otherwise simulated requests."""
if args.prompts:
return get_user_requests(args, tokenizer)
else:
return get_auto_requests(args, tokenizer)
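# Example wiring (a sketch; `args` comes from the inference scripts above and
# `get_tokenizer` from megatron.training, as used in the main programs):
#   tokenizer = get_tokenizer()
#   requests = build_requests(args, tokenizer)    # user prompts or simulated ones
#   prompts = [r.prompt_text for r in requests]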