Initial commit

4e867b3c · jerrrrry · 4e867b3c · 4e867b3c · 4e867b3c · 4e867b3c
Commit 4e867b3c authored Aug 06, 2025 by jerrrrry
20 changed files
--- a/Megatron-LM/examples/gpt3/README.md
+++ b/Megatron-LM/examples/gpt3/README.md
+# GPT3 MODEL
+
+## Table of contents
+- [1. Training Setup](#1-training-setup)
+- [2. Configurations](#2-configurations)
+- [3. Training Results](#3-training-results)
+
+## 1. Training setup
+<a id="markdown-training-setup" name="training-setup"></a>
+
+To run the model using a docker container run it as follows
+```
+PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
+CHECKPOINT_PATH="" #<Specify path>
+TENSORBOARD_LOGS_PATH="" #<Specify path>
+VOCAB_FILE="" #<Specify path to file>/gpt2-vocab.json
+MERGE_FILE="" #<Specify path to file>/gpt2-merges.txt
+DATA_PATH="" #<Specify path and file prefix>_text_document
+
+docker run \
+  --gpus=all \
+  --ipc=host \
+  --workdir /workspace/megatron-lm \
+  -v /path/to/data:/path/to/data \
+  -v /path/to/megatron-lm:/workspace/megatron-lm \
+  $PYTORCH_IMAGE \
+  bash examples/gpt3/train_gpt3_175b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $MERGE_FILE $DATA_PATH
+
+```
+NOTE: Depending on the environment you are running in, the above command might look slightly different.
+
+
+## 2. Configurations
+<a id="markdown-configurations" name="configurations"></a>
+The example in this folder shows you how to run the 175B model. There are other configs you could run as well:
+
+### 345M
+```
+       --num-layers 12 \
+       --hidden-size 512 \
+       --num-attention-heads 8 \
+       --seq-length 1024 \
+       --tensor-model-parallel-size 1 \
+       --pipeline-model-parallel-size 1 \
+
+```
+
+### 857M
+```
+       --num-layers 24 \
+       --hidden-size 1024 \
+       --num-attention-heads 16 \
+       --seq-length 2048 \
+       --tensor-model-parallel-size 1 \
+       --pipeline-model-parallel-size 1 \
+
+```
--- a/Megatron-LM/examples/gpt3/gpt_config.yaml
+++ b/Megatron-LM/examples/gpt3/gpt_config.yaml
+# WARNING: YAML configs are currently an experimental feature
+language_model:
+  # model architecture
+  num_layers: 24
+  hidden_size: 1024
+  num_attention_heads: 16
+  num_query_groups: null
+
+  ffn_hidden_size: null
+  kv_channels: null
+  hidden_dropout: 0.0
+  attention_dropout: 0.0
+  fp32_residual_connection: False
+
+  apply_residual_connection_post_layernorm: False
+  layernorm_epsilon: 1.e-5
+  layernorm_zero_centered_gamma: True
+  add_bias_linear: False
+  bias_activation_fusion: False
+  add_qkv_bias: False
+  gated_linear_unit: False
+  activation_func: swiglu
+  num_moe_experts: null
+  rotary_interleaved: False
+  window_size: null
+
+  # initialization
+  init_method: null
+  init_method_std: 0.02
+  output_layer_init_method: null
+
+  # mixed-precision
+  apply_query_key_layer_scaling: False
+  attention_softmax_in_fp32: False
+
+  # fusion
+  bias_swiglu_fusion: True
+  masked_softmax_fusion: True
+  persist_layer_norm: False
+  memory_efficient_layer_norm: False
+  bias_dropout_fusion: True
+  apply_rope_fusion: True
+
+  # activation recomputation
+  recompute_granularity: null
+  recompute_method: null
+  recompute_num_layers: null
+  distribute_saved_activations: null
+
+  # fp8 related
+  fp8: null
+  fp8_margin: 0
+  fp8_interval: 1
+  fp8_amax_history_len: 1
+  fp8_amax_compute_algo: "most_recent"
+  fp8_wgrad: True
+
+  # miscellaneous
+  clone_scatter_output_in_embedding: True
+
+  normalization: "LayerNorm"  # alt value supported by TE: "RMSNorm"
+
+  # MoE related
+  moe_router_load_balancing_type: "aux_loss"
+  moe_router_topk: 2
+  moe_router_group_topk: null
+  moe_router_num_groups: null
+  moe_grouped_gemm: False
+  moe_aux_loss_coeff: 0  # 1e-2 would be a good start value for load balance loss.
+  moe_z_loss_coeff: null  # 1e-3 would be a good start value for z-loss
+  moe_input_jitter_eps: null
+  moe_token_dropping: False
+
+model_parallel:
+  # Model parallelism
+  tensor_model_parallel_size: 1
+  context_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  virtual_pipeline_model_parallel_size: null
+  sequence_parallel: True
+  expert_model_parallel_size: 1
+
+  # Initialization
+  perform_initialization: True
+  use_cpu_initialization: null
+
+  # Training
+  fp16: False
+  bf16: True
+  params_dtype: null # Set from above arguments for core
+  timers: null
+
+  # Optimizations
+  gradient_accumulation_fusion: True
+  async_tensor_model_parallel_allreduce: True
+  tp_comm_overlap: False
+
+  # Debug Options
+  tp_comm_split_ag: True
+  tp_comm_atomic_ag: True
+  tp_comm_split_rs: True
+  tp_comm_atomic_rs: True
+  tp_comm_bulk_wgrad: True
+  tp_comm_bulk_dgrad: True
+
+  # Parallelism
+  finalize_model_grads_func: null
+
+  # Pipeline Parallel
+  pipeline_dtype: null
+  grad_scale_func: null
+  enable_autocast: False
+  autocast_dtype: null
+  variable_seq_lengths: False
+  num_microbatches_with_partial_activation_checkpoints: null
+  overlap_p2p_comm: False
+  batch_p2p_comm: True
+  batch_p2p_sync: True
+  use_ring_exchange_p2p: False
+  deallocate_pipeline_outputs: False
+  no_sync_func: null
+  grad_sync_func: null
+  param_sync_func: null
+  pipeline_model_parallel_split_rank: null
+
+  # CPU Offloading
+  cpu_offloading: False
+  cpu_offloading_num_layers: 0
+  _cpu_offloading_context: null
+  cpu_offloading_weights: False
+  cpu_offloading_activations: True
+
+  # Timing
+  barrier_with_L1_time: True
+
+# training:
+use_legacy_models: False
+spec: null
+micro_batch_size: 2
+global_batch_size: 128
+rampup_batch_size: [32, 32, 65324160] 
+check_for_nan_in_loss_and_grad: True
+num_layers_per_virtual_pipeline_stage: null
+
+encoder_num_layers: null
+decoder_num_layers: null
+rotary_seq_len_interpolation_factor: null
+add_position_embedding: False
+make_vocab_size_divisible_by: 128
+group_query_attention: False
+
+
+exit_signal_handler: False
+exit_duration_in_mins: null
+exit_interval: null
+
+untie_embeddings_and_output_weights: True
+position_embedding_type: rope
+rotary_percent: 0.5
+openai_gelu: False
+squared_relu: False
+swiglu: True
+onnx_safe: null
+bert_binary_head: True
+max_position_embeddings: 4096
+
+transformer_impl: local
+use_flash_attn: False
+seed: 1234
+data_parallel_random_init: False
+
+# Optimizer
+optimizer: adam
+lr: 2.5e-4
+lr_decay_style: cosine
+lr_decay_iters: null
+lr_decay_samples: 255126953
+lr_warmup_fraction: null
+lr_warmup_iters: 0
+lr_warmup_samples: 81381
+lr_warmup_init: 0.0
+min_lr: 2.5e-5
+weight_decay: 0.1
+start_weight_decay: null
+end_weight_decay: null
+weight_decay_incr_style: constant
+clip_grad: 1.0
+adam_beta1: 0.9
+adam_beta2: 0.95
+adam_eps: 1.e-08
+sgd_momentum: 0.9
+override_opt_param_scheduler: False
+use_checkpoint_opt_param_scheduler: False
+
+# checkpointing arguments
+save: null
+save_interval: 20000
+no_save_optim: null
+no_save_rng: null
+load: null
+no_load_optim: null
+no_load_rng: null
+finetune: False
+use_checkpoint_args: False
+exit_on_missing_checkpoint: False
+
+# loss arguments
+loss_scale: null
+initial_loss_scale: 4294967296
+min_loss_scale: 1.0
+loss_scale_window: 1000 
+hysteresis: 2
+accumulate_allreduce_grads_in_fp32: False
+fp16_lm_cross_entropy: False
+
+# distributed arguments
+distributed_backend: nccl
+distributed_timeout_minutes: 10
+overlap_grad_reduce: False
+align_grad_reduce: True
+overlap_param_gather: False
+align_param_gather: False
+scatter_gather_tensors_in_pipeline: True
+local_rank: null
+lazy_mpu_init: null
+empty_unused_memory_level: 0
+standalone_embedding_stage: False
+use_distributed_optimizer: False
+nccl_communicator_config_path: null
+
+train_iters: null
+eval_iters: 32
+eval_interval: 2000
+skip_train: False
+
+adlr_autoresume: False
+adlr_autoresume_interval: 1000
+
+# garbage collection
+manual_gc: False
+manual_gc_interval: 0
+manual_gc_eval: True
+
+tp_comm_overlap_cfg: null
+
+#data
+data_path: null
+split: '99,1,0'
+train_data_path: null
+valid_data_path: null
+test_data_path: null
+data_cache_path: null
+mock_data: False
+vocab_size: null
+vocab_file: null
+merge_file: null
+vocab_extra_ids: 0
+seq_length: 4096
+encoder_seq_length: null
+decoder_seq_length: null
+retriever_seq_length: 256
+sample_rate: 1.0
+mask_prob: 0.15
+short_seq_prob: 0.1
+num_workers: 2
+tokenizer_type: GPTSentencePieceTokenizer
+tokenizer_model: null
+reset_position_ids: False
+reset_attention_mask: False
+eod_mask_loss: False
+train_samples: 268554688
+dataloader_type: null
+
+#profile:
+profile: False
+profile_ranks: [0]
+profile_step_end: 12
+profile_step_start: 10
+
+#logging:
+log_params_norm: True
+log_num_zeros_in_grad: True
+log_throughput: False
+log_progress: False
+timing_log_level: 0
+timing_log_option: minmax
+tensorboard_log_interval: 1
+tensorboard_queue_size: 1000
+log_timers_to_tensorboard: False
+log_validation_ppl_to_tensorboard: False
+log_memory_to_tensorboard: False
+log_world_size_to_tensorboard: False
+log_loss_scale_to_tensorboard: True
+wandb_project: ''
+wandb_exp_name: ''
+wandb_save_dir: ''
+enable_one_logger: True
+one_logger_project: megatron-lm
+one_logger_run_name: null
+log_interval: 100
+tensorboard_dir: null
--- a/Megatron-LM/examples/gpt3/train_gpt3_175b_distributed.sh
+++ b/Megatron-LM/examples/gpt3/train_gpt3_175b_distributed.sh
+#!/bin/bash
+
+# Runs the "175B" parameter model
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NUM_NODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
+
+CHECKPOINT_PATH=$1 #<Specify path>
+TENSORBOARD_LOGS_PATH=$2 #<Specify path>
+VOCAB_FILE=$3 #<Specify path to file>/gpt2-vocab.json
+MERGE_FILE=$4 #<Specify path to file>/gpt2-merges.txt
+DATA_PATH=$5 #<Specify path and file prefix>_text_document
+
+DISTRIBUTED_ARGS=(
+    --nproc_per_node $GPUS_PER_NODE 
+    --nnodes $NUM_NODES 
+    --master_addr $MASTER_ADDR 
+    --master_port $MASTER_PORT
+)
+
+GPT_MODEL_ARGS=(
+    --num-layers 96 
+    --hidden-size 12288 
+    --num-attention-heads 96 
+    --seq-length 2048 
+    --max-position-embeddings 2048 
+    --attention-backend auto # Can use (flash/fused/unfused/local)
+)
+
+TRAINING_ARGS=(
+    --micro-batch-size 1 
+    --global-batch-size 1536 
+    --rampup-batch-size 16 16 5859375 
+    --train-iters 500000 
+    --weight-decay 0.1 
+    --adam-beta1 0.9 
+    --adam-beta2 0.95 
+    --init-method-std 0.006 
+    --clip-grad 1.0 
+    --fp16
+    --lr 6.0e-5 
+    --lr-decay-style cosine 
+    --min-lr 6.0e-6
+    --lr-warmup-fraction .001 
+    --lr-decay-iters 430000 
+)
+
+MODEL_PARALLEL_ARGS=(
+	--tensor-model-parallel-size 8 
+	--pipeline-model-parallel-size 16 
+)
+
+DATA_ARGS=(
+    --data-path $DATA_PATH 
+    --vocab-file $VOCAB_FILE 
+    --merge-file $MERGE_FILE 
+    --split 949,50,1
+)
+
+EVAL_AND_LOGGING_ARGS=(
+    --log-interval 100
+    --save-interval 10000 
+    --eval-interval 1000 
+    --save $CHECKPOINT_PATH 
+    --load $CHECKPOINT_PATH 
+    --eval-iters 10
+    --tensorboard-dir $TENSORBOARD_LOGS_PATH 
+)
+
+torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
+    ${GPT_MODEL_ARGS[@]} \
+    ${TRAINING_ARGS[@]} \
+    ${MODEL_PARALLEL_ARGS[@]} \
+    ${DATA_ARGS[@]} \
+    ${EVAL_AND_LOGGING_ARGS[@]}
--- a/Megatron-LM/examples/inference/README.md
+++ b/Megatron-LM/examples/inference/README.md
+### Megatron Core Inference Documentation
+This guide provides an example for Megatron Core for running model inference. 
+
+### Contents
+- [Megatron Core Inference Documentation](#megatron-core-inference-documentation)
+- [Contents](#contents)
+  - [1. Quick Start](#1-quick-start)
+    - [1.1 Code Walkthrough](#11-code-walkthrough)
+    - [1.2 Running The Code](#12-running-the-code)
+  - [2. Control Flow in the MCore Backend](#2-control-flow-in-the-mcore-backend)
+  - [3. Customizing The Inference Pipeline](#3-customizing-the-inference-pipeline)
+    - [3.1. Create Your Own Inference Backend](#31-create-your-own-inference-backend)
+    - [3.2. Implement a new Sampling Loop](#32-implement-a-new-sampling-loop)
+    - [3.3. Support Other Models](#33-support-other-models)
+    - [3.3. Modify Inference Parameters](#33-modify-inference-parameters)
+  - [4. Future work](#4-future-work)
+
+<br>
+
+#### 1. Quick Start
+This example runs batch inference on a GPT model trained using Megatron Core. The entrypoint is [gpt_batch_inference.py](./gpt/gpt_batch_inference.py).
+
+<br>
+
+##### 1.1 Code Walkthrough 
+***STEP 1 - Initialize model parallel and other default arguments***
+The micro batch size is set to 1 because it is not used for tensor-parallel-only models, and for pipeline-parallel models it is calculated at runtime.
+```python
+    initialize_megatron(
+        args_defaults={'no_load_rng': True, 'no_load_optim': True, 'micro_batch_size': 1}
+    )
+```
+
+***STEP 2 - Load the model using the model_provider_function***
+NOTE: The model provider function supports both MCore and Legacy models. 
+
+```python
+    model = get_model(model_provider, wrap_with_ddp=False)
+    load_checkpoint(model, None, None)
+    model = model[0]
+```
+
+***STEP 3 - Choose an engine***
+Text generation requires an inference engine, which includes a scheduler. The default engine is the [Megatron Core engine](../../megatron/core/inference/engines/mcore_engine.py) with a simple [text generation controller](../../megatron/core/inference/text_generation_controllers/text_generation_controller.py). TRTLLMEngine will be supported in the future.
+```python
+    inference_wrapped_model = GPTInferenceWrapper(model, args)
+    text_generation_controller = TextGenerationController(
+        inference_wrapped_model=inference_wrapped_model, 
+        tokenizer=tokenizer
+    )
+    inference_engine = MCoreEngine(
+        text_generation_controller=text_generation_controller, max_batch_size=args.max_batch_size
+    )
+```
+
+***STEP 4 - Run text generation***
+The [SamplingParams](../../megatron/core/inference/sampling_params.py) contains suggested defaults. Customize this to change top_p, top_k, number of tokens to generate etc. 
+*Note: The result is returned as a list of [InferenceRequests](../../megatron/core/inference/inference_request.py)*
+```python
+    results: List[InferenceRequest] = inference_engine.generate(
+        prompts=args.prompts, sampling_params=sampling_params
+    )
+    
+    if torch.distributed.get_rank() == 0:
+        for idx, result in enumerate(results):
+            print(f' ------------- RESULT FOR PROMPT {idx} --------------- ')
+            result = {
+                'id': result.request_id,
+                'input_prompt': result.prompt, 
+                'generated_text': result.generated_text,
+                'generated_tokens' : result.generated_tokens
+                }
+            print(result)
+```
+
+<br>
+
+##### 1.2 Running The Code
+An example run script is shown below. Set the tokenizer paths, inference params, and other settings appropriately. 
+
+For a quick recap on sampling parameters, refer to [this blog](https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-temperature-parameters-ed6a31313910).
+
+```
+# In a slurm cluster (You could also use docker)
+ACCOUNT=<account>
+MLM_PATH=/path/to/megatron-lm
+GPT_CKPT=/path/to/gpt/ckpt
+VOCAB_MERGE_FILE_PATH=/path/to/vocab/and/merge/file
+CONTAINER_IMAGE=nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.11
+
+srun --account $ACCOUNT \
+--job-name=$ACCOUNT:inference \
+--partition=batch \
+--time=01:00:00 \
+--container-image $CONTAINER_IMAGE \
+--container-mounts $MLM_PATH:/workspace/megatron-lm/,$GPT_CKPT:/workspace/mcore_gpt_ckpt,$VOCAB_MERGE_FILE_PATH:/workspace/tokenizer \
+--no-container-mount-home \
+--pty /bin/bash \
+
+# Inside the container run the following. 
+
+cd megatron-lm/
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+TOKENIZER_ARGS=(
+    --vocab-file /workspace/tokenizer/gpt2-vocab.json
+    --merge-file /workspace/tokenizer/gpt2-merges.txt
+    --tokenizer-type GPT2BPETokenizer
+)
+
+MODEL_ARGS=(
+    --use-checkpoint-args
+    --use-mcore-models
+    --load /workspace/mcore_gpt_ckpt
+)
+
+INFERENCE_SPECIFIC_ARGS=(
+    --attention-dropout 0.0
+    --hidden-dropout 0.0
+    --num-tokens-to-generate 20
+    --max-batch-size 4
+)
+
+torchrun --nproc-per-node=4 examples/inference/gpt/simple_gpt_batch_inference.py \
+    ${TOKENIZER_ARGS[@]} \
+    ${MODEL_ARGS[@]} \
+    ${INFERENCE_SPECIFIC_ARGS[@]} \
+    --prompts "prompt one " "sample prompt two" "sample prompt 3"
+
+NOTE: Other parameters which can be customized for inference are :-
+--temperature (Sampling temperature)
+--top_k (top_k sampling)
+--top_p (top_p sampling)
+--num-tokens-to-generate (Number of tokens to generate for each prompt)
+--inference-batch-times-seqlen-threshold (During inference, if batch-size times sequence-length is smaller than this threshold then we will not use pipelining, otherwise we will.)
+--use-dist-ckpt (If using dist checkpoint format for the model)
+--use-legacy-models (If using legacy gpt model instead of mcore gpt model)
+
+```
+
+
+<br>
+
+
+#### 2. Control Flow in the MCore Backend
+An example of inference with static batching is provided in [gpt_batch_inference.py](./gpt/gpt_batch_inference.py).
+* [mcore_engine](../../megatron/core/inference/engines/mcore_engine.py) **generate()** function is called with the input prompts.
+* The `Scheduler` in the engine will add these prompts to the [active requests pool](../../megatron/core/inference/inference_request.py) until the max batch size is hit. Remaining requests will be added to the waiting requests pool.
+* The engine will run until all requests (waiting + active) are completed. 
+    * The active requests are passed into  **generate_all_output_tokens_static_batch()** of the text generation controller . 
+    * This function uses the **prep_model_for_inference()** method of the [model_inference_wrappers](../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) and runs an autoregressive sampling loop
+    * In the autoregressive loop, the **get_batch_for_context_window()** method of the inference wrapper is called to slice out the input tokens and masks
+    * Input tokens and masks are passed into the **run_one_forward_step()** method, which calls the model `.forward()` method to get the output logits
+    * Output logits are synchronized across all pipeline parallel ranks
+    * The text generation controller obtains the log probabilities and samples tokens based on the strategy defined in the sampling parameters.
+    * The sampled tokens are then appended to the input prompt tokens for the next iteration 
+    * The **update_generation_status()** method of the text generation controller checks which prompts have finished generating or hit a stop condition
+    * After the inference loop, the result is detokenized and stored as an attribute of the InferenceRequest. These requests are marked as completed. 
+    * The **update_requests_pool()** method of the scheduler moves completed requests into the completed request pool and waiting requests into the active request pool
+
+<br>
+
+#### 3. Customizing The Inference Pipeline
+
+The inference pipeline supports four levels of customization:
+
+* **Inference engine** - The MCore Engine is currently supported. Change this to add a new backend.
+* **Text generation controller** - The main sampling loop. This can be customized to support alternative tokenization, detokenization, or to implement a new sampling strategy.
+* **Inference Wrapped Model** - Change this to support a new model.
+* **Modify Inference Parameters** - Change this to update top_p, top_k, number of tokens to be generated, temperature, or other sampling parameters.
+
+<br>
+
+##### 3.1. Create Your Own Inference Backend 
+The [abstract_engine.py](./../../megatron/core/inference/engines/abstract_engine.py) file contains a `generate` method that can be extended to support a new backend.
+
+```python
+class AbstractEngine(ABC):
+    @staticmethod
+    def generate(self) -> dict:
+        """The abstract backend's generate function. 
+
+        To define a new backend, implement this method and return the outputs as a dictionary. 
+```
+
+<br>
+
+##### 3.2. Implement a new Sampling Loop 
+
+The [TextGenerationController](../../megatron/core/inference/text_generation_controllers/text_generation_controller.py) contains the main sampling loop and can be modified to support new tokenization, detokenization, or sampling strategies.
+
+``` python
+class TextGenerationController:
+
+    def tokenize_prompt(self, prompt: str) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Utility to tokenize the input prompts"""
+
+    def sample_from_logits(
+        self,
+        last_token_logits: torch.Tensor,
+        sampling_params: SamplingParams,
+        vocab_size: int,
+        generation_started : Optional[torch.Tensor] = None,
+        top_n_logprobs_dict: Dict[int, List[Dict[str, float]]] = None,
+    ) -> torch.Tensor:
+        """Samples the logits to generate outputs
+
+        Given the logits of the last token, this function samples according to the parameters defined in sampling_params and returns the sampled tokens. If sampling_params.top_n_logprobs > 0 
+        at each step it also updates the top_n_logprobs_dict.
+        """
+
+    def update_generation_status(
+        self,
+        updated_prompts_tokens: torch.Tensor,
+        generation_started: torch.Tensor,
+        current_context_end_position: int,
+        is_generation_done_tensor: torch.Tensor,
+        generated_sequence_lengths: torch.Tensor,
+    ) -> torch.Tensor:
+        """Function to check which prompts have reached an end condition
+
+        We check which prompts have reached an end condition and set the corresponding flags of the is_generation_done_tensor to True . The generated sequence lengths increases as we keep generating, until that prompts hits an eod condition. The generation started status tensor helps us determine which prompts have started generating
+        """
+
+    def generate_all_output_tokens_static_batch(
+        self, active_requests: OrderedDict[int, InferenceRequest],
+    ) -> OrderedDict[int, InferenceRequest]:
+        """Utility to generate all the output tokens and probabilities for the prompts .
+
+        This utility generates the output tokens for a static batch. It runs the forward steps till all prompts complete generation, updates the status of these requests to completed, adds the generated result and returns these requests
+        """
+
+    def detokenize_generations(self, prompt_tokens_with_generated_tokens: torch.Tensor) -> str:
+        """Detokenize the output generations"""
+```
+
+<br>
+
+##### 3.3. Support Other Models
+Extend [abstract_model_inference_wrapper.py](./../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) to support other models. The abstract model wrapper implements: 
+* Forward method which calls the model `forward` method depending on model parallel settings
+* Initializes the model and puts it in `.eval()` mode
+* Setup for the input parameters (max batch size, max seq length) 
+
+The following methods should be implemented: 
+```python
+class AbstractModelInferenceWrapper:
+    def prep_model_for_inference(self, prompts_tokens: torch.Tensor):
+        """A utility function for preparing model for inference
+
+        The function gets called once before the auto regressive inference loop. It puts the model in eval mode , and gets some model and inference data parameters. Extend this to build position ids ,attention mask etc, so that required slices can be extracted during the forward pass
+        """
+
+    @abc.abstractclassmethod
+    def get_batch_for_context_window(self) -> List:
+        """Returns the input data for inference 
+
+        This function gets called iteratively in the inference loop. It can be used to extract relevant input from the prompt tokens, attention mask etc. required for each step in inference.
+```
+
+Refer to [gpt_inference_wrapper.py](../../megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py) for an example of implementing this for GPTModel.
+
+<br>
+
+##### 3.3. Modify Inference Parameters
+We use  [common inference params](../../megatron/core/inference/sampling_params.py) for text generation. Customize this if you want to change top_p, top_k, number of tokens to generate etc. If you want to add other attributes that you would use in the inference loop, you can do that as shown below
+
+```
+from megatron.core.inference.sampling_params import SamplingParams
+
+c = SamplingParams(temperature=0.5)
+c.add_attributes({'min_length':4, 'eod_id':153})
+```
+
+<br>
+
+#### 4. Future work
+The following features are planned for future releases.
+* Dynamic batching 
+* Paged Attention
+* TRTLLM Engine support
+* Support for multimodal inference
\ No newline at end of file
--- a/Megatron-LM/examples/inference/gpt/gpt_dynamic_inference.py
+++ b/Megatron-LM/examples/inference/gpt/gpt_dynamic_inference.py
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+import torch
+from argparse import ArgumentParser
+from collections import defaultdict
+from tqdm import tqdm
+from typing import Dict, List, Tuple
+
+from megatron.core.inference.contexts import (
+    ContextOverflowError,
+    DynamicInferenceContext,
+)
+from megatron.core.inference.engines import DynamicInferenceEngine
+from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import GPTInferenceWrapper
+from megatron.core.inference.sampling_params import SamplingParams
+from megatron.core.inference.text_generation_controllers.text_generation_controller import TextGenerationController
+from megatron.core.transformer.module import MegatronModule
+from megatron.training import (
+    get_args,
+    get_model as _get_model,
+    get_tokenizer,
+    initialize_megatron,
+)
+from megatron.training.checkpointing import load_checkpoint
+from pretrain_gpt import model_provider
+
+from .utils import add_common_inference_args, build_requests, get_curr_time, Request
+
+
+def add_dynamic_inference_args(parser: ArgumentParser) -> ArgumentParser:
+    """Dynamic inference arguments."""
+
+    add_common_inference_args(parser)
+
+    return parser
+
+
+def get_model() -> MegatronModule:
+    """Initialize model and load checkpoint."""
+
+    args = get_args()
+
+    # Build model.
+    model = _get_model(model_provider, wrap_with_ddp=False)
+
+    # Load checkpoint.
+    assert args.load is not None
+    args.exit_on_missing_checkpoint = True
+    load_checkpoint(model, None, None)
+
+    # No virtual PP.
+    assert len(model) == 1, "Above condition should have caught this"
+    model = model[0]
+
+    # Eval mode.
+    model.eval()
+
+    return model
+
+
+def get_inference_context(
+    requests: List[Request],
+    sampling_params: SamplingParams,
+):
+    """The inference context manages the KV cache and other inference state."""
+
+    args = get_args()
+
+    # Max sequence length.
+    max_gen_length = sampling_params.num_tokens_to_generate
+    max_context_length = max(len(r.prompt_tokens) for r in requests)
+    max_sequence_length = max_context_length + max_gen_length
+
+    # Inference context.
+    context = DynamicInferenceContext(
+        params_dtype=args.params_dtype,
+        num_layers=args.num_layers,
+        kv_channels=args.kv_channels,
+        num_attention_heads=args.num_query_groups if args.group_query_attention else args.num_attention_heads,
+        max_sequence_length=max_sequence_length,
+        buffer_size_gb=args.inference_dynamic_batching_buffer_size_gb,
+        buffer_guaranteed_fraction=args.inference_dynamic_batching_buffer_guaranteed_fraction,
+        buffer_overflow_factor=args.inference_dynamic_batching_buffer_overflow_factor,
+        max_requests_override=args.inference_dynamic_batching_max_requests_override,
+        max_tokens_override=args.inference_dynamic_batching_max_tokens_override,
+    )
+
+    return context
+
+
+def get_inference_controller(
+    model: MegatronModule,
+    context: DynamicInferenceContext,
+) -> TextGenerationController:
+    """Buid text generation controller, which manages the model inference context.
+
+    Args:
+        model (MegatronModule): Megatron GPT model.
+        context (DynamicInferenceContext): Context for managing KV cache.
+
+    Return:
+        (TextGenerationController) Inference text generation controller.
+    """
+
+    args = get_args()
+    tokenizer = get_tokenizer()
+
+    # Wrap model in inference wrapper.
+    model = GPTInferenceWrapper(model, args, context)
+
+    # Note: the following is taken from AbstractModelInferenceWrapper.prep_model_for_inference().
+    from megatron.core import parallel_state
+    model.model_is_pipeline_parallel = not (
+        parallel_state.is_pipeline_first_stage() and
+        parallel_state.is_pipeline_last_stage()
+    )
+
+    # Text generation controller.
+    controller = TextGenerationController(model, tokenizer)
+
+    return controller
+
+
+def run_inference(
+    requests: List[Request],
+    sampling_params: SamplingParams,
+    engine: DynamicInferenceEngine,
+) -> None:
+    """Add requests to engine and generate tokens.
+
+    Args:
+        requests (List[Request]): Requests that are to be added and processed.
+        sampling_params (SamplingParams): Sampling params for the logits.
+        engine (DynamicInferenceEngine): Inference engine that manages generating tokens.
+
+    Return:
+        None.
+    """
+
+    # Initialize request arrival times.
+    base_arrival_time = get_curr_time()
+    for request in requests:
+        request.time_arrival = request.time_offset + base_arrival_time
+
+    # Add and process requests.
+    num_requests_total = len(requests)
+    num_requests_added = 0
+    num_requests_finished = 0
+    step_id = 0
+    step_times = {"prefill": [], "decode": []}
+    add_times = []
+    output_times = []
+    tbar = tqdm(total=num_requests_total)
+    while True:
+        curr_time = get_curr_time()
+
+        # Add requests with 'earlier' arrival time.
+        add_start = get_curr_time()
+        while num_requests_added < num_requests_total:
+            request = requests[num_requests_added]
+            if request.time_arrival > curr_time:
+                break
+            try:
+
+                # Using `prompt_text` instead of `prompt_tokens` for fair comparison.
+                engine.add_request(num_requests_added, request.prompt_text)
+                request.time_start = get_curr_time()
+                request.state = "started"
+                num_requests_added += 1
+                tbar.update(1)
+            except ContextOverflowError:
+                break
+        add_times.append(get_curr_time() - add_start)
+
+        # Step inference engine (i.e., generate a token for each active request).
+        is_decode_only = engine.context.is_decode_only()
+        result, step_time = engine.step(sampling_params, verbose=True)
+        step_id += 1
+
+        # Append output tokens.
+        if result is not None:
+
+            output_start = get_curr_time()
+
+            if is_decode_only:
+                step_times["decode"].append(step_time)
+            else:
+                step_times["prefill"].append(step_time)
+
+            request_ids, finished_request_ids, sample = result
+            request_ids = request_ids.tolist()
+            sample = sample.tolist()
+            for request_id, token in zip(request_ids, sample):
+                request = requests[request_id]
+                request.output_tokens.append(token)
+                if request_id in finished_request_ids:
+                    request.time_end = get_curr_time()
+                    request.state = "finished"
+                    num_requests_finished += 1
+
+            output_times.append(get_curr_time() - output_start)
+
+        # Check if all requests are finished.
+        if not (engine.has_unfinished_requests() or
+                num_requests_added < num_requests_total):
+            break
+
+    return step_times, add_times, output_times
+
+
+if __name__ == "__main__":
+
+    # Initialize Megatron.
+    initialize_megatron(
+        extra_args_provider=add_dynamic_inference_args,
+        args_defaults={'no_load_rng': True,
+                       'no_load_optim': True},
+    )
+
+    args = get_args()
+    tokenizer = get_tokenizer()
+
+    # Sampling params.
+    sampling_params = SamplingParams(
+        temperature=args.temperature,
+        top_k=args.top_k,
+        top_p=args.top_p,
+        return_log_probs=args.return_log_probs,
+        num_tokens_to_generate=args.num_tokens_to_generate,
+    )
+
+    # Requests, context, conroller.
+    model = get_model()
+    requests = build_requests(args, tokenizer)
+    context = get_inference_context(requests, sampling_params)
+    controller = get_inference_controller(model, context)
+
+    # Inference engine.
+    engine = DynamicInferenceEngine(controller,
+                                    context,
+                                    termination_id=tokenizer.eod,
+                                    enable_cuda_graph=args.enable_cuda_graph,
+                                    random_seed=args.seed)
+
+    # Print setup.
+    setup_prefix = "dynamic | cg %d | %s | bf %.0f, flw %.1f [r %d, t %d], gtd %.2f [r %d] ... reqs %d" % (
+        args.enable_cuda_graph,
+        (
+            f"<user prompts, n {len(args.prompts)}>"
+            if args.prompts else
+            "<auto prompts> %s, %d, %.1e, %.1e" % (
+                "(%s)" % " ".join(map(str, args.num_tokens_to_prompt)),
+                args.num_tokens_to_generate,
+                args.incoming_requests_duration,
+                args.incoming_requests_per_sec,
+            )
+        ),
+        args.inference_dynamic_batching_buffer_size_gb,
+        args.inference_dynamic_batching_buffer_overflow_factor,
+        context.max_requests,
+        context.max_tokens,
+        args.inference_dynamic_batching_buffer_guaranteed_fraction,
+        context.gtd_request_count,
+        len(requests),
+    )
+    print("~~~")
+    print(setup_prefix)
+    print("~~~")
+
+    # Run and time test.
+    t = get_curr_time()
+    step_times, add_times, output_times = run_inference(requests, sampling_params, engine)
+    total_time = get_curr_time() - t
+
+    # Validate all requests finished.
+    for request in requests:
+        assert request.state == "finished"
+
+    # Detokenize outputs.
+    for request in requests:
+        request.output_text = tokenizer.detokenize(request.output_tokens)
+
+    # Print unique prompts + outputs.
+    if torch.distributed.get_rank() == 0:
+
+        print("~~~~ Unique prompts + outputs. ~~~~")
+
+        # Map requests by their prompt.
+        unique_prompt_map = defaultdict(list)
+        for request_idx, request in enumerate(requests):
+            unique_prompt_map[request.prompt_text].append(request_idx)
+
+        # Print unique prompts + outputs.
+        for unique_idx, (prompt_text, request_idxs) in enumerate(unique_prompt_map.items()):
+            request_idx = request_idxs[0]
+            request = requests[request_idx]
+            print(f"{unique_idx}/{len(unique_prompt_map)} [{len(request_idxs)}]. {prompt_text} ... %s" % request.output_text.replace("\n", "\\n"))
+
+    # Timing results.
+    stats = torch.cuda.memory_stats()
+    print("~~~")
+    print("%s ... mem %.1f/%.1f ... total time: %.3f ... step time: total %.3f [ p %.3f, d %.3f ], mean [ p %.3f, d %.3f ], count [ p %d, d %d ] ... add time: %.3f, output time: %.3f." % (
+        setup_prefix,
+        stats["allocated_bytes.all.peak"] / (1024**3),
+        stats["reserved_bytes.all.peak"] / (1024**3),
+        sum(step_times["prefill"]) + sum(step_times["decode"]) + sum(add_times),
+        sum(step_times["prefill"]) + sum(step_times["decode"]),
+        sum(step_times["prefill"]),
+        sum(step_times["decode"]),
+        sum(step_times["prefill"]) / len(step_times["prefill"]),
+        sum(step_times["decode"]) / len(step_times["decode"]),
+        len(step_times["prefill"]),
+        len(step_times["decode"]),
+        sum(add_times),
+        sum(output_times),
+    ))
+    print("~~~")
--- a/Megatron-LM/examples/inference/gpt/gpt_dynamic_inference_12b.sh
+++ b/Megatron-LM/examples/inference/gpt/gpt_dynamic_inference_12b.sh
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+# Run dynamic batching inference on the 12B GPT model.
+#
+# Required environment variables: CHECKPOINT_DIR, TOKENIZER_MODEL.
+# Optional environment variables (defaults below): NUM_TOKENS_TO_PROMPT,
+# NUM_TOKENS_TO_GENERATE, INCOMING_REQUESTS_DURATION, INCOMING_REQUESTS_PER_SEC,
+# the INFERENCE_DYNAMIC_BATCHING_* buffer settings, ENGINE, PROMPTS and
+# NSIGHT_PREFIX.
+
+# Treat references to unset variables as errors.
+set -u
+
+pip install simpy
+pip install sentencepiece
+pip install tiktoken
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+# Abort with a message when a required variable is unset or empty.
+: ${CHECKPOINT_DIR:?"CHECKPOINT_DIR is not set"}
+: ${TOKENIZER_MODEL:?"TOKENIZER_MODEL is not set"}
+
+# Defaults for the simulated request stream (used when PROMPTS is not set).
+: ${NUM_TOKENS_TO_PROMPT="8 32"}
+: ${NUM_TOKENS_TO_GENERATE=256}
+: ${INCOMING_REQUESTS_DURATION=10.}
+: ${INCOMING_REQUESTS_PER_SEC=100.}
+
+# Defaults for the dynamic batching buffer.
+: ${INFERENCE_DYNAMIC_BATCHING_BUFFER_SIZE_GB=50.}
+: ${INFERENCE_DYNAMIC_BATCHING_BUFFER_OVERFLOW_FACTOR=1.}
+: ${INFERENCE_DYNAMIC_BATCHING_BUFFER_GUARANTEED_FRACTION=0.05}
+
+# Selects the module to run: examples.inference.gpt.gpt_${ENGINE}_inference.
+: ${ENGINE=dynamic}
+# NSIGHT_PREFIX=/path/to/nsight/profile
+
+# --inference-rng-tracker \ # ... re-add after bugfix.
+ARGS=" \
+    --no-persist-layer-norm \
+    --apply-layernorm-1p \
+    --no-position-embedding \
+    --group-query-attention \
+    --num-query-groups 8 \
+    --load ${CHECKPOINT_DIR} \
+    --use-checkpoint-args \
+    --untie-embeddings-and-output-weights \
+    --disable-bias-linear \
+    --use-rotary-position-embeddings \
+    --position-embedding-type rope \
+    --rotary-base 1000000 \
+    --rotary-percent 1.0 \
+    --swiglu \
+    --normalization RMSNorm \
+    --attention-dropout 0.0 \
+    --hidden-dropout 0.0 \
+    --exit-duration-in-mins 5740 \
+    --tensor-model-parallel-size 1 \
+    --pipeline-model-parallel-size 1 \
+    --num-layers 40 \
+    --hidden-size 5120 \
+    --ffn-hidden-size 14336 \
+    --num-attention-heads 32 \
+    --kv-channels 128 \
+    --seq-length 1024 \
+    --max-position-embeddings 1024 \
+    --micro-batch-size 64 \
+    --bf16 \
+    --tokenizer-type TikTokenizer \
+    --tiktoken-pattern v2 \
+    --tokenizer-model ${TOKENIZER_MODEL} \
+    --distributed-timeout-minutes 2400 \
+    --transformer-impl local \
+    --use-flash-attn \
+    \
+    --inference-dynamic-batching \
+    --inference-dynamic-batching-buffer-size-gb ${INFERENCE_DYNAMIC_BATCHING_BUFFER_SIZE_GB} \
+    --inference-dynamic-batching-buffer-overflow-factor ${INFERENCE_DYNAMIC_BATCHING_BUFFER_OVERFLOW_FACTOR} \
+    --inference-dynamic-batching-buffer-guaranteed-fraction ${INFERENCE_DYNAMIC_BATCHING_BUFFER_GUARANTEED_FRACTION} \
+    \
+    --enable-cuda-graph \
+"
+
+# Explicit prompts take precedence over the simulated request stream.
+if [[ -v PROMPTS ]]; then
+    ARGS+=" --prompts ${PROMPTS}"
+else
+    ARGS+=" \
+        --num-tokens-to-prompt ${NUM_TOKENS_TO_PROMPT} \
+        --num-tokens-to-generate ${NUM_TOKENS_TO_GENERATE} \
+        --incoming-requests-duration ${INCOMING_REQUESTS_DURATION} \
+        --incoming-requests-per-sec ${INCOMING_REQUESTS_PER_SEC} \
+    "
+fi
+
+CMD="python -m examples.inference.gpt.gpt_${ENGINE}_inference ${ARGS}"
+# Optionally wrap the run in an Nsight Systems profile.
+if [[ -v NSIGHT_PREFIX ]]; then
+    CMD="nsys profile -t cuda,nvtx,mpi -s none --wait=primary --show-output=true --force-overwrite=true --export=sqlite -o ${NSIGHT_PREFIX} ${CMD}"
+fi
+
+# Quote the expansion so the command string reaches `eval` intact. Unquoted, it
+# is word-split and glob-expanded first, which collapses whitespace inside
+# quoted prompts and can expand characters such as `*` or `?`.
+eval "${CMD}"
--- a/Megatron-LM/examples/inference/gpt/gpt_dynamic_inference_357m.sh
+++ b/Megatron-LM/examples/inference/gpt/gpt_dynamic_inference_357m.sh
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+# Run dynamic batching inference on the 357M GPT model.
+#
+# Required environment variables: CHECKPOINT_DIR, VOCAB_FILE, MERGE_FILE.
+# Optional environment variables (defaults below): NUM_TOKENS_TO_PROMPT,
+# NUM_TOKENS_TO_GENERATE, INCOMING_REQUESTS_DURATION, INCOMING_REQUESTS_PER_SEC,
+# the INFERENCE_DYNAMIC_BATCHING_* buffer settings, ENGINE, PROMPTS and
+# NSIGHT_PREFIX.
+
+# Treat references to unset variables as errors.
+set -u
+
+pip install simpy
+pip install sentencepiece
+pip install tiktoken
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+# Abort with a message when a required variable is unset or empty.
+: ${CHECKPOINT_DIR:?"CHECKPOINT_DIR is not set"}
+: ${VOCAB_FILE:?"VOCAB_FILE is not set"}
+: ${MERGE_FILE:?"MERGE_FILE is not set"}
+
+# Defaults for the simulated request stream (used when PROMPTS is not set).
+: ${NUM_TOKENS_TO_PROMPT="8 32"}
+: ${NUM_TOKENS_TO_GENERATE=256}
+: ${INCOMING_REQUESTS_DURATION=10.}
+: ${INCOMING_REQUESTS_PER_SEC=100.}
+
+# Defaults for the dynamic batching buffer.
+: ${INFERENCE_DYNAMIC_BATCHING_BUFFER_SIZE_GB=50.}
+: ${INFERENCE_DYNAMIC_BATCHING_BUFFER_OVERFLOW_FACTOR=1.}
+: ${INFERENCE_DYNAMIC_BATCHING_BUFFER_GUARANTEED_FRACTION=0.05}
+
+# Selects the module to run: examples.inference.gpt.gpt_${ENGINE}_inference.
+: ${ENGINE=dynamic}
+# NSIGHT_PREFIX=/path/to/nsight/profile
+
+# --inference-rng-tracker \ # ... re-add after bugfix.
+ARGS=" \
+    --exit-on-missing-checkpoint \
+    --transformer-impl local \
+    --load ${CHECKPOINT_DIR} \
+    --tokenizer-type GPT2BPETokenizer \
+    --vocab-file ${VOCAB_FILE} \
+    --merge-file ${MERGE_FILE} \
+    --max-position-embeddings 2048 \
+    --seq-length 2048 \
+    --tensor-model-parallel-size 1 \
+    --pipeline-model-parallel-size 1 \
+    --num-layers 24 \
+    --num-attention-heads 16 \
+    --hidden-size 1024 \
+    --bf16 \
+    --micro-batch-size 1 \
+    --attention-dropout 0.0 \
+    --hidden-dropout 0.0 \
+    --seed 42 \
+    --use-flash-attn \
+    \
+    --inference-dynamic-batching \
+    --inference-dynamic-batching-buffer-size-gb ${INFERENCE_DYNAMIC_BATCHING_BUFFER_SIZE_GB} \
+    --inference-dynamic-batching-buffer-overflow-factor ${INFERENCE_DYNAMIC_BATCHING_BUFFER_OVERFLOW_FACTOR} \
+    --inference-dynamic-batching-buffer-guaranteed-fraction ${INFERENCE_DYNAMIC_BATCHING_BUFFER_GUARANTEED_FRACTION} \
+    \
+    --enable-cuda-graph \
+"
+
+# Explicit prompts take precedence over the simulated request stream.
+if [[ -v PROMPTS ]]; then
+    ARGS+=" --prompts ${PROMPTS}"
+else
+    ARGS+=" \
+        --num-tokens-to-prompt ${NUM_TOKENS_TO_PROMPT} \
+        --num-tokens-to-generate ${NUM_TOKENS_TO_GENERATE} \
+        --incoming-requests-duration ${INCOMING_REQUESTS_DURATION} \
+        --incoming-requests-per-sec ${INCOMING_REQUESTS_PER_SEC} \
+    "
+fi
+
+CMD="python -m examples.inference.gpt.gpt_${ENGINE}_inference ${ARGS}"
+# Optionally wrap the run in an Nsight Systems profile.
+if [[ -v NSIGHT_PREFIX ]]; then
+    CMD="nsys profile -t cuda,nvtx,mpi -s none --wait=primary --show-output=true --force-overwrite=true --export=sqlite -o ${NSIGHT_PREFIX} ${CMD}"
+fi
+
+# Quote the expansion so the command string reaches `eval` intact. Unquoted, it
+# is word-split and glob-expanded first, which collapses whitespace inside
+# quoted prompts and can expand characters such as `*` or `?`.
+eval "${CMD}"
--- a/Megatron-LM/examples/inference/gpt/gpt_static_inference.py
+++ b/Megatron-LM/examples/inference/gpt/gpt_static_inference.py
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+import os
+from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
+    InferenceWrapperConfig,
+)
+from pretrain_gpt import model_provider
+import torch
+import sys
+import time
+import tqdm
+import warnings
+from argparse import Namespace
+from megatron.core.inference.contexts import StaticInferenceContext
+from megatron.core.inference.engines import StaticInferenceEngine
+from megatron.core.inference.sampling_params import SamplingParams
+from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
+    GPTInferenceWrapper,
+)
+from megatron.core.inference.inference_request import InferenceRequest
+from megatron.core.inference.text_generation_controllers.text_generation_controller import (
+    TextGenerationController,
+)
+from megatron.core.transformer.module import MegatronModule
+
+sys.path.append(
+    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
+)
+
+from megatron.training import get_args
+from megatron.training import get_tokenizer
+from megatron.training.checkpointing import load_checkpoint
+from megatron.core import mpu
+from megatron.training.initialize import initialize_megatron
+from megatron.training import get_model
+import asyncio
+from typing import AsyncIterator, List
+
+from examples.inference.gpt.utils import add_common_inference_args, build_requests
+
+
+def add_static_inference_args(parser):
+    """Static inference arguments."""
+
+    add_common_inference_args(parser)
+
+    group = parser.add_argument_group(title='Static inference')
+    group.add_argument(
+        "--max-batch-size", type=int, default=8, dest="inference_max_requests",
+        help='Max number of prompts to process at once'
+    )
+    group.add_argument("--stream", action="store_true", default=False, help="Stream output tokens")
+
+    return parser
+
+
+def get_inference_engine(args: Namespace, model: MegatronModule) -> StaticInferenceEngine:
+    """Utility to get the relevant backend for running inference
+
+    This function will automatically chose the TRTLLMBackend when possible, and if not revert to Mcore backend if the user does not specify any backends. TRT LLM Backend is not implmented yet.
+
+    Args:
+        args (Namespace): The user arguments parsed from command line
+        model (MegatronModule): The megatron model .
+
+    Returns:
+        AbstractBackend: The chosen backend
+    """
+    tokenizer = get_tokenizer()
+
+    inference_wrapper_config = InferenceWrapperConfig(
+        hidden_size=args.hidden_size,
+        inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold,
+        fp32_residual_connection=args.fp32_residual_connection,
+        params_dtype=args.params_dtype,
+        padded_vocab_size=args.padded_vocab_size,
+        inference_max_requests=args.inference_max_requests,
+        inference_max_seq_length=args.inference_max_seq_length,
+    )
+
+    inference_context = StaticInferenceContext.from_config(inference_wrapper_config)
+
+    inference_wrapped_model = GPTInferenceWrapper(
+        model,
+        inference_wrapper_config,
+        inference_context
+    )
+    text_generation_controller = TextGenerationController(inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer)
+    return StaticInferenceEngine(text_generation_controller=text_generation_controller)
+
+
+async def generate(
+    inference_engine: StaticInferenceEngine,
+    sampling_params: SamplingParams,
+    prompts: List[str],
+) -> List[InferenceRequest]:
+    """Generate completions for `prompts` while streaming the text to stdout.
+
+    Every prompt is submitted to the engine with `streaming=True`, and one
+    asyncio task per request prints the newly generated text as it arrives.
+
+    Args:
+        inference_engine (StaticInferenceEngine): Engine that receives the requests.
+        sampling_params (SamplingParams): Sampling settings shared by all requests.
+        prompts (List[str]): Prompt texts, one request per entry.
+
+    Returns:
+        List[InferenceRequest]: Completed requests read from the scheduler's
+        `completed_request_pool`, in the same order as `prompts`.
+    """
+    async def collect_stream(prompt, request_id, stream_generator):
+        # Print the prompt once, then only the not-yet-printed suffix of the
+        # generated text on every update from the stream.
+        print(f"Request {request_id}: {prompt}", end="", flush=True)
+        prev_idx = 0
+        async for output in stream_generator:
+            print(output.generated_text[prev_idx:], end="", flush=True)
+            prev_idx = len(output.generated_text)
+        print()
+
+    # Submit all prompts up front; `add_request` returns the request id.
+    request_ids: List[str] = [
+        inference_engine.add_request(
+            prompt=prompt, sampling_params=sampling_params, streaming=True
+        )
+        for prompt in prompts
+    ]
+    stream_generators = [inference_engine.get_stream_generator(request_id) for request_id in request_ids]
+
+    # The printing tasks are created before the engine is run.
+    # NOTE(review): presumably so the consumers are already scheduled while
+    # the engine produces tokens -- confirm against the engine implementation.
+    tasks = [
+        asyncio.create_task(collect_stream(prompt, request_id, stream_generator))
+        for (prompt, request_id, stream_generator) in zip(prompts, request_ids, stream_generators)
+    ]
+
+    # Run the engine, then wait for every printing task to finish.
+    await inference_engine.run_engine_async()
+    await asyncio.gather(*tasks)
+
+    results: List[InferenceRequest] = [
+        inference_engine.scheduler.completed_request_pool[request_id] for request_id in request_ids
+    ]
+
+    return results
+
+def main():
+    """Main program."""
+
+    # Note: The default args passed here can be overwritten by using appropriate params (check arguments.py file)
+    # Micro batch size is not needed to be set by user. (It is calculated based on inference-batch-times-seqlen-threshold argument)
+    initialize_megatron(
+        extra_args_provider=add_static_inference_args,
+        args_defaults={
+            'no_load_rng': True,
+            'no_load_optim': True,
+            'micro_batch_size': 1,
+            'exit_on_missing_checkpoint': True,
+        },
+    )
+
+    # Set up model and load checkpoint
+    model = get_model(model_provider, wrap_with_ddp=False)
+    load_checkpoint(model, None, None)
+    model = model[0]
+
+    args = get_args()
+
+    inference_engine = get_inference_engine(args, model)
+
+    sampling_params = SamplingParams(
+        temperature=args.temperature,
+        top_k=args.top_k,
+        top_p=args.top_p,
+        return_log_probs=args.return_log_probs,
+        num_tokens_to_generate=args.num_tokens_to_generate,
+        top_n_logprobs=args.top_n_logprobs,
+    )
+
+    requests = build_requests(args, get_tokenizer())
+    prompts = [ r.prompt_text for r in requests ]
+
+    if args.enable_cuda_graph:
+        print(f"Running warmup for CUDA graphs...")
+        inference_engine.generate(
+                prompts=prompts, sampling_params=sampling_params
+            )
+    start_time = time.perf_counter()
+    if args.stream:
+        results: List[InferenceRequest] = asyncio.run(generate(inference_engine, sampling_params, prompts))
+    else:
+        results: List[InferenceRequest] = inference_engine.generate(
+            prompts=prompts, sampling_params=sampling_params,
+        )
+    end_time = time.perf_counter()
+    latency = end_time - start_time
+
+    if torch.distributed.get_rank() == 0:
+        for idx, result in enumerate(results):
+            print(f' \n------------- RESULT FOR PROMPT {idx} --------------- ')
+            result_dict = {
+                'id': result.request_id,
+                'input_prompt': result.prompt,
+                'generated_text': result.generated_text,
+                'generated_tokens': result.generated_tokens,
+                'latency': latency,
+            }
+            if sampling_params.top_n_logprobs > 0 :
+                result_dict['generated_top_n_logprobs'] = result.generated_top_n_logprobs
+
+            print(result_dict)
+
+    # Print unique prompts + outputs.
+    if torch.distributed.get_rank() == 0:
+
+        print("~~~~ Unique prompts + outputs. ~~~~")
+
+        # Map results by their prompt.
+        from collections import defaultdict
+        unique_prompt_map = defaultdict(list)
+        for result_idx, result in enumerate(results):
+            unique_prompt_map[result.prompt].append(result_idx)
+
+        # Print unique prompts + outputs.
+        for unique_idx, (prompt_text, result_idxs) in enumerate(unique_prompt_map.items()):
+            result_idx = result_idxs[0]
+            result = results[result_idx]
+            print(f"{unique_idx}/{len(unique_prompt_map)} [{len(result_idxs)}]. {prompt_text} ... %s" % result.generated_text.replace("\n", "\\n"))
+
+
+    stats = torch.cuda.memory_stats()
+    print("static | cg %d | %s | reqs %d [ batch %d ] ... mem %.1f/%.1f ... time %.3f." % (
+        args.enable_cuda_graph,
+        (
+            f"<user prompts>"
+            if args.prompts else
+            "<auto prompts> %s, %d, %.1e, %.1e" % (
+                "(%s)" % " ".join(map(str, args.num_tokens_to_prompt)),
+                args.num_tokens_to_generate,
+                args.incoming_requests_duration,
+                args.incoming_requests_per_sec,
+            )
+        ),
+        len(requests),
+        args.inference_max_requests,
+        stats["allocated_bytes.all.peak"] / (1024**3),
+        stats["reserved_bytes.all.peak"] / (1024**3),
+        latency,
+    ))
+
+    torch.distributed.destroy_process_group()
+
+if __name__ == "__main__":
+    main()
--- a/Megatron-LM/examples/inference/gpt/utils.py
+++ b/Megatron-LM/examples/inference/gpt/utils.py
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+import random
+import simpy
+import time
+import torch
+from argparse import ArgumentParser, Namespace
+from typing import Any, List
+
+
+def add_common_inference_args(parser: ArgumentParser) -> ArgumentParser:
+    """Common inference arguments."""
+
+    group = parser.add_argument_group(title='Common inference')
+
+    group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.')
+    group.add_argument("--top_k", type=int, default=1, help='Top k sampling.')
+    group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.')
+    group.add_argument(
+        "--return-log-probs",
+        action='store_true',
+        default=False,
+        help='Return the log probabilities of the final output tokens',
+    )
+    group.add_argument(
+        "--prompts",
+        metavar='N',
+        type=str,
+        nargs='+',
+        help='Input prompts with each prompt within quotes and seperated by space',
+    )
+    group.add_argument(
+        "--num-tokens-to-prompt",
+        type=int,
+        nargs="+",
+        default=[64, 1024],
+        help='Number of tokens to use for simulated prompts. This should be a '
+        'space-separated pair of integers, and the generated prompt lengths will '
+        'be uniformly sampled within this range.',
+    )
+    group.add_argument(
+        "--num-tokens-to-generate",
+        type=int,
+        default=30,
+        help='Number of tokens to generate for each prompt',
+    )
+    group.add_argument(
+        "--top-n-logprobs",
+        type=int,
+        default=0,
+        help='Return the top n logprobs for the generated tokens and their corresponding token as a dictionary',
+    )
+    group.add_argument("--incoming-requests-per-sec", type=float, default=100.,
+                       help="Simulated number of requests per second.")
+    group.add_argument("--incoming-requests-duration", type=float, default=10.,
+                       help="Total amount of time to simulate that requests are "
+                       "arriving. Multiply this value with "
+                       "`--incoming-requests-per-sec` to get the approximate "
+                       "total number of requests.")
+
+    return parser
+
+def get_curr_time() -> float:
+    """Get synchronized time across ranks."""
+    curr_time = torch.cuda.LongTensor([time.time_ns()])
+    if torch.distributed.is_initialized():
+        torch.distributed.broadcast(
+            curr_time,
+            src=0)
+    return curr_time.item() / 10**9
+
+
+class Request:
+    """Class to hold attributes for a single request.
+
+    A request is initialized with its prompt text. As it is added, processed,
+    and completed through the inference engine, the request is populated with its
+    start time, end time, and output tokens.
+
+    Args:
+        prompt_text (str): Prompt text.
+        time_offset (float): Artificial time offset for simulating incoming
+            requests. This value is later added to the `base_arrival_time` to
+            simulate the requests arrival time.
+        tokenizer (Any): Tokenizer for tokenizing the prompt.
+    """
+
+    def __init__(
+        self,
+        prompt_text: str,
+        time_offset: float,
+        tokenizer: Any,
+    ):
+        self.prompt_text = prompt_text
+        self.prompt_tokens = tokenizer.tokenize(prompt_text)
+        self.output_text = None
+        self.output_tokens = []
+        self.time_offset = time_offset
+        self.time_arrival = None
+        self.time_start = None
+        self.time_end = None
+        self.state = "not-started"
+
+    def __str__(self) -> str:
+        return "state '%s'; prompt len %d; output len %d; '%s'" % (
+            self.state,
+            len(self.prompt_tokens),
+            len(self.output_tokens),
+            self.prompt_text,
+        )
+
+
+def get_user_requests(args: Namespace, tokenizer: Any) -> List[Request]:
+    requests = [ Request(p, -1., tokenizer) for p in args.prompts ]
+    return requests
+
+
+def get_auto_requests(args: Namespace, tokenizer: Any) -> List[Request]:
+    """Get example requests."""
+
+    random.seed(args.seed)
+
+    # Generate random time offsets.
+    def arrival(r):
+        while True:
+            yield env.timeout(random.expovariate(r))
+            time_offsets.append(env.now)
+
+    time_offsets = []
+    env = simpy.Environment()
+    env.process(arrival(args.incoming_requests_per_sec))
+    env.run(args.incoming_requests_duration)
+
+    # Ensure at least a single request.
+    if len(time_offsets) == 0:
+        time_offsets = [ 0. ]
+
+    # Initialize requests.
+    requests = [ Request(
+        "hi " * random.randint(*args.num_tokens_to_prompt),
+        t,
+        tokenizer,
+    ) for t in time_offsets ]
+
+    # Round down to multiple of --inference-max-requests, until cuda graphs are
+    # fixed with static inference batching.
+    # todo: @lmcafee, remove following lines after fix.
+    factor = getattr(args, "inference_max_requests", 8)
+    rounded_len = factor * (len(requests) // factor)
+    requests = requests[:rounded_len]
+
+    return requests
+
+
+def build_requests(args: Namespace, tokenizer: Any) -> List[Request]:
+    if args.prompts:
+        return get_user_requests(args, tokenizer)
+    else:
+        return get_auto_requests(args, tokenizer)
--- a/Megatron-LM/examples/inference/llama_mistral/huggingface_reference.py
+++ b/Megatron-LM/examples/inference/llama_mistral/huggingface_reference.py
+import argparse
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+
+# Set up argument parsing
+parser = argparse.ArgumentParser(description="Script for text generation with a specific model and prompt.")
+parser.add_argument('--prompt', type=str, required=True, help="Prompt text to use for text generation")
+parser.add_argument('--model-path', type=str, required=True, help="Path to the Huggingface model checkpoint")
+
+# Parse command-line arguments
+args = parser.parse_args()
+
+model_path = args.model_path
+prompt = args.prompt
+
+config = AutoConfig.from_pretrained(model_path)
+tokenizer = AutoTokenizer.from_pretrained(model_path, config=config)
+model = AutoModelForCausalLM.from_pretrained(model_path, config=config).cuda()
+
+inputs = tokenizer(prompt, return_tensors="pt")
+for key in inputs:
+    inputs[key] = inputs[key].cuda()
+# top_k, top_p and do_sample are set for greedy argmax based sampling
+
+outputs = model.generate(**inputs, max_length=100, do_sample=False, top_p=0, top_k=0, temperature=1.0)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
\ No newline at end of file
--- a/Megatron-LM/examples/inference/llama_mistral/run_text_generation_llama3.1.sh
+++ b/Megatron-LM/examples/inference/llama_mistral/run_text_generation_llama3.1.sh
+#!/bin/bash
+# This example will start serving the Llama3.1-8B model
+#
+# Usage: <this script> /path/to/checkpoint /path/to/tokenizer_model
+export NCCL_IB_SL=1
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export NVTE_APPLY_QK_LAYER_SCALING=0
+
+# Single-node, single-process launch settings for torchrun.
+DISTRIBUTED_ARGS="--nproc_per_node 1 \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr 0.0.0.0 \
+                  --master_port 6000"
+
+# Ensure CHECKPOINT and TOKENIZER_MODEL are provided
+if [ -z "$1" ] || [ -z "$2" ]; then
+  echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments." >&2
+  echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model" >&2
+  exit 1
+fi
+
+# Assign command-line arguments to variables
+CHECKPOINT="$1"
+TOKENIZER_MODEL="$2"
+
+pip install flask-restful
+
+# DISTRIBUTED_ARGS is intentionally unquoted so it splits into separate
+# options; the two paths are quoted so that paths containing spaces survive.
+torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
+      --use-checkpoint-args \
+      --disable-bias-linear \
+      --tokenizer-type HuggingFaceTokenizer \
+      --tokenizer-model "${TOKENIZER_MODEL}" \
+      --transformer-impl transformer_engine \
+      --normalization RMSNorm \
+      --group-query-attention \
+      --num-query-groups 8 \
+      --no-masked-softmax-fusion \
+      --attention-softmax-in-fp32 \
+      --attention-dropout 0.0 \
+      --hidden-dropout 0.0 \
+      --untie-embeddings-and-output-weights \
+      --position-embedding-type rope \
+      --rotary-percent 1.0 \
+      --rotary-base 500000 \
+      --use-rope-scaling \
+      --use-rotary-position-embeddings \
+      --swiglu \
+      --tensor-model-parallel-size 1 \
+      --pipeline-model-parallel-size 1 \
+      --num-layers 32 \
+      --hidden-size 4096 \
+      --ffn-hidden-size 14336 \
+      --load "${CHECKPOINT}" \
+      --num-attention-heads 32 \
+      --max-position-embeddings 131072 \
+      --bf16 \
+      --micro-batch-size 1 \
+      --seq-length 8192
--- a/Megatron-LM/examples/inference/llama_mistral/run_text_generation_llama3.sh
+++ b/Megatron-LM/examples/inference/llama_mistral/run_text_generation_llama3.sh
+#!/bin/bash
+# This example will start serving the Llama3-8B model
+#
+# Usage: <this script> /path/to/checkpoint /path/to/tokenizer_model
+export NCCL_IB_SL=1
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export NVTE_APPLY_QK_LAYER_SCALING=0
+
+# Single-node, single-process launch settings for torchrun.
+DISTRIBUTED_ARGS="--nproc_per_node 1 \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr 0.0.0.0 \
+                  --master_port 6000"
+
+# Ensure CHECKPOINT and TOKENIZER_MODEL are provided
+if [ -z "$1" ] || [ -z "$2" ]; then
+  echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments." >&2
+  echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model" >&2
+  exit 1
+fi
+
+# Assign command-line arguments to variables
+CHECKPOINT="$1"
+TOKENIZER_MODEL="$2"
+
+pip install flask-restful
+
+# DISTRIBUTED_ARGS is intentionally unquoted so it splits into separate
+# options; the two paths are quoted so that paths containing spaces survive.
+torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
+      --use-checkpoint-args \
+      --disable-bias-linear \
+      --tokenizer-type HuggingFaceTokenizer \
+      --tokenizer-model "${TOKENIZER_MODEL}" \
+      --transformer-impl transformer_engine \
+      --normalization RMSNorm \
+      --group-query-attention \
+      --num-query-groups 8 \
+      --no-masked-softmax-fusion \
+      --attention-softmax-in-fp32 \
+      --attention-dropout 0.0 \
+      --hidden-dropout 0.0 \
+      --untie-embeddings-and-output-weights \
+      --position-embedding-type rope \
+      --rotary-percent 1.0 \
+      --rotary-base 500000 \
+      --use-rotary-position-embeddings \
+      --swiglu \
+      --tensor-model-parallel-size 1 \
+      --pipeline-model-parallel-size 1 \
+      --num-layers 32 \
+      --hidden-size 4096 \
+      --ffn-hidden-size 14336 \
+      --load "${CHECKPOINT}" \
+      --num-attention-heads 32 \
+      --max-position-embeddings 8192 \
+      --bf16 \
+      --micro-batch-size 1 \
+      --seq-length 8192
--- a/Megatron-LM/examples/inference/llama_mistral/run_text_generation_mistral.sh
+++ b/Megatron-LM/examples/inference/llama_mistral/run_text_generation_mistral.sh
+#!/bin/bash
+# This example will start serving the Mistral-7B-v0.3 model
+#
+# Usage: <this script> /path/to/checkpoint /path/to/tokenizer_model
+export NCCL_IB_SL=1
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+# Single-node, single-process launch settings for torchrun.
+DISTRIBUTED_ARGS="--nproc_per_node 1 \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr 0.0.0.0 \
+                  --master_port 6000"
+
+# Ensure CHECKPOINT and TOKENIZER_MODEL are provided
+if [ -z "$1" ] || [ -z "$2" ]; then
+  echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments." >&2
+  echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model" >&2
+  exit 1
+fi
+
+# Assign command-line arguments to variables
+CHECKPOINT="$1"
+TOKENIZER_MODEL="$2"
+
+pip install flask-restful
+
+# DISTRIBUTED_ARGS is intentionally unquoted so it splits into separate
+# options; the two paths are quoted so that paths containing spaces survive.
+torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
+       --tokenizer-type HuggingFaceTokenizer \
+       --tokenizer-model "${TOKENIZER_MODEL}" \
+       --use-checkpoint-args \
+       --apply-layernorm-1p \
+       --transformer-impl transformer_engine \
+       --normalization RMSNorm \
+       --group-query-attention \
+       --num-query-groups 8 \
+       --no-masked-softmax-fusion \
+       --use-flash-attn \
+       --untie-embeddings-and-output-weights \
+       --disable-bias-linear \
+       --position-embedding-type rope \
+       --rotary-percent 1.0 \
+       --rotary-base 1000000 \
+       --swiglu \
+       --ffn-hidden-size 14336 \
+       --tensor-model-parallel-size 1 \
+       --pipeline-model-parallel-size 1 \
+       --num-layers 32 \
+       --hidden-size 4096 \
+       --load "${CHECKPOINT}" \
+       --num-attention-heads 32 \
+       --max-position-embeddings 4096 \
+       --bf16 \
+       --micro-batch-size 1 \
+       --seq-length 4096 \
+       --seed 101
--- a/Megatron-LM/examples/inference/run_text_generation_server_345M.sh
+++ b/Megatron-LM/examples/inference/run_text_generation_server_345M.sh
+#!/bin/bash
+# This example will start serving the 345M model.
+# Single-node, single-process launch settings for torchrun.
+DISTRIBUTED_ARGS="--nproc_per_node 1 \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+
+# Template values: replace each <...> placeholder below with a real path
+# before running this script.
+CHECKPOINT=<Path to checkpoint (e.g /345m)>
+VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
+MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+# Installed on every run, before the server is launched.
+pip install flask-restful
+
+# Launch tools/run_text_generation_server.py with the 345M model settings
+# (24 layers, hidden size 1024, 16 attention heads, fp16).
+torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py   \
+       --tensor-model-parallel-size 1  \
+       --pipeline-model-parallel-size 1  \
+       --num-layers 24  \
+       --hidden-size 1024  \
+       --load ${CHECKPOINT}  \
+       --num-attention-heads 16  \
+       --max-position-embeddings 1024  \
+       --tokenizer-type GPT2BPETokenizer  \
+       --fp16  \
+       --micro-batch-size 1  \
+       --seq-length 1024  \
+       --vocab-file $VOCAB_FILE  \
+       --merge-file $MERGE_FILE  \
+       --seed 42
--- a/Megatron-LM/examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh
+++ b/Megatron-LM/examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh
+#!/bin/bash
+# This example will start serving the 345M model that is partitioned 8 way tensor parallel
+
+# Launcher options for torchrun: eight processes on one node.
+DISTRIBUTED_ARGS="--nproc_per_node 8 \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+
+# The three <...> values below are placeholders: replace each one with a real
+# path before running. Bash cannot parse the script while they are in place.
+CHECKPOINT=<Path to checkpoint (e.g /345m)>
+VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
+MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
+
+# Exported for parity with the single-GPU 345M launch script, which sets the
+# same variable before starting the server.
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+pip install flask-restful
+
+# Launched with torchrun (as the other server scripts do) instead of the
+# deprecated `python -m torch.distributed.launch` entry point.
+# $DISTRIBUTED_ARGS is intentionally left unquoted: it must be split into
+# separate words. The path variables are quoted so that paths containing
+# spaces or glob characters are passed through as a single argument.
+torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py   \
+       --tensor-model-parallel-size 8  \
+       --pipeline-model-parallel-size 1  \
+       --num-layers 24  \
+       --hidden-size 1024  \
+       --load "${CHECKPOINT}"  \
+       --num-attention-heads 16  \
+       --max-position-embeddings 1024  \
+       --tokenizer-type GPT2BPETokenizer  \
+       --fp16  \
+       --micro-batch-size 1  \
+       --seq-length 1024  \
+       --vocab-file "${VOCAB_FILE}"  \
+       --merge-file "${MERGE_FILE}"  \
+       --seed 42
--- a/Megatron-LM/examples/inference/t5/simple_t5_batch_inference.py
+++ b/Megatron-LM/examples/inference/t5/simple_t5_batch_inference.py
+import os
+import sys
+from argparse import Namespace
+
+import torch
+
+import pretrain_t5
+from megatron.core.inference.sampling_params import SamplingParams
+from megatron.core.inference.engines import AbstractEngine, StaticInferenceEngine
+from megatron.core.inference.inference_request import InferenceRequest
+from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
+    InferenceWrapperConfig,
+)
+from megatron.core.inference.model_inference_wrappers.t5.t5_inference_wrapper import (
+    T5InferenceWrapper,
+)
+from megatron.core.inference.text_generation_controllers.encoder_decoder_text_generation_controller import (
+    EncoderDecoderTextGenerationController,
+)
+from megatron.core.transformer.module import MegatronModule
+from pretrain_t5 import model_provider
+
+sys.path.append(
+    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
+)
+
+from typing import List
+
+from megatron.core import mpu
+from megatron.training import get_args, get_model, get_tokenizer
+from megatron.training.checkpointing import load_checkpoint
+from megatron.training.initialize import initialize_megatron
+
+
+def add_text_generate_args(parser):
+    """Text generation arguments."""
+    group = parser.add_argument_group(title='text generation')
+
+    group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.')
+    group.add_argument("--top_k", type=int, default=1, help='Top k sampling.')
+    group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.')
+    group.add_argument(
+        "--return-log-probs",
+        action='store_true',
+        default=False,
+        help='Return the log probabilities of the final output tokens',
+    )
+    group.add_argument(
+        "--num-tokens-to-generate",
+        type=int,
+        default=30,
+        help='Number of tokens to generate for each prompt',
+    )
+    group.add_argument(
+        "--encoder-prompts",
+        metavar='N',
+        type=str,
+        nargs='+',
+        help='Encoder input prompts with each prompt within quotes and seperated by space',
+    )
+    group.add_argument(
+        "--max-batch-size", type=int, default=1, help='Max number of prompts to process at once'
+    )
+    return parser
+
+
+def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngine:
+    """Utility to get the relevant backend for running inference
+
+    This function will automatically chose the TRTLLMBackend when possible, and if not revert to Mcore backend if the user does not specify any backends. TRT LLM Backend is not implmented yet.
+
+    Args:
+        args (Namespace): The user arguments parsed from command line
+        model (MegatronModule): The megatron model .
+
+    Returns:
+        AbstractBackend: The chosen backend
+    """
+    tokenizer = get_tokenizer()
+
+    inference_wrapper_config = InferenceWrapperConfig(
+        hidden_size=args.hidden_size,
+        inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold,
+        fp32_residual_connection=args.fp32_residual_connection,
+        params_dtype=args.params_dtype,
+        padded_vocab_size=args.padded_vocab_size,
+    )
+
+    inference_wrapped_model = T5InferenceWrapper(model, inference_wrapper_config)
+    text_generation_controller = EncoderDecoderTextGenerationController(
+        inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer
+    )
+    return StaticInferenceEngine(
+        text_generation_controller=text_generation_controller, max_batch_size=args.max_batch_size
+    )
+
+
+def main():
+    """Main program."""
+
+    # Note: The default args passed here can be overwritten by using appropriate params (check arguments.py file)
+    # Micro batch size is not needed to be set by user. (It is calculated based on inference-batch-times-seqlen-threshold argument)
+    initialize_megatron(
+        extra_args_provider=add_text_generate_args,
+        args_defaults={
+            'no_load_rng': True,
+            'no_load_optim': True,
+            'micro_batch_size': 1,
+            'exit_on_missing_checkpoint': True,
+        },
+    )
+
+    # Set up model and load checkpoint
+    model = get_model(model_provider, wrap_with_ddp=False)
+    load_checkpoint(model, None, None)
+    model = model[0]
+
+    args = get_args()
+
+    inference_engine = get_inference_engine(args, model)
+
+    sampling_params = SamplingParams(
+        temperature=args.temperature,
+        top_k=args.top_k,
+        top_p=args.top_p,
+        return_log_probs=args.return_log_probs,
+        num_tokens_to_generate=args.num_tokens_to_generate,
+    )
+
+    tokenizer = get_tokenizer()
+    decoder_prompts = [""] * len(
+        args.encoder_prompts
+    )  # for T5, the prompt is provided as encoder input, hence decoder_prompts is empty
+    args.prompts = decoder_prompts
+
+    results: List[InferenceRequest] = inference_engine.generate(
+        prompts=args.prompts,
+        add_BOS=True,
+        encoder_prompts=args.encoder_prompts,
+        sampling_params=sampling_params,
+    )
+
+    if torch.distributed.get_rank() == 0:
+        for idx, result in enumerate(results):
+            print(f' \n------------- RESULT FOR PROMPT {idx} --------------- ')
+            result = {
+                'id': result.request_id,
+                'input_prompt': result.prompt,
+                'generated_text': result.generated_text,
+                'generated_tokens': result.generated_tokens,
+            }
+            print(result)
+
+
+if __name__ == "__main__":
+    main()
--- a/Megatron-LM/examples/mamba/.gitignore
+++ b/Megatron-LM/examples/mamba/.gitignore
+checkpoints/
+data-cache/
+tensorboard/
+triton-cache/
--- a/Megatron-LM/examples/mamba/Dockerfile
+++ b/Megatron-LM/examples/mamba/Dockerfile
+FROM nvcr.io/nvidia/pytorch:24.01-py3
+
+# Replace the base image's triton with the pinned 2.1.0 release and add
+# sentencepiece (pinned) and flask-restful.
+RUN pip uninstall -y triton && \
+    pip install triton==2.1.0 sentencepiece==0.1.99 flask-restful
+
+# The causal-conv1d and mamba-ssm packages below are built from scratch here
+# (which takes significant time) because there are no wheels available on PyPI
+# for these relatively newer versions of the packages that are compatible with
+# the older NGC-variant PyTorch version (e.g. version 2.2.0.dev231106) that we
+# are using (in the NGC base container). Generally, if the package is not
+# compatible with the PyTorch version, then it will generate a Python import
+# error. The package authors tend to only release wheels for new versions of
+# these packages which are compatible with the versions of regular PyTorch and
+# NGC-variant PyTorch that are newer at the time of release. So, to use newer
+# versions of these packages with relatively older versions of the NGC PyTorch
+# container, we tend to have to build the packages from scratch.
+
+# Build and install causal-conv1d from the v1.2.2.post1 tag, then remove the
+# source checkout from /tmp.
+RUN cd /tmp && \
+    git clone https://github.com/Dao-AILab/causal-conv1d.git && \
+    cd causal-conv1d && \
+    git checkout v1.2.2.post1 && \
+    CAUSAL_CONV1D_FORCE_BUILD=TRUE pip install . && \
+    cd .. && \
+    rm -rf causal-conv1d
+
+# Build and install mamba from the v2.0.3 tag, then remove the source checkout
+# from /tmp.
+RUN cd /tmp && \
+    git clone https://github.com/state-spaces/mamba.git && \
+    cd mamba && \
+    git checkout v2.0.3 && \
+    MAMBA_FORCE_BUILD=TRUE pip install . && \
+    cd .. && \
+    rm -rf mamba
--- a/Megatron-LM/examples/mamba/README.md
+++ b/Megatron-LM/examples/mamba/README.md
+# Mamba-based Language Models
+
+## Introduction
+
+This document is an entrypoint into the code used for
+<em>[An Empirical Study of Mamba-based Language Models](https://arxiv.org/abs/2406.07887)</em>.
+
+We are releasing the parameters for some of the models described in that
+technical report via
+[HuggingFace](https://huggingface.co/collections/nvidia/ssms-666a362c5c3bb7e4a6bcfb9c).
+The code in the `main` branch is no longer compatible with the `Mamba2-*`
+checkpoints. You can load them using the
+[fixed snapshot of the code used for the technical report](https://github.com/NVIDIA/Megatron-LM/tree/ssm/examples/mamba).
+
+## Installation
+
+Create and run a Docker container using the [Dockerfile](./Dockerfile).
+
+```
+docker build -t your_image_name:your_tag .
+docker run --gpus all -it --rm \
+  -v /path/to/megatron:/workspace/megatron \
+  -v /path/to/dataset:/workspace/dataset \
+  -v /path/to/checkpoints:/workspace/checkpoints \
+  -w /workspace/megatron/examples/mamba \
+  your_image_name:your_tag
+```
+
+## Train
+
+[`train.sh`](./train.sh) is an example pretraining script, showing how to run on
+a single node. Select between 800M-scale and 8B-scale models by setting the
+`MODEL_SCALE` variable. The 8B-scale hybrid model architecture is the same as
+the one described in the technical report.
+
+## Text Generation
+
+Use [`run_text_gen_server_8b.sh`](./run_text_gen_server_8b.sh) to start a text
+generation server using an 8B hybrid checkpoint. This is configured to run the
+8B hybrid model described in the technical report, with tensor model parallel
+set to 1.
+
+The arguments in the script will need to be changed if using a checkpoint with a
+different model parallel configuration or other differences, such as model
+architecture. For example, to run the 8B pure Mamba-2 model, change
+`--hybrid-attention-ratio` and `--hybrid-mlp-ratio` to 0.0, or remove them.
+
+Use [`run_text_gen_server_8b_gpt3.sh`](./run_text_gen_server_8b_gpt3.sh) to start
+a text generation server using the 8B reference Transformer checkpoint.
+
+## Checkpoint Formats
+
+For inference, the model must be configured to match the checkpoint file used,
+including the hybrid layer configuration and model parallel configuration.
+
+If you need to convert a hybrid checkpoint file to a different tensor parallel
+or pipeline parallel size, use
+[the hybrid conversion script](../../tools/checkpoint/hybrid_conversion.py).
+There is an example run command at the end of that file.
+
+Before running that script, you will need to set `PYTHONPATH` to include the
+root directory of your Megatron-LM repository clone.
+
+```
+export PYTHONPATH=<path-to-megatron>:$PYTHONPATH
+```
+
+## Hybrid Options
+
+`--hybrid-attention-ratio ATT` specifies a target ratio of attention layers
+to total layers. For example, 4 attention layers out of 48 total layers is
+specified by `--hybrid-attention-ratio 0.08`.
+
+`--hybrid-mlp-ratio MLP` specifies a target ratio of MLP layers to total
+layers. For example, 24 MLP layers out of 48 total layers is specified by
+`--hybrid-mlp-ratio 0.5`.
+
+* (`ATT` + `MLP`) must be less than or equal to 1.0.
+* (1.0 - `ATT` - `MLP`) is the hybrid mamba ratio, the ratio of mamba layers to
+total layers.
+* `ATT` = `MLP` = 0 is a pure Mamba model.
+* `ATT` = `MLP` = 0.5 is a transformer model.
+
+If either `ATT` or `MLP` is greater than 0.0 or if `--hybrid-override-pattern`
+is specified, the logfile will include information about the hybrid layer
+pattern used. `--hybrid-override-pattern` can be used to specify a different
+pattern than the default, algorithmically-generated one.
+
+## Mamba vs Mamba-2
+
+This codebase currently only supports Mamba-2, and not the original version of
+Mamba. However, the
+[fixed snapshot of the code used for the technical report](https://github.com/NVIDIA/Megatron-LM/tree/ssm/examples/mamba)
+can be configured to run the original version of Mamba.
--- a/Megatron-LM/examples/mamba/run_text_gen_server_8b.sh
+++ b/Megatron-LM/examples/mamba/run_text_gen_server_8b.sh
+#!/bin/bash
+
+# Starts a text generation server for the 8B hybrid Mamba checkpoint.
+#
+# Use: ./run_text_gen_server_8b.sh <checkpoint-path> <tokenizer-path>
+# To launch the client: python ../../tools/text_generation_cli.py <URL-provided-by-server>
+
+# Both positional arguments are required. Without this check an empty value
+# would silently drop out of the torchrun command line below and the next
+# flag would be consumed as the path.
+if [ -z "$1" ] || [ -z "$2" ]; then
+  echo "Error: You must provide <checkpoint-path> and <tokenizer-path> as command-line arguments." >&2
+  echo "Usage: $0 <checkpoint-path> <tokenizer-path>" >&2
+  exit 1
+fi
+
+CHECKPOINT_PATH="$1"
+TOKENIZER_PATH="$2"
+
+# Launcher options for torchrun: one process on one node.
+DISTRIBUTED_ARGS="--nproc_per_node 1 \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+
+export NCCL_IB_SL=1
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export NCCL_IB_TIMEOUT=19
+export NCCL_IB_QPS_PER_CONNECTION=4
+
+export TRITON_CACHE_DIR="./triton-cache/"
+export TRITON_CACHE_MANAGER="megatron.core.ssm.triton_cache_manager:ParallelFileCacheManager"
+
+# $DISTRIBUTED_ARGS is intentionally left unquoted: it must be split into
+# separate words. The path variables are quoted so that paths containing
+# spaces or glob characters are passed through as a single argument.
+torchrun $DISTRIBUTED_ARGS ../../tools/run_mamba_text_generation_server.py \
+       --tensor-model-parallel-size 1  \
+       --pipeline-model-parallel-size 1  \
+       --untie-embeddings-and-output-weights \
+       --num-layers 56  \
+       --hidden-size 4096  \
+       --load "${CHECKPOINT_PATH}"  \
+       --num-attention-heads 32  \
+       --group-query-attention \
+       --num-query-groups 8 \
+       --hybrid-attention-ratio 0.08 \
+       --hybrid-mlp-ratio 0.5 \
+       --attention-dropout 0.0 \
+       --hidden-dropout 0.0 \
+       --disable-bias-linear \
+       --normalization RMSNorm \
+       --seq-length 4096  \
+       --max-position-embeddings 4096  \
+       --position-embedding-type none \
+       --tokenizer-type GPTSentencePieceTokenizer  \
+       --tokenizer-model "${TOKENIZER_PATH}" \
+       --distributed-backend nccl \
+       --distributed-timeout-minutes 1440 \
+       --bf16  \
+       --micro-batch-size 1  \
+       --use-mcore-models \
+       --spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \
+       --seed 42