首次上传

d444a97a · yangzhong · d444a97a · d444a97a · d444a97a · d444a97a
Commit d444a97a authored Oct 30, 2025 by yangzhong
20 changed files
--- a/examples/export/ptq_and_trtllm_export/ptq_trtllm_mixtral_8x7b.sh
+++ b/examples/export/ptq_and_trtllm_export/ptq_trtllm_mixtral_8x7b.sh
+#!/bin/bash
+set -e
+
+DEFAULT_NAME="/checkpoints/Mistral-NeMo-12B-Base"
+NAME="${1:-$DEFAULT_NAME}"
+
+DEFAULT_QUANT_CFG="fp8"
+QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"
+
+# NOTE: UNFUSED ATTENTION MUST BE USED TO AVOID ADDITIONAL STATE_DICT KEY MISMATCH.
+export NVTE_FLASH_ATTN=0
+export NVTE_FUSED_ATTN=0
+export NVTE_UNFUSED_ATTN=1
+
+# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
+TP="8"
+INFERENCE_TP=${TP}
+DECODER_TYPE="llama"
+CHECKPOINT_LOAD_DIR="${NAME}"
+
+if [ "$QUANT_CFG" = "int4_awq" ]; then
+    INFERENCE_TP="1"
+fi
+
+additional_options=" \
+    --export-quant-cfg ${QUANT_CFG} \
+    --export-legacy-megatron \
+    --export-te-mcore-model \
+    --calib-batch-size 8 \
+    --decoder ${DECODER_TYPE} \
+    --export-dir /tmp/trtllm_ckpt \
+    --inference-tensor-parallel ${INFERENCE_TP} "
+
+# DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!!
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+options=" \
+    --untie-embeddings-and-output-weights \
+    --no-masked-softmax-fusion \
+    --no-position-embedding \
+    --use-mcore-models \
+    --disable-bias-linear \
+    --rotary-percent 1.0 \
+    --attention-dropout 0.0 \
+    --hidden-dropout 0.0 \
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size 1 \
+    --num-layers 32 \
+    --hidden-size 4096 \
+    --ffn-hidden-size 14336 \
+    --num-attention-heads 32 \
+    --seq-length 4096 \
+    --kv-channels 128 \
+    --normalization RMSNorm \
+    --swiglu \
+    --num-query-groups 8 \
+    --num-experts 8 \
+    --moe-router-topk 2 \
+    --moe-aux-loss-coeff 1e-2 \
+    --moe-router-load-balancing-type aux_loss \
+    --group-query-attention \
+    --position-embedding-type rope \
+    --no-rope-fusion \
+    --max-position-embeddings 32768 \
+    --micro-batch-size 1 \
+    --tokenizer-type HuggingFaceTokenizer \
+    --tiktoken-pattern v2 \
+    --tokenizer-model mistralai/Mixtral-8x7B-Instruct-v0.1 \
+    --save-interval 1000000 \
+    --load ${CHECKPOINT_LOAD_DIR} \
+    --bf16 \
+    --rotary-base 1000000 \
+    --use-dist-ckpt"
+
+# Precompile CUDA extentions
+python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)"
+
+# Acquire launch configuration where variable launch_config will be set
+launch_config="--nproc_per_node=${TP}"
+
+# Launch multi-process with torchrun
+torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options}
+
+
--- a/examples/export/ptq_and_trtllm_export/text_generation_ptq.py
+++ b/examples/export/ptq_and_trtllm_export/text_generation_ptq.py
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+"""Sample Generate GPT."""
+import functools
+import os
+import sys
+from pathlib import Path
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")))
+
+import modelopt.torch.quantization as mtq
+import torch
+from datasets import load_dataset
+from tqdm import tqdm
+
+# [ModelOpt]: changing the default model provider to the ModelOpt version
+from megatron.core import mpu
+from megatron.inference.arguments import add_modelopt_args
+from megatron.inference.checkpointing import load_modelopt_checkpoint
+from megatron.inference.gpt.model_provider import model_provider
+from megatron.inference.text_generation import generate_and_post_process
+from megatron.training import get_args, get_model, initialize_megatron
+from megatron.training.checkpointing import save_checkpoint
+from megatron.training.utils import print_rank_0, unwrap_model
+
+QUANT_CFG_CHOICES = {
+    "int8": mtq.INT8_DEFAULT_CFG,
+    "int8_sq": mtq.INT8_SMOOTHQUANT_CFG,
+    "fp8": mtq.FP8_DEFAULT_CFG,
+    "int4_awq": mtq.INT4_AWQ_CFG,
+    "w4a8_awq": mtq.W4A8_AWQ_BETA_CFG,
+    "int4": mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG,
+}
+
+
+def add_trtllm_ckpt_export_args(parser):
+    """Add additional arguments for TensorRT-LLM."""
+    group = parser.add_argument_group(title="trtllm")
+
+    group.add_argument(
+        "--export-dir", type=str, help="The output TensorRT-LLM checkpoint.",
+    )
+    group.add_argument(
+        "--decoder", type=str, choices=["gptnext", 'llama'], help="The decoder type of the model.",
+    )
+    group.add_argument(
+        "--inference-tensor-parallel",
+        type=int,
+        help="Tensor parallel for the inference time, can be different from the training config.",
+        default=1,
+    )
+
+
+def add_text_generate_ptq_args(parser):
+    """Add additional arguments for ModelOpt text generation PTQ."""
+    group = parser.add_argument_group(title='ModelOpt text generation ptq')
+    group.add_argument(
+        "--calib-dataset",
+        type=str,
+        default="cnn_dailymail",
+        help="Calibration datasets from HuggingFace datasets.",
+    )
+    group.add_argument(
+        "--calib-batch-size", type=int, default=4, help="Batch size to use for ptq calibration."
+    )
+    group.add_argument(
+        "--calib-size", type=int, default=512, help="Samples to use for ptq calibration."
+    )
+    parser.add_argument(
+        "--prompts",
+        type=str,
+        default=(
+            "Born in north-east France, Soyer trained as a|Born in California, Soyer trained as a"
+        ),
+        help="Input texts. Please use | to separate different batches.",
+    )
+    add_modelopt_args(parser)
+    add_trtllm_ckpt_export_args(parser)
+    return parser
+
+
+def get_calib_dataloader(
+    data="cnn_dailymail", batch_size=4, calib_size=512, max_sequence_length=512
+):
+    if data == "pileval":
+        dataset = load_dataset(
+            "json", data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", split="train"
+        )
+        text_column = "text"
+    elif data == "wikitext":
+        dataset = load_dataset("wikitext", "wikitext-103-v1", split="train")
+        text_column = "text"
+    elif data == "cnn_dailymail":
+        dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train")
+        text_column = "article"
+
+    calib_size = max(min(len(dataset), calib_size), batch_size)
+    for i in range(calib_size // batch_size):
+        batch = dataset[i * batch_size : (i + 1) * batch_size][text_column]
+        for j in range(len(batch)):
+            batch[j] = batch[j][:max_sequence_length]
+        yield batch
+
+
+
+if __name__ == "__main__":
+    initialize_megatron(
+        extra_args_provider=add_text_generate_ptq_args,
+        args_defaults={
+            'tokenizer_type': 'GPT2BPETokenizer',
+            'no_load_rng': True,
+            'no_load_optim': True,
+        },
+    )
+
+    args = get_args()
+    if args.num_layers_per_virtual_pipeline_stage is not None:
+        print_rank_0("Interleaved pipeline schedule is not yet supported for text generation.")
+        exit()
+
+    print_rank_0("WARNING: Forcing exit_on_missing_checkpoint to True for text generation.")
+    args.exit_on_missing_checkpoint = True
+    if hasattr(args, 'moe_grouped_gemm') and args.moe_grouped_gemm == True:
+        print_rank_0("WARNING: Forcing moe_grouped_gemm to False for PTQ and export.")
+        args.moe_grouped_gemm = False
+
+    # Set up model and load checkpoint
+    # [ModelOpt]: make sure that output logits are allgathered.
+    text_generation_model_provider = functools.partial(model_provider, parallel_output=False)
+    model = get_model(text_generation_model_provider, wrap_with_ddp=False)
+
+    if args.load is not None:
+        load_modelopt_checkpoint(model, strict=not args.untie_embeddings_and_output_weights)
+        print_rank_0("Done loading checkpoint")
+
+    # Removing virtual pipeline parallel and other wrapper
+    assert len(model) == 1, "Above condition should have caught this"
+    unwrapped_model = unwrap_model(model)
+
+    all_prompts = args.prompts.split("|")
+
+    def custom_prompt_forward_loop_func(model):
+        for prompt in tqdm(all_prompts):
+            if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
+                (
+                    prompts_plus_generations,
+                    prompts_plus_generations_segments,
+                    logprobs,
+                    _,
+                ) = generate_and_post_process(
+                    model,
+                    prompts=[prompt],
+                    tokens_to_generate=128,
+                    return_output_log_probs=True,
+                    temperature=1.0,
+                )
+                print_rank_0(prompts_plus_generations)
+            else:
+                generate_and_post_process(model)
+
+    def hf_dataset_forword_loop_func(model):
+        dataloader = get_calib_dataloader(args.calib_dataset, args.calib_batch_size, args.calib_size)
+        for prompts in tqdm(dataloader, total=args.calib_size//args.calib_batch_size):
+            if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
+                (
+                    prompts_plus_generations,
+                    prompts_plus_generations_segments,
+                    logprobs,
+                    _,
+                ) = generate_and_post_process(
+                    model,
+                    prompts=prompts,
+                    tokens_to_generate=0,
+                    return_output_log_probs=False,
+                    temperature=1.0,
+                )
+            else:
+                generate_and_post_process(model)
+
+    ptq_forward_loop_func = custom_prompt_forward_loop_func
+    if args.calib_dataset is not None:
+        ptq_forward_loop_func = hf_dataset_forword_loop_func
+
+    if args.export_quant_cfg in QUANT_CFG_CHOICES:
+        mtq_config = QUANT_CFG_CHOICES[args.export_quant_cfg]
+        if "*output_layer*" not in mtq_config["quant_cfg"]:
+            mtq_config["quant_cfg"]["*output_layer*"] = {"enable": False}
+        if "awq" in args.export_quant_cfg:
+            weight_quantizer = mtq_config["quant_cfg"]["*weight_quantizer"]  # type: ignore
+            if isinstance(weight_quantizer, list):
+                weight_quantizer = weight_quantizer[0]
+            weight_quantizer["block_sizes"][-1] = 128
+        print_rank_0("Quantizing the model...")
+        mtq.quantize(unwrapped_model[0], mtq_config, ptq_forward_loop_func)
+
+    custom_prompt_forward_loop_func(model[0])
+
+    if args.save is not None and args.export_quant_cfg in QUANT_CFG_CHOICES:
+        save_checkpoint(1, unwrapped_model, None, None, 0)
+
+    print_rank_0(f"Fake Quantized Model:\n {unwrapped_model[0]}")
+
+    if args.export_dir:
+        assert args.decoder in ["gptnext", "llama"], f"Decoder type {args.decoder} not supported."
+        Path(args.export_dir).mkdir(parents=True, exist_ok=True)
+        print_rank_0("Exporting TensorRT-LLM checkpoints.")
+
+        from modelopt.torch.export import export_tensorrt_llm_checkpoint
+
+        # In TRT LLM, squared relu activation does not support bf16. So we use fp16 by default.
+        export_tensorrt_llm_checkpoint(
+            unwrapped_model[0],
+            args.decoder,
+            torch.bfloat16 if args.bf16 else torch.float16,
+            export_dir=args.export_dir,
+            inference_tensor_parallel=args.inference_tensor_parallel,
+            inference_pipeline_parallel=1,
+            use_nfs_workspace=True,
+        )
+
+        print_rank_0(f"TensorRT-LLM checkpoints saved to {args.export_dir}")
+        torch.distributed.barrier()
--- a/examples/export/ptq_and_trtllm_export/trtllm_text_generation.py
+++ b/examples/export/ptq_and_trtllm_export/trtllm_text_generation.py
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+"""An example script to run the tensorrt_llm engine."""
+
+import argparse
+from pathlib import Path
+import subprocess
+from typing import Optional, Union
+
+import numpy as np
+import torch
+from modelopt.deploy.llm import LLM
+from tensorrt_llm.models import PretrainedConfig
+from transformers import AutoTokenizer, T5Tokenizer
+import tensorrt_llm
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--tokenizer", type=str, default="")
+    parser.add_argument("--engine-dir", type=str, default="/tmp/trtllm_engine")
+    parser.add_argument(
+        "--input-texts",
+        type=str,
+        default=(
+            "Born in north-east France, Soyer trained as a|Born in California, Soyer trained as a"
+        ),
+        help="Input texts. Please use | to separate different batches.",
+    )
+    return parser.parse_args()
+
+
+def run(args):
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, trust_remote_code=True)
+    except Exception as e:
+        raise Exception(f"Failed to load tokenizer: {e}")
+
+    print(tokenizer, tokenizer.vocab_size)
+
+    input_texts = args.input_texts.split("|")
+    assert input_texts, "input_text not specified"
+    print(input_texts)
+
+    free_memory_before = torch.cuda.mem_get_info()
+
+    # This is a ModelOpt wrapper on top of tensorrt_llm.hlapi.llm.LLM
+    llm_engine = LLM(args.engine_dir, tokenizer)
+
+    torch.cuda.cudart().cudaProfilerStart()
+    # outputs = llm_engine.generate_text(input_texts, args.max_output_len, args.max_beam_width)
+    outputs = llm_engine.generate(input_texts)
+    torch.cuda.cudart().cudaProfilerStop()
+
+    free_memory_after = torch.cuda.mem_get_info()
+    print(
+        f"Used GPU memory: {(free_memory_before[0] - free_memory_after[0]) / 1024 / 1024 / 1024} GB"
+    )
+    print(outputs)
+
+
+if __name__ == "__main__":
+    args = parse_arguments()
+    run(args)
--- a/examples/export/trtllm_export/README.md
+++ b/examples/export/trtllm_export/README.md
+# Megatron Core To TRTLLM Export Documentation
+This guide will walk you through how you can use the megatron core export for exporting models to trtllm format
+
+### Contents
+- [Megatron Core To TRTLLM Export Documentation](#megatron-core-to-trtllm-export-documentation)
+- [Contents](#contents)
+  - [1. Quick Start](#1-quick-start)
+    - [1.1 Understanding The Code](#11-understanding-the-code)
+    - [1.2 Running The Code](#12-running-the-code)
+  - [2. GPU Export](#2-gpu-export)
+  - [3. Future work](#4-future-work)
+
+#### 1. Quick Start
+This will walk you through the flow of converting an mcore gpt model to trtllm format using single device mode. The file can be found at [gpt_single_device_cpu_export.py](./single_device_export/gpt_single_device_cpu_export.py)
+
+NOTE: For faster performance, if your entire model will fit into gpu memory, pre transfer the model state dict to gpu and then call the get_trtllm_pretrained_config_and_model_weights function.
+
+<br>
+
+##### 1.1 Understanding The Code
+***STEP 1 - We initialize model parallel and other default arguments***
+We initalize tp and pp to 1 so that we can get the full model state dict on cpu
+```python
+    initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
+```
+
+***STEP 2 - We load the model using the model_provider_function***
+NOTE: We create a simple gpt model
+
+```python
+    transformer_config = TransformerConfig(
+        num_layers=2, 
+        hidden_size=64, # Needs to be atleast 32 times num_attn_heads
+        num_attention_heads=2, 
+        use_cpu_initialization=True, 
+        pipeline_dtype=torch.float32,
+    )
+
+    gpt_model = GPTModel(
+        config=transformer_config, 
+        transformer_layer_spec=get_gpt_layer_local_spec(), 
+        vocab_size=100, 
+        max_sequence_length=_SEQUENCE_LENGTH,
+    )
+
+    # Optionally you can also load a model using this code 
+    # sharded_state_dict=gpt_model.sharded_state_dict(prefix='')
+    # checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
+    # gpt_model.load_state_dict(checkpoint)
+
+```
+
+***STEP 3 - Instantiate the TRTLLM Helper***
+We instantiate the [TRTLLM Helper](../../../megatron/core/export/trtllm/trtllm_helper.py)  For the GPT model we instantiate trtllm_helper as shown below.
+```python
+    if hasattr(gpt_model, "rotary_pos_emb"):
+        seq_len_interpolation_factor =  gpt_model.rotary_pos_emb.seq_len_interpolation_factor
+
+    trtllm_helper = TRTLLMHelper(
+                        transformer_config=gpt_model.config, 
+                        model_type=ModelType.gpt,
+                        position_embedding_type = gpt_model.position_embedding_type, 
+                        max_position_embeddings = gpt_model.max_position_embeddings, 
+                        rotary_percentage = gpt_model.rotary_percent,
+                        rotary_base = gpt_model.rotary_base,
+                        moe_tp_mode = 2,
+                        multi_query_mode = False,
+                        activation = "gelu", 
+                        seq_len_interpolation_factor = seq_len_interpolation_factor,
+                        share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights
+                    )   
+```
+
+***STEP 4 - Get the TRTLLM Weights and configs***
+To convert model weights to trtllm weights and configs, we use the [single_device_converter](../../../megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py). We pass as inputs the model state dict, and export config. In this example we use inference tp size as 2 for the export. 
+
+```python
+    model_state_dict={}
+    for key , val in gpt_model.state_dict().items():
+        # val is non for _extra_state layers . We filter it out
+        if val is not None:
+            model_state_dict[key] = val
+
+    export_config = ExportConfig(inference_tp_size = 2)
+    weight_list, config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
+        model_state_dict= model_state_dict,
+        dtype = DataType.bfloat16,
+        export_config=export_config
+    )
+```
+
+***STEP 5 - Build the TRTLLM Engine***
+Following code is used to build the TRTLLM Engine. 
+
+```python
+    for trtllm_model_weights, trtllm_model_config in zip(weight_list, config_list):
+        trtllm_helper.build_and_save_engine(
+            max_input_len=256,
+            max_output_len=256,
+            max_batch_size=8,
+            engine_dir='/opt/megatron-lm/engine',
+            trtllm_model_weights=trtllm_model_weights,
+            trtllm_model_config=trtllm_model_config,
+            lora_ckpt_list=None,
+            use_lora_plugin=None,
+            max_lora_rank=64,
+            lora_target_modules=None,
+            max_prompt_embedding_table_size=0,
+            paged_kv_cache=True,
+            remove_input_padding=True,
+            paged_context_fmha=False,
+            use_refit=False,
+            max_num_tokens=None,
+            max_seq_len=512,
+            opt_num_tokens=None,
+            max_beam_width=1,
+            tokens_per_block=128,
+            multiple_profiles=False,
+            gpt_attention_plugin="auto",
+            gemm_plugin="auto",
+        )
+```
+<br>
+
+##### 1.2 Running The Code
+An example run script is shown below. 
+
+```
+# In a workstation 
+MLM_PATH=/path/to/megatron-lm
+CONTAINER_IMAGE=gitlab-master.nvidia.com:5005/dl/joc/nemo-ci/trtllm_0.12/train:pipe.17669124-x86
+
+docker run -it --gpus=all --ipc=host -v $MLM_PATH/:/opt/megatron-lm $CONTAINER_IMAGE bash
+
+# Inside the container run the following. 
+
+cd /opt/megatron-lm/
+
+CUDA_VISIBLE_DEVICES=0 torchrun --nproc-per-node 1  examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py
+```
+
+<br>
+
+#### 2. GPU Export
+You can use the [gpt_distributed_gpu_export.py](./distributed_export/gpt_distributed_gpu_export.py) to run a more optimized on device distributed. version of trtllm export. Internally this uses the [distributed_converter](../../../megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py) to convert model weights on device. 
+In the single device version you collect all the model weights on CPU/GPU, convert it to trtllm format, and then store the engine back on disk. In the GPU version you load each individual state dict on the gpus, convert it on the device itself and store the engine on disk. 
+
+To run the gpu version 
+
+```
+CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc-per-node 2  examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py
+```
+
+<br>
+
+#### 3. Future work
+The following are planned for the future releases . 
+* Pipeline parallellism for export (Work in progress) 
+* GPU Export for more models (Work in progress for some models)
+* Refit functionality
+* VLLM Support
\ No newline at end of file
--- a/examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py
+++ b/examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py
+import os
+import torch
+from megatron.core import parallel_state
+from megatron.core import dist_checkpointing
+from megatron.core.export.model_type import ModelType
+from megatron.core.export.data_type import DataType
+from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.models.gpt.gpt_model import GPTModel
+from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
+
+
+_SEQUENCE_LENGTH = 64
+_VOCAB_SIZE = 256
+
+def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1):
+    parallel_state.destroy_model_parallel()
+
+    # Torch setup for distributed training
+    rank = int(os.environ['LOCAL_RANK'])
+    world_size = torch.cuda.device_count()
+    torch.cuda.set_device(rank)
+    torch.distributed.init_process_group(world_size=world_size, rank=rank)
+
+    # Megatron core distributed training initialization
+    parallel_state.initialize_model_parallel(tensor_model_parallel_size = tensor_model_parallel_size, pipeline_model_parallel_size=pipeline_model_parallel_size)
+
+def model_provider():
+    """Build the model."""
+
+    transformer_config = TransformerConfig(
+        num_layers=2, 
+        hidden_size=64, 
+        num_attention_heads=2, 
+        use_cpu_initialization=True, 
+        pipeline_dtype=torch.float32
+    )
+
+    gpt_model = GPTModel(
+        config=transformer_config, 
+        transformer_layer_spec=get_gpt_layer_local_spec(), 
+        vocab_size=_VOCAB_SIZE, 
+        max_sequence_length=_SEQUENCE_LENGTH,
+    )
+
+    return gpt_model
+
+def load_distributed_checkpoint(checkpoint_path, gpt_model):
+    sharded_state_dict=gpt_model.sharded_state_dict(prefix='')
+    checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
+    gpt_model.load_state_dict(checkpoint)
+    return gpt_model
+
+if __name__ == "__main__":
+    initialize_distributed(tensor_model_parallel_size=2, pipeline_model_parallel_size=1)
+    model_parallel_cuda_manual_seed(123)
+
+    gpt_model = model_provider()
+    device = torch.device("cuda")
+    gpt_model.to(device) 
+    
+    # Optionally you can also load a gpt model from ckpt_path using this code below
+    # gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path)
+
+    seq_len_interpolation_factor = None
+    if hasattr(gpt_model, "rotary_pos_emb"):
+        seq_len_interpolation_factor =  gpt_model.rotary_pos_emb.seq_len_interpolation_factor
+
+    trtllm_helper = TRTLLMHelper(
+                        transformer_config=gpt_model.config, 
+                        model_type=ModelType.gpt,
+                        position_embedding_type = gpt_model.position_embedding_type, 
+                        max_position_embeddings = gpt_model.max_position_embeddings, 
+                        rotary_percentage = gpt_model.rotary_percent,
+                        rotary_base = gpt_model.rotary_base,
+                        moe_tp_mode = 2,
+                        multi_query_mode = False,
+                        activation = "gelu", 
+                        seq_len_interpolation_factor = seq_len_interpolation_factor,
+                        share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights
+                    )
+    
+
+    trtllm_model_weights, trtllm_model_config = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
+        model_state_dict= gpt_model.state_dict(),
+        dtype = DataType.bfloat16,
+        on_device_distributed_conversion=True, 
+        vocab_size=_VOCAB_SIZE, 
+        gpus_per_node=2,
+    )
+
+    trtllm_helper.build_and_save_engine(
+        max_input_len=256,
+        max_output_len=256,
+        max_batch_size=8,
+        engine_dir='/opt/megatron-lm/engine',
+        trtllm_model_weights=trtllm_model_weights[0],
+        trtllm_model_config=trtllm_model_config[0],
+        lora_ckpt_list=None,
+        use_lora_plugin=None,
+        max_lora_rank=64,
+        lora_target_modules=None,
+        max_prompt_embedding_table_size=0,
+        paged_kv_cache=True,
+        remove_input_padding=True,
+        paged_context_fmha=False,
+        use_refit=False,
+        max_num_tokens=None,
+        max_seq_len=512,
+        opt_num_tokens=None,
+        max_beam_width=1,
+        tokens_per_block=128,
+        multiple_profiles=False,
+        gpt_attention_plugin="auto",
+        gemm_plugin="auto",
+    )
--- a/examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py
+++ b/examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py
+import os
+import torch
+from megatron.core import parallel_state
+from megatron.core import dist_checkpointing
+from megatron.core.export.model_type import ModelType
+from megatron.core.export.data_type import DataType
+from megatron.core.export.export_config import ExportConfig
+from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.models.gpt.gpt_model import GPTModel
+from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
+
+
+_SEQUENCE_LENGTH = 64
+
+
+def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1):
+    parallel_state.destroy_model_parallel()
+
+    # Torch setup for distributed training
+    rank = int(os.environ['LOCAL_RANK'])
+    world_size = torch.cuda.device_count()
+    torch.cuda.set_device(rank)
+    torch.distributed.init_process_group(world_size=world_size, rank=rank)
+
+    # Megatron core distributed training initialization
+    parallel_state.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size)
+
+def model_provider():
+    """Build the model."""
+
+    transformer_config = TransformerConfig(
+        num_layers=2, 
+        hidden_size=64, # Needs to be atleast 32 times num_attn_heads
+        num_attention_heads=2, 
+        use_cpu_initialization=True, 
+        pipeline_dtype=torch.float32,
+    )
+
+    gpt_model = GPTModel(
+        config=transformer_config, 
+        transformer_layer_spec=get_gpt_layer_local_spec(), 
+        vocab_size=100, 
+        max_sequence_length=_SEQUENCE_LENGTH,
+    )
+
+    return gpt_model
+
+def load_distributed_checkpoint(checkpoint_path, gpt_model):
+    sharded_state_dict=gpt_model.sharded_state_dict(prefix='')
+    checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
+    gpt_model.load_state_dict(checkpoint)
+    return gpt_model
+
+if __name__ == "__main__":
+    # Need to use TP1 PP1 for export on single device
+    initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
+    model_parallel_cuda_manual_seed(123)
+
+    gpt_model = model_provider()
+
+    # Optionally you can also load a gpt model from ckpt_path using this code below
+    # gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path)
+
+    seq_len_interpolation_factor = None
+    if hasattr(gpt_model, "rotary_pos_emb"):
+        seq_len_interpolation_factor =  gpt_model.rotary_pos_emb.seq_len_interpolation_factor
+
+    trtllm_helper = TRTLLMHelper(
+                        transformer_config=gpt_model.config, 
+                        model_type=ModelType.gpt,
+                        position_embedding_type = gpt_model.position_embedding_type, 
+                        max_position_embeddings = gpt_model.max_position_embeddings, 
+                        rotary_percentage = gpt_model.rotary_percent,
+                        rotary_base = gpt_model.rotary_base,
+                        moe_tp_mode = 2,
+                        multi_query_mode = False,
+                        activation = "gelu", 
+                        seq_len_interpolation_factor = seq_len_interpolation_factor,
+                        share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights
+                    )
+    
+
+    export_config = ExportConfig(inference_tp_size = 2)
+    # NOTE : For faster performance, if your entire model will fit in gpu memory, transfer model state dict to GPU and then call this api
+    weight_list, config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
+        model_state_dict= gpt_model.state_dict(),
+        dtype = DataType.bfloat16,
+        export_config=export_config
+    )
+
+    for trtllm_model_weights, trtllm_model_config in zip(weight_list, config_list):
+        trtllm_helper.build_and_save_engine(
+            max_input_len=256,
+            max_output_len=256,
+            max_batch_size=8,
+            engine_dir='/opt/megatron-lm/engine',
+            trtllm_model_weights=trtllm_model_weights,
+            trtllm_model_config=trtllm_model_config,
+            lora_ckpt_list=None,
+            use_lora_plugin=None,
+            max_lora_rank=64,
+            lora_target_modules=None,
+            max_prompt_embedding_table_size=0,
+            paged_kv_cache=True,
+            remove_input_padding=True,
+            paged_context_fmha=False,
+            use_refit=False,
+            max_num_tokens=None,
+            max_seq_len=512,
+            opt_num_tokens=None,
+            max_beam_width=1,
+            tokens_per_block=128,
+            multiple_profiles=False,
+            gpt_attention_plugin="auto",
+            gemm_plugin="auto",
+        )
\ No newline at end of file
--- a/examples/gpt3/README.md
+++ b/examples/gpt3/README.md
+# GPT3 MODEL
+
+## Table of contents
+- [1. Training Setup](#1-training-setup)
+- [2. Configurations](#2-configurations)
+- [3. Training Results](#3-training-results)
+
+## 1. Training setup
+<a id="markdown-training-setup" name="training-setup"></a>
+
+To run the model using a docker container run it as follows
+```
+PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
+CHECKPOINT_PATH="" #<Specify path>
+TENSORBOARD_LOGS_PATH=""#<Specify path>
+VOCAB_FILE="" #<Specify path to file>/gpt2-vocab.json
+MERGE_FILE="" #<Specify path to file>/gpt2-merges.txt
+DATA_PATH="" #<Specify path and file prefix>_text_document
+
+docker run \
+  --gpus=all \
+  --ipc=host \
+  --workdir /workspace/megatron-lm \
+  -v /path/to/data:/path/to/data \
+  -v /path/to/megatron-lm:/workspace/megatron-lm \
+  megatron-lm nvcr.io/nvidia/pytorch:24.01-py3 \
+  bash examples/gpt3/train_gpt3_175b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $MERGE_FILE $DATA_PATH "
+
+```
+NOTE: Depending on the environment you are running it the above command might like slightly different.
+
+
+## 2. Configurations
+<a id="markdown-configurations" name="configurations"></a>
+The example in this folder shows you how to run 175B model. There are other configs you could run as well
+
+### 345M
+```
+       --num-layers 12 \
+       --hidden-size 512 \
+       --num-attention-heads 8 \
+       --seq-length 1024 \
+       --tensor-model-parallel-size 1 \
+       --pipeline-model-parallel-size 1 \
+
+```
+
+### 857M
+```
+       --num-layers 24 \
+       --hidden-size 1024 \
+       --num-attention-heads 16 \
+       --seq-length 2048 \
+       --tensor-model-parallel-size 1 \
+       --pipeline-model-parallel-size 1 \
+
+```
--- a/examples/gpt3/gpt_config.yaml
+++ b/examples/gpt3/gpt_config.yaml
+# WARNING: Yaml configs is currently an experimental feature
+language_model:
+  # model architecture
+  num_layers: 24
+  hidden_size: 1024
+  num_attention_heads: 16
+  num_query_groups: null
+
+  ffn_hidden_size: null
+  kv_channels: null
+  hidden_dropout: 0.0
+  attention_dropout: 0.0
+  fp32_residual_connection: False
+
+  apply_residual_connection_post_layernorm: False
+  layernorm_epsilon: 1.e-5
+  layernorm_zero_centered_gamma: True
+  add_bias_linear: False
+  bias_activation_fusion: False
+  add_qkv_bias: False
+  gated_linear_unit: False
+  activation_func: swiglu
+  num_moe_experts: null
+  rotary_interleaved: False
+  window_size: null
+
+  # initialization
+  init_method: null
+  init_method_std: 0.02
+  output_layer_init_method: null
+
+  # mixed-precision
+  apply_query_key_layer_scaling: False
+  attention_softmax_in_fp32: False
+
+  # fusion
+  bias_swiglu_fusion: True
+  masked_softmax_fusion: True
+  persist_layer_norm: False
+  memory_efficient_layer_norm: False
+  bias_dropout_fusion: True
+  apply_rope_fusion: True
+
+  # activation recomputation
+  recompute_granularity: null
+  recompute_method: null
+  recompute_num_layers: null
+  distribute_saved_activations: null
+
+  # fp8 related
+  fp8: null
+  fp8_margin: 0
+  fp8_interval: 1
+  fp8_amax_history_len: 1
+  fp8_amax_compute_algo: "most_recent"
+  fp8_wgrad: True
+
+  # miscellaneous
+  clone_scatter_output_in_embedding: True
+
+  normalization: "LayerNorm"  # alt value supported by TE: "RMSNorm"
+
+  # MoE related
+  moe_router_load_balancing_type: "aux_loss"
+  moe_router_topk: 2
+  moe_router_topk_limited_devices: null
+  moe_grouped_gemm: False
+  moe_aux_loss_coeff: 0  # 1e-2 would be a good start value for load balance loss.
+  moe_z_loss_coeff: null  # 1e-3 would be a good start value for z-loss
+  moe_input_jitter_eps: null
+  moe_token_dropping: False
+
+model_parallel:
+  # Model parallelism
+  tensor_model_parallel_size: 1
+  context_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  virtual_pipeline_model_parallel_size: null
+  sequence_parallel: True
+  expert_model_parallel_size: 1
+
+  # Initialization
+  perform_initialization: True
+  use_cpu_initialization: null
+
+  # Training
+  fp16: False
+  bf16: True
+  params_dtype: null # Set from above arguments for core
+  timers: null
+
+  # Optimizations
+  gradient_accumulation_fusion: True
+  async_tensor_model_parallel_allreduce: True
+  tp_comm_overlap: False
+
+  # Debug Options
+  tp_comm_split_ag: True
+  tp_comm_atomic_ag: True
+  tp_comm_split_rs: True
+  tp_comm_atomic_rs: True
+  tp_comm_bulk_wgrad: True
+  tp_comm_bulk_dgrad: True
+
+  # Parallelism
+  finalize_model_grads_func: null
+
+  # Pipeline Parallel
+  pipeline_dtype: null
+  grad_scale_func: null
+  enable_autocast: False
+  autocast_dtype: null
+  variable_seq_lengths: False
+  num_microbatches_with_partial_activation_checkpoints: null
+  overlap_p2p_comm: False
+  batch_p2p_comm: True
+  batch_p2p_sync: True
+  use_ring_exchange_p2p: False
+  deallocate_pipeline_outputs: False
+  no_sync_func: null
+  grad_sync_func: null
+  param_sync_func: null
+  pipeline_model_parallel_split_rank: null
+
+  # CPU Offloading
+  cpu_offloading: False
+  cpu_offloading_num_layers: 0
+  _cpu_offloading_context: null
+  cpu_offloading_weights: False
+  cpu_offloading_activations: True
+
+  # Timing
+  barrier_with_L1_time: True
+
+# training:
+use_legacy_models: False
+spec: null
+micro_batch_size: 2
+global_batch_size: 128
+rampup_batch_size: [32, 32, 65324160] 
+check_for_nan_in_loss_and_grad: True
+num_layers_per_virtual_pipeline_stage: null
+
+encoder_num_layers: null
+decoder_num_layers: null
+rotary_seq_len_interpolation_factor: null
+add_position_embedding: False
+make_vocab_size_divisible_by: 128
+group_query_attention: False
+
+
+exit_signal_handler: False
+exit_duration_in_mins: null
+exit_interval: null
+
+untie_embeddings_and_output_weights: True
+position_embedding_type: rope
+rotary_percent: 0.5
+openai_gelu: False
+squared_relu: False
+swiglu: True
+onnx_safe: null
+bert_binary_head: True
+max_position_embeddings: 4096
+
+transformer_impl: local
+use_flash_attn: False
+seed: 1234
+data_parallel_random_init: False
+
+# Optimizer
+optimizer: adam
+lr: 2.5e-4
+lr_decay_style: cosine
+lr_decay_iters: null
+lr_decay_samples: 255126953
+lr_warmup_fraction: null
+lr_warmup_iters: 0
+lr_warmup_samples: 81381
+lr_warmup_init: 0.0
+min_lr: 2.5e-5
+weight_decay: 0.1
+start_weight_decay: null
+end_weight_decay: null
+weight_decay_incr_style: constant
+clip_grad: 1.0
+adam_beta1: 0.9
+adam_beta2: 0.95
+adam_eps: 1.e-08
+sgd_momentum: 0.9
+override_opt_param_scheduler: False
+use_checkpoint_opt_param_scheduler: False
+
+# checkpointing arguments
+save: null
+save_interval: 20000
+no_save_optim: null
+no_save_rng: null
+load: null
+no_load_optim: null
+no_load_rng: null
+finetune: False
+use_checkpoint_args: False
+exit_on_missing_checkpoint: False
+
+# loss arguments
+loss_scale: null
+initial_loss_scale: 4294967296
+min_loss_scale: 1.0
+loss_scale_window: 1000 
+hysteresis: 2
+accumulate_allreduce_grads_in_fp32: False
+fp16_lm_cross_entropy: False
+
+# distributed arguments
+distributed_backend: nccl
+distributed_timeout_minutes: 10
+overlap_grad_reduce: False
+align_grad_reduce: True
+overlap_param_gather: False
+align_param_gather: False
+scatter_gather_tensors_in_pipeline: True
+local_rank: null
+lazy_mpu_init: null
+empty_unused_memory_level: 0
+standalone_embedding_stage: False
+use_distributed_optimizer: False
+nccl_communicator_config_path: null
+
+train_iters: null
+eval_iters: 32
+eval_interval: 2000
+skip_train: False
+
+adlr_autoresume: False
+adlr_autoresume_interval: 1000
+
+# garbage collection
+manual_gc: False
+manual_gc_interval: 0
+manual_gc_eval: True
+
+tp_comm_overlap_cfg: null
+
+#data
+data_path: null
+split: '99,1,0'
+train_data_path: null
+valid_data_path: null
+test_data_path: null
+data_cache_path: null
+mock_data: False
+vocab_size: null
+vocab_file: null
+merge_file: null
+vocab_extra_ids: 0
+seq_length: 4096
+encoder_seq_length: null
+decoder_seq_length: null
+retriever_seq_length: 256
+sample_rate: 1.0
+mask_prob: 0.15
+short_seq_prob: 0.1
+num_workers: 2
+tokenizer_type: GPTSentencePieceTokenizer
+tokenizer_model: null
+reset_position_ids: False
+reset_attention_mask: False
+eod_mask_loss: False
+train_samples: 268554688
+dataloader_type: null
+
+#profile:
+profile: False
+profile_ranks: [0]
+profile_step_end: 12
+profile_step_start: 10
+
+#logging:
+log_params_norm: True
+log_num_zeros_in_grad: True
+log_throughput: False
+log_progress: False
+timing_log_level: 0
+timing_log_option: minmax
+tensorboard_log_interval: 1
+tensorboard_queue_size: 1000
+log_timers_to_tensorboard: False
+log_validation_ppl_to_tensorboard: False
+log_memory_to_tensorboard: False
+log_world_size_to_tensorboard: False
+log_loss_scale_to_tensorboard: True
+wandb_project: ''
+wandb_exp_name: ''
+wandb_save_dir: ''
+enable_one_logger: True
+one_logger_project: megatron-lm
+one_logger_run_name: null
+log_interval: 100
+tensorboard_dir: null
--- a/examples/gpt3/train_gpt3_175b_distributed.sh
+++ b/examples/gpt3/train_gpt3_175b_distributed.sh
+#!/bin/bash
+
+# Runs the "175B" parameter model
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NUM_NODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
+
+CHECKPOINT_PATH=$1 #<Specify path>
+TENSORBOARD_LOGS_PATH=$2 #<Specify path>
+VOCAB_FILE=$3 #<Specify path to file>/gpt2-vocab.json
+MERGE_FILE=$4 #<Specify path to file>/gpt2-merges.txt
+DATA_PATH=$5 #<Specify path and file prefix>_text_document
+
+DISTRIBUTED_ARGS=(
+    --nproc_per_node $GPUS_PER_NODE 
+    --nnodes $NUM_NODES 
+    --master_addr $MASTER_ADDR 
+    --master_port $MASTER_PORT
+)
+
+GPT_MODEL_ARGS=(
+    --num-layers 96 
+    --hidden-size 12288 
+    --num-attention-heads 96 
+    --seq-length 2048 
+    --max-position-embeddings 2048 
+    --attention-backend auto # Can use (flash/fused/unfused/local)
+)
+
+TRAINING_ARGS=(
+    --micro-batch-size 1 
+    --global-batch-size 1536 
+    --rampup-batch-size 16 16 5859375 
+    --train-iters 500000 
+    --weight-decay 0.1 
+    --adam-beta1 0.9 
+    --adam-beta2 0.95 
+    --init-method-std 0.006 
+    --clip-grad 1.0 
+    --fp16
+    --lr 6.0e-5 
+    --lr-decay-style cosine 
+    --min-lr 6.0e-6
+    --lr-warmup-fraction .001 
+    --lr-decay-iters 430000 
+)
+
+MODEL_PARALLEL_ARGS=(
+	--tensor-model-parallel-size 8 
+	--pipeline-model-parallel-size 16 
+)
+
+DATA_ARGS=(
+    --data-path $DATA_PATH 
+    --vocab-file $VOCAB_FILE 
+    --merge-file $MERGE_FILE 
+    --split 949,50,1
+)
+
+EVAL_AND_LOGGING_ARGS=(
+    --log-interval 100
+    --save-interval 10000 
+    --eval-interval 1000 
+    --save $CHECKPOINT_PATH 
+    --load $CHECKPOINT_PATH 
+    --eval-iters 10
+    --tensorboard-dir $TENSORBOARD_LOGS_PATH 
+)
+
+torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
+    ${GPT_MODEL_ARGS[@]} \
+    ${TRAINING_ARGS[@]} \
+    ${MODEL_PARALLEL_ARGS[@]} \
+    ${DATA_ARGS[@]} \
+    ${EVAL_AND_LOGGING_ARGS[@]}
--- a/examples/inference/README.md
+++ b/examples/inference/README.md
+### Megatron Core Inference Documentation
+This guide provides an example for Megatron Core for running model inference. 
+
+### Contents
+- [Megatron Core Inference Documentation](#megatron-core-inference-documentation)
+- [Contents](#contents)
+  - [1. Quick Start](#1-quick-start)
+    - [1.1 Understanding The Code](#11-understanding-the-code)
+    - [1.2 Running The Code](#12-running-the-code)
+  - [2. Flow of Control In MCore Backend](#2-flow-of-control-in-mcore-backend)
+  - [3. Customizing The Inference Pipeline](#3-customizing-the-inference-pipeline)
+    - [3.1. Create Your Own Inference Backend](#31-create-your-own-inference-backend)
+    - [3.2. Create Your Own Text Generation Controller](#32-create-your-own-text-generation-controller)
+    - [3.3. Support Other Models](#33-support-other-models)
+    - [3.3. Modify Inference Parameters](#33-modify-inference-parameters)
+  - [4. Future work](#4-future-work)
+
+<br>
+
+#### 1. Quick Start
+This example runs batch inference on a GPT model trained using Megatron Core. The entrypoint is [simple_gpt_batch_inference.py](./gpt/gpt_batch_inference.py)
+
+<br>
+
+##### 1.1 Code Walkthrough 
+***STEP 1 - Initialize model parallel and other default arguments***
+The micro batch size is set as 1 as it is not used in tensor-parallelism only, and for pipeline-parallel models it is calculated at runtime. 
+```python
+    initialize_megatron(
+        args_defaults={'no_load_rng': True, 'no_load_optim': True, 'micro_batch_size': 1}
+    )
+```
+
+***STEP 2 - Load the model using the model_provider_function***
+NOTE: The model provider function supports both MCore and Legacy models. 
+
+```python
+    model = get_model(model_provider, wrap_with_ddp=False)
+    load_checkpoint(model, None, None)
+    model = model[0]
+```
+
+***STEP 3 - Choose an engine***
+Text generation requires an inference engine, which includes a scheduler. The default engine is the [Megatron Core engine](../../megatron/core/inference/engine/mcore_engine.py) with a simple [text generation controller](../../megatron/core/inference/text_generation_controllers/text_generation_controller.py). TRTLLMEngine will be supported in the future.
+```python
+    inference_wrapped_model = GPTInferenceWrapper(model, args)
+    text_generation_controller = TextGenerationController(
+        inference_wrapped_model=inference_wrapped_model, 
+        tokenizer=tokenizer
+    )
+    inference_backend = MCoreEngine(
+        text_generation_controller=text_generation_controller, max_batch_size=args.max_batch_size
+    )
+```
+
+***STEP 4 - Run text generation***
+The [SamplingParams](../../megatron/core/inference/sampling_params.py) contains suggested defaults. Customize this to change top_p, top_k, number of tokens to generate etc. 
+*Note: The result is returned as a list of [InferenceRequests](../../megatron/core/inference/inference_request.py)*
+```python
+    results: List[InferenceRequest] = inference_engine.generate(
+        prompts=args.prompts, sampling_params=sampling_params
+    )
+    
+    if torch.distributed.get_rank() == 0:
+        for idx, result in enumerate(results):
+            print(f' ------------- RESULT FOR PROMPT {idx} --------------- ')
+            result = {
+                'id': result.request_id,
+                'input_prompt': result.prompt, 
+                'generated_text': result.generated_text,
+                'generated_tokens' : result.generated_tokens
+                }
+            print(result)
+```
+
+<br>
+
+##### 1.2 Running The Code
+An example run script is shown below. Set the tokenizer paths, inference params, and other settings appropriately. 
+
+For a quick recap on sampling parameters, refer to [this blog](https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-temperature-parameters-ed6a31313910).
+
+```
+# In a slurm cluster (You could also use docker)
+ACCOUNT=<account>
+MLM_PATH=/path/to/megatron-lm
+GPT_CKPT=/path/to/gpt/ckpt
+VOCAB_MERGE_FILE_PATH=/path/to/vocab/and/merge/file
+CONTAINER_IMAGE=nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.11
+
+srun --account $ACCOUNT \
+--job-name=$ACCOUNT:inference \
+--partition=batch \
+--time=01:00:00 \
+--container-image $CONTAINER_IMAGE \
+--container-mounts $MLM_PATH:/workspace/megatron-lm/,$GPT_CKPT:/workspace/mcore_gpt_ckpt,$VOCAB_MERGE_FILE_PATH:/workspace/tokenizer \
+--no-container-mount-home \
+--pty /bin/bash \
+
+# Inside the container run the following. 
+
+cd megatron-lm/
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+TOKENIZER_ARGS=(
+    --vocab-file /workspace/tokenizer/gpt2-vocab.json
+    --merge-file /workspace/tokenizer/gpt2-merges.txt
+    --tokenizer-type GPT2BPETokenizer
+)
+
+MODEL_ARGS=(
+    --use-checkpoint-args
+    --use-mcore-models
+    --load /workspace/mcore_gpt_ckpt
+)
+
+INFERENCE_SPECIFIC_ARGS=(
+    --attention-dropout 0.0
+    --hidden-dropout 0.0
+    --num-tokens-to-generate 20
+    --max-batch-size 4
+)
+
+torchrun --nproc-per-node=4 examples/inference/gpt/simple_gpt_batch_inference.py \
+    ${TOKENIZER_ARGS[@]} \
+    ${MODEL_ARGS[@]} \
+    ${INFERENCE_SPECIFIC_ARGS[@]} \
+    --prompts "prompt one " "sample prompt two" "sample prompt 3"
+
+NOTE: Other parameters which can be customized for inference are :-
+--temperature (Sampling temperature)
+--top_k (top_k sampling)
+--top_p (top_p sampling)
+--num-tokens-to-generate (Number of tokens to generate for each prompt)
+--inference-batch-times-seqlen-threshold (During inference, if batch-size times sequence-length is smaller than this threshold then we will not use pipelining, otherwise we will.')
+--use-dist-ckpt (If using dist checkpoint format for the model)
+--use-legacy-models (If using legacy gpt model instead of mcore gpt model)
+
+```
+
+
+<br>
+
+
+#### 2. Control Flow in the MCore Backend
+An example of inference with static batching is provided in [gpt_batch_inference.py](./gpt/gpt_batch_inference.py).
+* [mcore_engine](../../megatron/core/inference/engines/mcore_engine.py) **generate()** function is called with the input prompts.
+* The `Scheduler` in the engine will add these prompts to the [active requests] pool (../../megatron/core/inference/inference_request.py) until max batch size is hit. Remaining requests will be added to the waiting requests pool. 
+* The engine will run until all requests (waiting + active) are completed. 
+    * The active requests are passed into  **generate_all_output_tokens_static_batch()** of the text generation controller . 
+    * This function uses the **prep_model_for_inference()** method of the [model_inference_wrappers](../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) and runs an autoregressive sampling loop
+    * In the autoregressive loop, the **get_batch_for_context_window()** method of the inference wrapper is called to slice out the input tokens and masks
+    * Input tokens and masks are passed it into the **run_one_forward_step()** method, which calls the model `.forward()` method to get the output logits
+    * Output logits are synchronized across all pipeline parallel ranks
+    * The text generation controller obtains the log probabilities and samples tokens based on the strategy defined in the sampling parameters.
+    * The sampled tokens are then appended to the input prompt tokens for the next iteration 
+    * The **update_generation_status()** method of the text generation controller checks which prompts have finished generating or hit a stop condition
+    * After the inference loop, the result is detokenized and stored as an attribute of the InferenceRequest. These requests are marked as completed. 
+    * The **update_requests_pool()** method of the scheduler moves completed requests into the completed request pool and waiting requests into the active request pool
+
+<br>
+
+#### 3. Customizing The Inference Pipeline
+
+The inference pipeline supports three levels of customization:
+
+* **Inference engine** - The MCore Engine is currently supported. Change this to add a new backend.
+* **Text generation controller** - The main sampling loop. This can be customized to support alternative tokenization, detokenization, or to implement a new sampling strategy.
+* **Inference Wrapped Model** - Change this to support a new model.
+* **Modify Inference Parameters** - Change this to update top_p, top_k, number of tokens to be generated, temperature, or other sampling parameters.
+
+<br>
+
+##### 3.1. Create Your Own Inference Backend 
+The  [abstract_engine.py](./../../megatron/core/inference/engine/abstract_engine.py) file contains a `generate` method that can be extended to support a new backend. 
+
+```python
+class AbstractEngine(ABC):
+    @staticmethod
+    def generate(self) -> dict:
+        """The abstract backend's generate function. 
+
+        To define a new backend, implement this method and return the outputs as a dictionary. 
+```
+
+<br>
+
+##### 3.2. Implement a new Sampling Loop 
+
+The [TextGenerationController](../../megatron/core/inference/text_generation_controllers/text_generation_controller.py) contains the main sampling loop and can be modified to support new tokenization, detokenization, or sampling strategies.
+
+``` python
+class TextGenerationController:
+
+    def tokenize_prompt(self, prompt: str) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Utility to tokenize the input prompts"""
+
+    def sample_from_logits(
+        self,
+        last_token_logits: torch.Tensor,
+        sampling_params: SamplingParams,
+        vocab_size: int,
+    ) -> torch.Tensor:
+        """Samples the logits to generate outputs
+
+        Given the logits of the last token, this function samples according to the parameters defined in sampling_params and returns the sampled tokens.
+        """
+
+    def update_generation_status(
+        self,
+        updated_prompts_tokens: torch.Tensor,
+        generation_started: torch.Tensor,
+        current_context_end_position: int,
+        is_generation_done_tensor: torch.Tensor,
+        generated_sequence_lengths: torch.Tensor,
+    ) -> torch.Tensor:
+        """Function to check which prompts have reached an end condition
+
+        We check which prompts have reached an end condition and set the corresponding flags of the is_generation_done_tensor to True . The generated sequence lengths increases as we keep generating, until that prompts hits an eod condition. The generation started status tensor helps us determine which prompts have started generating
+        """
+
+    def generate_all_output_tokens_static_batch(
+        self, active_requests: OrderedDict[int, InferenceRequest],
+    ) -> OrderedDict[int, InferenceRequest]:
+        """Utility to generate all the output tokens and probabilities for the prompts .
+
+        This utility generates the output tokens for a static batch. It runs the forward steps till all prompts complete generation, updates the status of these requests to completed, adds the generated result and returns these requests
+        """
+
+    def detokenize_generations(self, prompt_tokens_with_generated_tokens: torch.Tensor) -> str:
+        """Detokenize the output generations"""
+```
+
+<br>
+
+##### 3.3. Support Other Models
+Extend [abstract_model_inference_wrapper.py](./../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) to support other models. The abstract model wrapper implements: 
+* Forward method which calls the model `forward` method depending on model parallel settings
+* Initializes the model and puts it in `.eval()` mode
+* Setup for the input parameters (max batch size, max seq length) 
+
+The following methods should be implemented: 
+```python
+class AbstractModelInferenceWrapper:
+    def prep_model_for_inference(self, prompts_tokens: torch.Tensor):
+        """A utility function for preparing model for inference
+
+        The function gets called once before the auto regressive inference loop. It puts the model in eval mode , and gets some model and inference data parameters. Extend this to build position ids ,attention mask etc, so that required slices can be extracted during the forward pass
+        """
+
+    @abc.abstractclassmethod
+    def get_batch_for_context_window(self) -> List:
+        """Returns the input data for inference 
+
+        This function gets called iteratively in the inference loop. It can be used to extract relevant input from the prompt tokens, attention mask etc. required for each step in inference.
+```
+
+Refer to [gpt_inference_wrapper.py](../../megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py) for an example of implementing this for GPTModel.
+
+<br>
+
+##### 3.3. Modify Inference Parameters
+We use  [common inference params](../../megatron/core/inference/sampling_params.py) for text generation. Customize this if you want to change top_p, top_k, number of tokens to generate etc. If you want to add other attributes that you would use in the inference loop, you can do that as shown below
+
+```
+from megatron.core.inference.sampling_params import SamplingParams
+
+c = SamplingParams(temperature=0.5)
+c.add_attributes({'min_length':4, 'eod_id':153})
+```
+
+<br>
+
+#### 4. Future work
+The following features are planned for the future releases. 
+* Dynamic batching 
+* Paged Attention
+* TRTLLM Engine support
+* Support for multimodal inference
\ No newline at end of file
--- a/examples/inference/gpt/gpt_batch_inference.py
+++ b/examples/inference/gpt/gpt_batch_inference.py
+import os
+from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig
+from pretrain_gpt import model_provider
+import torch
+import sys
+from argparse import Namespace
+from megatron.core.inference.engines.abstract_engine import AbstractEngine
+from megatron.core.inference.engines.mcore_engine import MCoreEngine
+from megatron.core.inference.sampling_params import SamplingParams
+from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import GPTInferenceWrapper
+from megatron.core.inference.inference_request import InferenceRequest
+from megatron.core.inference.text_generation_controllers.text_generation_controller import TextGenerationController
+from megatron.core.transformer.module import MegatronModule
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
+                                             os.path.pardir, os.path.pardir)))
+
+from megatron.training import get_args
+from megatron.training import get_tokenizer
+from megatron.training.checkpointing import load_checkpoint
+from megatron.core import mpu
+from megatron.training.initialize import initialize_megatron
+from megatron.training import get_model
+from typing import List
+
+def add_text_generate_args(parser):
+    """Text generation arguments."""
+    group = parser.add_argument_group(title='text generation')
+
+    group.add_argument("--temperature", type=float, default=1.0,
+                       help='Sampling temperature.')
+    group.add_argument("--top_k", type=int, default=1,
+                       help='Top k sampling.')
+    group.add_argument("--top_p", type=float, default=0.0,
+                       help='Top p sampling.')
+    group.add_argument("--return-log-probs", action='store_true', default=False,
+                       help='Return the log probabilities of the final output tokens')
+    group.add_argument("--num-tokens-to-generate", type=int, default=30,
+                       help='Number of tokens to generate for each prompt')
+    group.add_argument("--prompts", metavar='N', type=str, nargs='+',
+                       help='Input prompts with each prompt within quotes and seperated by space')
+    group.add_argument("--max-batch-size", type=int, default=1,
+                       help='Max number of prompts to process at once')
+    return parser
+
+
+def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngine:
+    """Utility to get the relevant backend for running inference
+
+    This function will automatically chose the TRTLLMBackend when possible, and if not revert to Mcore backend if the user does not specify any backends. TRT LLM Backend is not implmented yet. 
+
+    Args:
+        args (Namespace): The user arguments parsed from command line
+        model (MegatronModule): The megatron model . 
+
+    Returns:
+        AbstractBackend: The chosen backend
+    """
+    tokenizer = get_tokenizer()
+
+    inference_wrapper_config = InferenceWrapperConfig(
+        hidden_size=args.hidden_size,
+        inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold,
+        fp32_residual_connection=args.fp32_residual_connection,
+        params_dtype=args.params_dtype,
+        padded_vocab_size=args.padded_vocab_size
+    )
+
+    inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config)
+    text_generation_controller = TextGenerationController(inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer)
+    return MCoreEngine(text_generation_controller=text_generation_controller, max_batch_size=args.max_batch_size)
+            
+def main():
+    """Main program."""
+
+    # Note: The default args passed here can be overwritten by using appropriate params (check arguments.py file)
+    # Micro batch size is not needed to be set by user. (It is calculated based on inference-batch-times-seqlen-threshold argument)
+    initialize_megatron(extra_args_provider=add_text_generate_args,
+                        args_defaults={'no_load_rng': True,
+                                       'no_load_optim': True,
+                                       'micro_batch_size': 1, 
+                                       'exit_on_missing_checkpoint': True})
+
+    # Set up model and load checkpoint
+    model = get_model(model_provider, wrap_with_ddp=False)
+    load_checkpoint(model, None, None)
+    model = model[0]
+
+    args = get_args()
+
+    inference_engine = get_inference_engine(args, model)
+
+    sampling_params = SamplingParams(
+        temperature=args.temperature, 
+        top_k=args.top_k, 
+        top_p=args.top_p, 
+        return_log_probs=args.return_log_probs, 
+        num_tokens_to_generate=args.num_tokens_to_generate)
+
+    results: List[InferenceRequest] = inference_engine.generate(
+        prompts=args.prompts, sampling_params=sampling_params
+    )
+    
+    if torch.distributed.get_rank() == 0:
+        for idx, result in enumerate(results):
+            print(f' \n------------- RESULT FOR PROMPT {idx} --------------- ')
+            result = {
+                'id': result.request_id,
+                'input_prompt': result.prompt, 
+                'generated_text': result.generated_text,
+                'generated_tokens' : result.generated_tokens
+                }
+            print(result)
+
+if __name__ == "__main__":
+    main()
--- a/examples/inference/llama_mistral/huggingface_reference.py
+++ b/examples/inference/llama_mistral/huggingface_reference.py
+import argparse
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+
+# Set up argument parsing
+parser = argparse.ArgumentParser(description="Script for text generation with a specific model and prompt.")
+parser.add_argument('--prompt', type=str, required=True, help="Prompt text to use for text generation")
+parser.add_argument('--model-path', type=str, required=True, help="Path to the Huggingface model checkpoint")
+
+# Parse command-line arguments
+args = parser.parse_args()
+
+model_path = args.model_path
+prompt = args.prompt
+
+config = AutoConfig.from_pretrained(model_path)
+tokenizer = AutoTokenizer.from_pretrained(model_path, config=config)
+model = AutoModelForCausalLM.from_pretrained(model_path, config=config).cuda()
+
+inputs = tokenizer(prompt, return_tensors="pt")
+for key in inputs:
+    inputs[key] = inputs[key].cuda()
+# top_k, top_p and do_sample are set for greedy argmax based sampling
+
+outputs = model.generate(**inputs, max_length=100, do_sample=False, top_p=0, top_k=0, temperature=1.0)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
\ No newline at end of file
--- a/examples/inference/llama_mistral/run_text_generation_llama3.1.sh
+++ b/examples/inference/llama_mistral/run_text_generation_llama3.1.sh
+#!/bin/bash
+# This example will start serving the Llama3.1-8B model
+export NCCL_IB_SL=1
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export NVTE_APPLY_QK_LAYER_SCALING=0
+
+DISTRIBUTED_ARGS="--nproc_per_node 1 \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr 0.0.0.0 \
+                  --master_port 6000"
+
+# Ensure CHECKPOINT and TOKENIZER_MODEL are provided
+if [ -z "$1" ] || [ -z "$2" ]; then
+  echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments."
+  echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model"
+  exit 1
+fi
+
+# Assign command-line arguments to variables
+CHECKPOINT=$1
+TOKENIZER_MODEL=$2
+
+pip install flask-restful
+
+torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py   \
+      --use-checkpoint-args \
+      --disable-bias-linear \
+      --tokenizer-type HuggingFaceTokenizer \
+      --tokenizer-model ${TOKENIZER_MODEL} \
+      --transformer-impl transformer_engine \
+      --normalization RMSNorm \
+      --group-query-attention \
+      --num-query-groups 8 \
+      --no-masked-softmax-fusion \
+      --attention-softmax-in-fp32 \
+      --attention-dropout 0.0 \
+      --hidden-dropout 0.0 \
+      --untie-embeddings-and-output-weights \
+      --position-embedding-type rope \
+      --rotary-percent 1.0 \
+      --rotary-base 500000 \
+      --use-rope-scaling \
+      --use-rotary-position-embeddings \
+      --swiglu \
+      --tensor-model-parallel-size 1  \
+      --pipeline-model-parallel-size 1  \
+      --num-layers 32  \
+      --hidden-size 4096  \
+      --ffn-hidden-size 14336 \
+      --load ${CHECKPOINT}  \
+      --num-attention-heads 32  \
+      --max-position-embeddings 131072  \
+      --bf16  \
+      --micro-batch-size 1  \
+      --seq-length 8192
--- a/examples/inference/llama_mistral/run_text_generation_llama3.sh
+++ b/examples/inference/llama_mistral/run_text_generation_llama3.sh
+#!/bin/bash
+# This example will start serving the Llama3-8B model
+export NCCL_IB_SL=1
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export NVTE_APPLY_QK_LAYER_SCALING=0
+
+DISTRIBUTED_ARGS="--nproc_per_node 1 \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr 0.0.0.0 \
+                  --master_port 6000"
+
+# Ensure CHECKPOINT and TOKENIZER_MODEL are provided
+if [ -z "$1" ] || [ -z "$2" ]; then
+  echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments."
+  echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model"
+  exit 1
+fi
+
+# Assign command-line arguments to variables
+CHECKPOINT=$1
+TOKENIZER_MODEL=$2
+
+pip install flask-restful
+
+torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py   \
+      --use-checkpoint-args \
+      --disable-bias-linear \
+      --tokenizer-type HuggingFaceTokenizer \
+      --tokenizer-model ${TOKENIZER_MODEL} \
+      --transformer-impl transformer_engine \
+      --normalization RMSNorm \
+      --group-query-attention \
+      --num-query-groups 8 \
+      --no-masked-softmax-fusion \
+      --attention-softmax-in-fp32 \
+      --attention-dropout 0.0 \
+      --hidden-dropout 0.0 \
+      --untie-embeddings-and-output-weights \
+      --position-embedding-type rope \
+      --rotary-percent 1.0 \
+      --rotary-base 500000 \
+      --use-rotary-position-embeddings \
+      --swiglu \
+      --tensor-model-parallel-size 1  \
+      --pipeline-model-parallel-size 1  \
+      --num-layers 32  \
+      --hidden-size 4096  \
+      --ffn-hidden-size 14336 \
+      --load ${CHECKPOINT}  \
+      --num-attention-heads 32  \
+      --max-position-embeddings 8192  \
+      --bf16  \
+      --micro-batch-size 1  \
+      --seq-length 8192
--- a/examples/inference/llama_mistral/run_text_generation_mistral.sh
+++ b/examples/inference/llama_mistral/run_text_generation_mistral.sh
+#!/bin/bash
+# This example will start serving the Mistral-7B-v0.3 model
+export NCCL_IB_SL=1
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+DISTRIBUTED_ARGS="--nproc_per_node 1 \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr 0.0.0.0 \
+                  --master_port 6000"
+
+# Ensure CHECKPOINT and TOKENIZER_MODEL are provided
+if [ -z "$1" ] || [ -z "$2" ]; then
+  echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments."
+  echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model"
+  exit 1
+fi
+
+# Assign command-line arguments to variables
+CHECKPOINT=$1
+TOKENIZER_MODEL=$2
+
+pip install flask-restful
+
+torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py   \
+       --tokenizer-type HuggingFaceTokenizer \
+       --tokenizer-model ${TOKENIZER_MODEL} \
+       --use-checkpoint-args \
+       --apply-layernorm-1p \
+       --transformer-impl transformer_engine \
+       --normalization RMSNorm \
+       --group-query-attention \
+       --num-query-groups 8 \
+       --no-masked-softmax-fusion \
+       --use-flash-attn \
+       --untie-embeddings-and-output-weights \
+       --disable-bias-linear \
+       --position-embedding-type rope \
+       --rotary-percent 1.0 \
+       --rotary-base 1000000 \
+       --swiglu \
+       --ffn-hidden-size 14336 \
+       --tensor-model-parallel-size 1  \
+       --pipeline-model-parallel-size 1  \
+       --num-layers 32  \
+       --hidden-size 4096  \
+       --load ${CHECKPOINT}  \
+       --num-attention-heads 32  \
+       --max-position-embeddings 4096  \
+       --bf16  \
+       --micro-batch-size 1  \
+       --seq-length 4096  \
+       --seed 101
--- a/examples/inference/run_text_generation_server_345M.sh
+++ b/examples/inference/run_text_generation_server_345M.sh
+#!/bin/bash
+# This example will start serving the 345M model.
+DISTRIBUTED_ARGS="--nproc_per_node 1 \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+
+CHECKPOINT=<Path to checkpoint (e.g /345m)>
+VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
+MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+pip install flask-restful
+
+torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py   \
+       --tensor-model-parallel-size 1  \
+       --pipeline-model-parallel-size 1  \
+       --num-layers 24  \
+       --hidden-size 1024  \
+       --load ${CHECKPOINT}  \
+       --num-attention-heads 16  \
+       --max-position-embeddings 1024  \
+       --tokenizer-type GPT2BPETokenizer  \
+       --fp16  \
+       --micro-batch-size 1  \
+       --seq-length 1024  \
+       --vocab-file $VOCAB_FILE  \
+       --merge-file $MERGE_FILE  \
+       --seed 42
--- a/examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh
+++ b/examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh
+#!/bin/bash
+# This example will start serving the 345M model that is partitioned 8 way tensor parallel
+DISTRIBUTED_ARGS="--nproc_per_node 8 \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+
+CHECKPOINT=<Path to checkpoint (e.g /345m)>
+VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
+MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
+
+pip install flask-restful
+
+python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py   \
+       --tensor-model-parallel-size 8  \
+       --pipeline-model-parallel-size 1  \
+       --num-layers 24  \
+       --hidden-size 1024  \
+       --load ${CHECKPOINT}  \
+       --num-attention-heads 16  \
+       --max-position-embeddings 1024  \
+       --tokenizer-type GPT2BPETokenizer  \
+       --fp16  \
+       --micro-batch-size 1  \
+       --seq-length 1024  \
+       --vocab-file $VOCAB_FILE  \
+       --merge-file $MERGE_FILE  \
+       --seed 42
--- a/examples/inference/t5/simple_t5_batch_inference.py
+++ b/examples/inference/t5/simple_t5_batch_inference.py
+import os
+import sys
+from argparse import Namespace
+
+import torch
+
+import pretrain_t5
+from megatron.core.inference.sampling_params import SamplingParams
+from megatron.core.inference.engines.abstract_engine import AbstractEngine
+from megatron.core.inference.engines.mcore_engine import MCoreEngine
+from megatron.core.inference.inference_request import InferenceRequest
+from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
+    InferenceWrapperConfig,
+)
+from megatron.core.inference.model_inference_wrappers.t5.t5_inference_wrapper import (
+    T5InferenceWrapper,
+)
+from megatron.core.inference.text_generation_controllers.encoder_decoder_text_generation_controller import (
+    EncoderDecoderTextGenerationController,
+)
+from megatron.core.transformer.module import MegatronModule
+from pretrain_t5 import model_provider
+
+sys.path.append(
+    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
+)
+
+from typing import List
+
+from megatron.core import mpu
+from megatron.training import get_args, get_model, get_tokenizer
+from megatron.training.checkpointing import load_checkpoint
+from megatron.training.initialize import initialize_megatron
+
+
+def add_text_generate_args(parser):
+    """Text generation arguments."""
+    group = parser.add_argument_group(title='text generation')
+
+    group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.')
+    group.add_argument("--top_k", type=int, default=1, help='Top k sampling.')
+    group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.')
+    group.add_argument(
+        "--return-log-probs",
+        action='store_true',
+        default=False,
+        help='Return the log probabilities of the final output tokens',
+    )
+    group.add_argument(
+        "--num-tokens-to-generate",
+        type=int,
+        default=30,
+        help='Number of tokens to generate for each prompt',
+    )
+    group.add_argument(
+        "--encoder-prompts",
+        metavar='N',
+        type=str,
+        nargs='+',
+        help='Encoder input prompts with each prompt within quotes and seperated by space',
+    )
+    group.add_argument(
+        "--max-batch-size", type=int, default=1, help='Max number of prompts to process at once'
+    )
+    return parser
+
+
+def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngine:
+    """Utility to get the relevant backend for running inference
+
+    This function will automatically chose the TRTLLMBackend when possible, and if not revert to Mcore backend if the user does not specify any backends. TRT LLM Backend is not implmented yet.
+
+    Args:
+        args (Namespace): The user arguments parsed from command line
+        model (MegatronModule): The megatron model .
+
+    Returns:
+        AbstractBackend: The chosen backend
+    """
+    tokenizer = get_tokenizer()
+
+    inference_wrapper_config = InferenceWrapperConfig(
+        hidden_size=args.hidden_size,
+        inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold,
+        fp32_residual_connection=args.fp32_residual_connection,
+        params_dtype=args.params_dtype,
+        padded_vocab_size=args.padded_vocab_size,
+    )
+
+    inference_wrapped_model = T5InferenceWrapper(model, inference_wrapper_config)
+    text_generation_controller = EncoderDecoderTextGenerationController(
+        inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer
+    )
+    return MCoreEngine(
+        text_generation_controller=text_generation_controller, max_batch_size=args.max_batch_size
+    )
+
+
+def main():
+    """Main program."""
+
+    # Note: The default args passed here can be overwritten by using appropriate params (check arguments.py file)
+    # Micro batch size is not needed to be set by user. (It is calculated based on inference-batch-times-seqlen-threshold argument)
+    initialize_megatron(
+        extra_args_provider=add_text_generate_args,
+        args_defaults={
+            'no_load_rng': True,
+            'no_load_optim': True,
+            'micro_batch_size': 1,
+            'exit_on_missing_checkpoint': True,
+        },
+    )
+
+    # Set up model and load checkpoint
+    model = get_model(model_provider, wrap_with_ddp=False)
+    load_checkpoint(model, None, None)
+    model = model[0]
+
+    args = get_args()
+
+    inference_engine = get_inference_engine(args, model)
+
+    sampling_params = SamplingParams(
+        temperature=args.temperature,
+        top_k=args.top_k,
+        top_p=args.top_p,
+        return_log_probs=args.return_log_probs,
+        num_tokens_to_generate=args.num_tokens_to_generate,
+    )
+
+    tokenizer = get_tokenizer()
+    decoder_prompts = [""] * len(
+        args.encoder_prompts
+    )  # for T5, the prompt is provided as encoder input, hence decoder_prompts is empty
+    args.prompts = decoder_prompts
+
+    results: List[InferenceRequest] = inference_engine.generate(
+        prompts=args.prompts,
+        add_BOS=True,
+        encoder_prompts=args.encoder_prompts,
+        sampling_params=sampling_params,
+    )
+
+    if torch.distributed.get_rank() == 0:
+        for idx, result in enumerate(results):
+            print(f' \n------------- RESULT FOR PROMPT {idx} --------------- ')
+            result = {
+                'id': result.request_id,
+                'input_prompt': result.prompt,
+                'generated_text': result.generated_text,
+                'generated_tokens': result.generated_tokens,
+            }
+            print(result)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/mamba/.gitignore
+++ b/examples/mamba/.gitignore
+checkpoints/
+data-cache/
+tensorboard/
+triton-cache/
--- a/examples/mamba/Dockerfile
+++ b/examples/mamba/Dockerfile
+FROM nvcr.io/nvidia/pytorch:24.01-py3
+
+RUN pip uninstall -y triton && \
+    pip install triton==2.1.0 sentencepiece==0.1.99 flask-restful
+
+# The causal-conv1d and mamba-ssm packages below are built from scratch here
+# (which takes significant time) because there are no wheels available on PyPI
+# for these relatively newer versions of the packages that are compatible with
+# the older NGC-variant PyTorch version (e.g. version 2.2.0.dev231106) that we
+# are using (in the NGC base container). Generally, if the package is not
+# compatible with the PyTorch version, then it will generate a Python import
+# error. The package authors tend to only release wheels for new versions of
+# these pacakges which are compatible with the versions of regular PyTorch and
+# NGC-variant PyTorch that are newer at the time of release. So, to use newer
+# versions of these packages with relatively older versions of the NGC PyTorch
+# container, we tend to have to build the packages from scratch.
+
+RUN cd /tmp && \
+    git clone https://github.com/Dao-AILab/causal-conv1d.git && \
+    cd causal-conv1d && \
+    git checkout v1.2.2.post1 && \
+    CAUSAL_CONV1D_FORCE_BUILD=TRUE pip install . && \
+    cd .. && \
+    rm -rf causal-conv1d
+
+RUN cd /tmp && \
+    git clone https://github.com/state-spaces/mamba.git && \
+    cd mamba && \
+    git checkout v2.0.3 && \
+    MAMBA_FORCE_BUILD=TRUE pip install . && \
+    cd .. && \
+    rm -rf mamba