Commit 160bf237 authored by wangxj

Update 0.12

parent b01809dd
Pipeline #2448 failed with stages
......@@ -14,6 +14,10 @@ def broadcast_from_last_pipeline_stage(size, dtype, tensor=None):
"""Broadcast a tensor from last pipeline stage to all ranks."""
if parallel_state.is_pipeline_last_stage():
assert size == list(
tensor.shape
), f"Expected tensor of shape {size} but got {list(tensor.shape)}"
assert dtype == tensor.dtype, f"Expected tensor of type {dtype} but got {tensor.dtype}"
_is_cuda(tensor)
assert tensor.is_contiguous()
else:
......
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from typing import Dict, List
import asyncio
import warnings
from collections import OrderedDict
from typing import AsyncGenerator, Dict, List, Optional, Union
import torch
from megatron.core.inference.common_inference_params import CommonInferenceParams
from megatron.core.inference.async_stream import AsyncStream
from megatron.core.inference.engines.abstract_engine import AbstractEngine
from megatron.core.inference.inference_request import InferenceRequest
from megatron.core.inference.sampling_params import SamplingParams
from megatron.core.inference.scheduler import Scheduler
from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import (
SimpleTextGenerationController,
from megatron.core.inference.text_generation_controllers.text_generation_controller import (
TextGenerationController,
)
......@@ -19,31 +23,105 @@ class MCoreEngine(AbstractEngine):
Supports any model that is callable (accepts the inputs and outputs the tensor)
Args:
text_generation_controller (SimpleTextGenerationController): A text generation
text_generation_controller (TextGenerationController): A text generation
controller that will be used to define how to preprocess prompts, generate
outputs, and detokenize the output tokens.
max_batch_size : The maximum number of requests to process at once
max_batch_size (int, optional): The maximum number of requests to process at once.
Will be set from the InferenceWrapperConfig in `text_generation_controller` by
default.
random_seed (int, optional): Use a random seed if you want deterministic
results. Defaults to None.
"""
def __init__(
self,
text_generation_controller: SimpleTextGenerationController,
max_batch_size,
random_seed: int = None,
text_generation_controller: TextGenerationController,
max_batch_size: Optional[int] = None,
random_seed: Optional[int] = None,
):
inference_wrapper_config = (
text_generation_controller.inference_wrapped_model.inference_wrapper_config
)
inference_max_batch_size = inference_wrapper_config.inference_max_requests
if max_batch_size is None:
max_batch_size = inference_max_batch_size
elif max_batch_size > inference_max_batch_size:
warnings.warn(
f"Engine `max_batch_size` ({max_batch_size}) > "
f"`inference_max_requests` in `inference_wrapper_config` "
f"({inference_max_batch_size}); setting `max_batch_size` to "
f"{inference_max_batch_size}",
UserWarning,
)
max_batch_size = inference_max_batch_size
self.text_generation_controller = text_generation_controller
self.random_seed = random_seed
self.scheduler = Scheduler(max_batch_size=max_batch_size)
def get_new_request_id(self) -> str:
"""Gets a new request id from the scheduler"""
return self.scheduler.get_new_request_id()
def add_request(
self,
prompt: Optional[str] = None,
add_BOS: bool = False,
encoder_prompt: Optional[str] = None,
inference_parameters: Optional[SamplingParams] = None,
streaming: bool = False,
inference_request: Optional[InferenceRequest] = None,
) -> str:
"""
Adds a request to the scheduler and returns the request ID.
Args:
prompt (str): A prompt string
add_BOS (bool): Whether to add BOS token to beginning of the prompt
encoder_prompt (str): The encoder prompt string
inference_parameters (SamplingParams): The inference parameters
streaming (bool): Whether to stream incremental outputs for this request
inference_request (InferenceRequest, optional): A fully constructed request.
Defaults to None.
Returns:
The newly created request ID.
"""
assert (
prompt is not None or inference_request is not None
), f"At least one of `prompt` or `inference_request` must be specified"
if inference_request is None:
prompt_tokens = self.text_generation_controller.tokenize_prompt(prompt, add_BOS)
else:
prompt_tokens = inference_request.prompt_tokens
return self.scheduler.add_request(
prompt=prompt,
prompt_tokens=prompt_tokens,
encoder_prompt=encoder_prompt,
inference_parameters=inference_parameters,
streaming=streaming,
inference_request=inference_request,
)
def get_stream_generator(
self, request_id: str
) -> Union[AsyncGenerator[InferenceRequest, None], None]:
"""Returns the stream generator for the given request ID if it exists."""
stream = self.scheduler.streams.get(request_id, None)
if stream is not None:
return stream.generator()
return None
def generate(
self,
prompts: List[str],
prompts: Optional[List[str]] = None,
add_BOS: bool = False,
encoder_prompts: List[str] = None,
common_inference_params: CommonInferenceParams = None,
) -> dict:
encoder_prompts: Optional[List[str]] = None,
common_inference_params: Optional[SamplingParams] = None,
sampling_params: Optional[SamplingParams] = None,
inference_requests: Optional[List[InferenceRequest]] = None,
) -> List[InferenceRequest]:
"""The megatron core inference backend generate function
This backend returns the output generations as a list of inference requests.
......@@ -54,31 +132,47 @@ class MCoreEngine(AbstractEngine):
prompts (List[str]): All the prompts as a list of strings
add_BOS (bool): Whether to add BOS token to beginning of prompts
encoder_prompts (List[str]): All the encoder prompts as a list of strings
common_inference_params (CommonInferenceParams): The inference parameters
common_inference_params: Deprecated. Only used for backward compatibility with
MCore <= 0.9.0. Use `sampling_params` going forward.
sampling_params (SamplingParams): The request-level sampling parameters
inference_requests (List[InferenceRequest]): A pre-populated list of inference requests
Returns:
List[InferenceRequest]: The output is list of inference requests containing the
generated tokens, texts and log probs if required
"""
# TODO (mcore): get rng state tracker
request_ids: List[str] = []
if self.random_seed:
torch.random.manual_seed(self.random_seed)
for i in range(len(prompts)):
prompt = prompts[i]
encoder_prompt = encoder_prompts[i] if encoder_prompts is not None else None
prompt_tokens = self.text_generation_controller.tokenize_prompt(prompt, add_BOS)
if inference_requests is None:
assert prompts is not None
self.scheduler.add_request(
prompt=prompt,
prompt_tokens=prompt_tokens,
encoder_prompt=encoder_prompt,
inference_parameters=common_inference_params,
)
if common_inference_params:
sampling_params = common_inference_params
for i in range(len(prompts)):
prompt = prompts[i]
encoder_prompt = encoder_prompts[i] if encoder_prompts is not None else None
request_id = self.add_request(
prompt=prompt,
encoder_prompt=encoder_prompt,
inference_parameters=sampling_params,
)
request_ids.append(request_id)
else:
for inference_request in inference_requests:
request_ids.append(inference_request.request_id)
self.scheduler.add_request(inference_request=inference_request)
self.run_engine()
result: List[InferenceRequest] = self.scheduler.completed_request_pool.values()
result: List[InferenceRequest] = [
self.scheduler.completed_request_pool[request_id] for request_id in request_ids
]
return result
def run_engine(self):
......@@ -92,10 +186,15 @@ class MCoreEngine(AbstractEngine):
Defaults to False.
"""
while self.scheduler.have_requests_pending():
active_requests: Dict[int, InferenceRequest] = self.scheduler.active_request_pool.copy()
result_dict: Dict[int, InferenceRequest] = (
active_requests: Dict[str, InferenceRequest] = self.scheduler.active_request_pool.copy()
active_streams: Dict[str, AsyncStream] = OrderedDict()
for request_id in active_requests:
if (stream := self.scheduler.streams.get(request_id, None)) is not None:
assert isinstance(stream, AsyncStream), stream
active_streams[request_id] = stream
result_dict: Dict[str, InferenceRequest] = (
self.text_generation_controller.generate_all_output_tokens_static_batch(
active_requests
active_requests, active_streams
)
)
......@@ -105,9 +204,25 @@ class MCoreEngine(AbstractEngine):
"""
if dynamic_batching:
result_dict: Dict[
int, InferenceRequest
str, InferenceRequest
] = self.text_generation_controller.generate_output_tokens_one_step_dynamic_batch(
active_requests
)
self.scheduler.update_requests_pools(result_dict=result_dict)
"""
def _wrapped_run_engine(self, cuda_device):
"""
Explicitly sets the CUDA device before running the engine.
This is to ensure that the CUDA device is correctly propagated when running
in a new thread context.
"""
torch.cuda.set_device(cuda_device)
self.run_engine()
async def run_engine_async(self):
"""Runs the engine asynchronously using asyncio"""
loop = asyncio.get_running_loop()
await loop.run_in_executor(None, self._wrapped_run_engine, torch.cuda.current_device())
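# --- Illustrative usage sketch (not part of this commit) ----------------------
# A minimal, hedged example of driving the engine API above: queue a streaming
# request, consume its async stream, and run the blocking engine loop in a
# worker thread via run_engine_async(). `build_text_generation_controller()` is
# a hypothetical helper standing in for however the controller is constructed;
# everything else uses only names that appear in this diff.
import asyncio

from megatron.core.inference.engines.mcore_engine import MCoreEngine
from megatron.core.inference.sampling_params import SamplingParams


async def stream_one_prompt(controller) -> None:
    engine = MCoreEngine(text_generation_controller=controller)
    request_id = engine.add_request(
        prompt="Hello, world!",
        inference_parameters=SamplingParams(temperature=0.7, num_tokens_to_generate=64),
        streaming=True,
    )
    stream = engine.get_stream_generator(request_id)

    # Run the engine loop off the event loop thread while consuming the stream.
    engine_task = asyncio.create_task(engine.run_engine_async())
    async for partial_request in stream:
        print(partial_request.generated_text)
    await engine_task


# asyncio.run(stream_one_prompt(build_text_generation_controller()))  # hypothetical helper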
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from dataclasses import dataclass
from enum import Enum
from typing import List
from typing import List, Optional
import torch
from megatron.core.inference.common_inference_params import CommonInferenceParams
from megatron.core.inference.sampling_params import SamplingParams
# class syntax
......@@ -18,7 +18,7 @@ class Status(Enum):
COMPLETED = 4
@dataclass
@dataclass(kw_only=True)
class InferenceRequest:
"""Class for one inference request
......@@ -28,12 +28,25 @@ class InferenceRequest:
request_id: str
prompt: str
inference_parameters: CommonInferenceParams
prompt_tokens: List[int]
arrival_time: float
status: Status
encoder_prompt: str = None
generated_text: str = None
generated_tokens: torch.Tensor = None
generated_log_probs: torch.Tensor = None
generated_length: int = 0
inference_parameters: Optional[SamplingParams] = None
prompt_tokens: Optional[List[int]] = None
arrival_time: Optional[float] = None
status: Optional[Status] = None
encoder_prompt: Optional[str] = None
generated_text: Optional[str] = None
segments: Optional[List[str]] = None
generated_segments: Optional[List[str]] = None
generated_sequence_lengths: Optional[List[int]] = None
generated_tokens: Optional[torch.Tensor] = None
generated_log_probs: Optional[torch.Tensor] = None
generated_length: Optional[int] = None
@dataclass(kw_only=True)
class VLMInferenceRequest(InferenceRequest):
"""Class for a VLM inference request"""
num_img_embeddings_per_tile: int
imgs: torch.Tensor
num_tiles: torch.Tensor
decoder_seq_length: int
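# --- Illustrative sketch (not part of this commit) -----------------------------
# With `@dataclass(kw_only=True)` and the new Optional defaults above, a request
# is built with keyword arguments only (positional construction raises TypeError),
# and everything beyond request_id/prompt may be filled in later by the engine.
from megatron.core.inference.inference_request import InferenceRequest
from megatron.core.inference.sampling_params import SamplingParams

request = InferenceRequest(
    request_id="0",
    prompt="Hello, world!",
    inference_parameters=SamplingParams(num_tokens_to_generate=16),
)
assert request.prompt_tokens is None  # tokenized later by the controller/scheduler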
File mode changed from 100755 to 100644
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import abc
import math
from typing import Iterable, List, Union
from typing import Any, Dict, Iterable, Optional, Union
import torch
......@@ -26,7 +26,7 @@ class AbstractModelInferenceWrapper(abc.ABC):
def __init__(
self,
model: Union['LegacyGPTModel', GPTModel],
model: Union['LegacyGPTModel', GPTModel], # type: ignore[name-defined]
inference_wrapper_config: InferenceWrapperConfig,
):
"""Constructor for the model inference wrapper
......@@ -48,10 +48,15 @@ class AbstractModelInferenceWrapper(abc.ABC):
else self.inference_wrapper_config.params_dtype
)
max_batch_size = self.inference_wrapper_config.inference_max_requests
max_sequence_length = self.inference_wrapper_config.inference_max_seq_length
self.inference_params = InferenceParams(max_batch_size, max_sequence_length)
def prep_model_for_inference(self, prompts_tokens: torch.Tensor):
"""A utility function for preparing model for inference
The function gets called once before the auto regressive inference loop. It puts the model in eval mode , and gets some model and inference data parameters. Extend this to build position ids ,attention mask etc, so that required slices can be extracted during the forward pass.
The function gets called once before the auto regressive inference loop.
It puts the model in eval mode.
Args:
prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_seq_len]
......@@ -63,38 +68,64 @@ class AbstractModelInferenceWrapper(abc.ABC):
self.model_is_pipeline_parallel = not (
parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage()
)
self.prompts_tokens = prompts_tokens
batch_size, max_sequence_length = self.prompts_tokens.shape
self.inference_params = InferenceParams(batch_size, max_sequence_length)
self.inference_params.reset()
@abc.abstractmethod
def prep_inference_input(self, prompts_tokens: torch.Tensor) -> Dict[str, Any]:
"""Prepares the inference input data.
Args:
prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_seq_len]
Returns:
A dict with all the inference input needed for the batch.
"""
raise NotImplementedError()
@abc.abstractmethod
def get_batch_for_context_window(self) -> List:
def get_batch_for_context_window(self, *args, **kwargs) -> Dict[str, Any]:
"""Returns the input data for inference
This function gets called iteratively in the inference loop. It can be used to extract relevant input from the prompt tokens, attention mask, etc., required for each step in inference.
"""
pass
def forward_pass_without_pipeline_parallel(self, inference_input: List) -> torch.Tensor:
"""Utility to carry out simple forward pass for TP or no model parallel models
raise NotImplementedError()
Runs a very simple forward pass for model. Used in the case of models without any parallelism or only tensor parallelism.
def _forward(self, inference_input):
"""Runs a forward pass of the model.
Args:
inference_input (List): A list containing the inputs for the gpt model [tokens, position ids, attention mask]
inference_input(Dict[str, Any]): The input data.
inference_params(InferenceParams): The inference parameters.
Returns:
torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size]
The model output logits.
"""
tokens, position_ids, attention_mask = inference_input
logits = self.model(
tokens = inference_input["tokens"]
position_ids = inference_input["position_ids"]
attention_mask = inference_input["attention_mask"]
return self.model(
tokens, position_ids, attention_mask, inference_params=self.inference_params
)
logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits)
self.inference_params.sequence_len_offset += tokens.size(1)
return logits
def _get_batch_size_and_seq_len(
self, tokens: torch.Tensor, recv_buffer_seq_len: Optional[int] = None
):
"""
Returns the batch size and sequence length based on the tokens tensor and recv_buffer_seq_len.
Args:
tokens (torch.Tensor): The input tensor of shape (batch_size, seq_len).
recv_buffer_seq_len (int, optional): An optional recv buffer sequence length.
Returns:
tuple: A tuple (batch_size, seq_len), where batch_size is the first dimension of tokens
and seq_len is either the second dimension or recv_buffer_seq_len.
"""
batch_size = tokens.shape[0]
seq_len = recv_buffer_seq_len if recv_buffer_seq_len is not None else tokens.shape[1]
return batch_size, seq_len
def _allocate_recv_buffer(self, batch_size, seq_len):
"""Receive happens between the layers with size [seq_len, batch_size, hidden_size]."""
......@@ -103,30 +134,51 @@ class AbstractModelInferenceWrapper(abc.ABC):
recv_size, dtype=self.pipeline_communication_dtype, device=torch.cuda.current_device()
)
def forward_pass_without_pipeline_parallel(
self, inference_input: Dict[str, Any]
) -> torch.Tensor:
"""Utility to carry out simple forward pass for TP or no model parallel models
Runs a very simple forward pass for the model. Used in the case of models without any parallelism or only tensor parallelism.
Args:
inference_input (Dict[str, Any]): A dict containing the inputs for the gpt model [tokens, position ids, attention mask]
Returns:
torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size]
"""
tokens = inference_input["tokens"]
logits = self._forward(inference_input)
logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits)
self.inference_params.sequence_len_offset += tokens.size(1)
return logits
def forward_pass_with_pipeline_parallel_small_input_batch(
self, inference_input: List
self, inference_input: Dict[str, Any], recv_buffer_seq_len: Optional[int] = None
) -> torch.Tensor:
"""Utility to carry out forward pass for PP models with very small inputs
If a model is pipeline parallel yet the input global batch is very small, we compute a forward pass on the entire global batch rather than splitting it up into micro batches and doing something more complex as in the forward_pass_with_pipeline_parallel_large_input_batch method
Args:
inference_input (List): A list containing the inputs for the gpt model [tokens, position ids, attention mask]
inference_input (Dict[str, Any]): A dict containing the inputs for the gpt model [tokens, position ids, attention mask]
recv_buffer_seq_len (int): An optional sequence length for the pipeline parallel recv buffer.
Returns:
torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size]
"""
tokens, position_ids, attention_mask = inference_input
batch_size, seq_len = tokens.shape
tokens = inference_input["tokens"]
position_ids = inference_input["position_ids"]
attention_mask = inference_input["attention_mask"]
batch_size, seq_len = self._get_batch_size_and_seq_len(tokens, recv_buffer_seq_len)
recv_buffer = None
if not parallel_state.is_pipeline_first_stage():
recv_buffer = self._allocate_recv_buffer(batch_size, seq_len)
recv_from_prev_pipeline_rank_(recv_buffer)
self.model.set_input_tensor(recv_buffer)
output_tensor = self.model(
tokens, position_ids, attention_mask, inference_params=self.inference_params
)
output_tensor = self._forward(inference_input)
if not parallel_state.is_pipeline_last_stage():
send_to_next_pipeline_rank(output_tensor.type(dtype=self.pipeline_communication_dtype))
......@@ -138,27 +190,35 @@ class AbstractModelInferenceWrapper(abc.ABC):
logits = output_tensor
logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits)
# Explicitly cast logits to expected dtype
logits = logits.to(self.inference_wrapper_config.params_dtype)
return logits
def forward_pass_with_pipeline_parallel_large_input_batch(
self, inference_input: List
self, inference_input: Dict[str, Any], recv_buffer_seq_len=None
) -> torch.Tensor:
"""Utility to carry out forward pass PP models.
Runs the forward pass for models which are pipeline parallel. This is more complex than forward_pass_with_pipeline_parallel_small_input_batch coz this splits the global batch into small micro batches and runs them through the model.
Runs the forward pass for models which are pipeline parallel.
This is more complex than forward_pass_with_pipeline_parallel_small_input_batch because
this splits the global batch into small micro batches and runs them through the model.
Args:
inference_input (List): A list containg the inputs for the gpt model [tokens, position ids, attention mask]
inference_input (Dict[str, Any]): A dict containing the inputs for the gpt model [tokens, position ids, attention mask]
recv_buffer_seq_len (int): An optional sequence length for the pipeline parallel recv buffer.
Returns:
torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size]
"""
tokens, position_ids, attention_mask = inference_input
tokens = inference_input["tokens"]
position_ids = inference_input["position_ids"]
attention_mask = inference_input["attention_mask"]
micro_batch_size = max(
1,
self.inference_wrapper_config.inference_batch_times_seqlen_threshold // tokens.size(1),
)
batch_size, seq_len = tokens.shape
batch_size, seq_len = self._get_batch_size_and_seq_len(tokens, recv_buffer_seq_len)
# Round up to account for the last partial micro batch if present
num_micro_batches = math.ceil(batch_size / micro_batch_size)
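# --- Illustrative sketch (not part of this commit) -----------------------------
# The micro-batch split above in plain numbers, with an assumed threshold of
# inference_batch_times_seqlen_threshold=2560, seq_len=128 and batch_size=50.
import math

inference_batch_times_seqlen_threshold = 2560  # assumed value for illustration
seq_len, batch_size = 128, 50

micro_batch_size = max(1, inference_batch_times_seqlen_threshold // seq_len)  # -> 20
num_micro_batches = math.ceil(batch_size / micro_batch_size)  # -> 3, last one partial
print(micro_batch_size, num_micro_batches)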
......@@ -167,7 +227,7 @@ class AbstractModelInferenceWrapper(abc.ABC):
if parallel_state.is_pipeline_last_stage():
logits = torch.empty(
(batch_size, seq_len, self.inference_wrapper_config.padded_vocab_size),
dtype=torch.float32,
dtype=self.pipeline_communication_dtype,
device=torch.cuda.current_device(),
)
......@@ -189,8 +249,12 @@ class AbstractModelInferenceWrapper(abc.ABC):
recv_from_prev_pipeline_rank_(recv_buffer)
self.model.set_input_tensor(recv_buffer)
output_tensor = self.model(
tokens2use, position_ids2use, attention_mask, inference_params=self.inference_params
output_tensor = self._forward(
{
"tokens": tokens2use,
"position_ids": position_ids2use,
"attention_mask": attention_mask,
}
)
if not parallel_state.is_pipeline_last_stage():
......@@ -202,8 +266,12 @@ class AbstractModelInferenceWrapper(abc.ABC):
output_tensor = tensor_parallel.gather_from_tensor_model_parallel_region(
output_tensor
)
assert logits is not None
logits[start:end, ...] = output_tensor
# Explicitly cast logits to expected dtype
logits = logits.to(self.inference_wrapper_config.params_dtype)
# Once done with all micro batches, we reset batch size offset and seq len offset
self.inference_params.sequence_len_offset += seq_len
self.inference_params.batch_size_offset = 0
......@@ -211,28 +279,37 @@ class AbstractModelInferenceWrapper(abc.ABC):
# NOTE: Only returns the logits on the last pipeline stage
return logits
def run_one_forward_step(self, inference_input: List) -> torch.Tensor:
def run_one_forward_step(
self, inference_input: Dict[str, Any], recv_buffer_seq_len: Optional[int] = None
) -> torch.Tensor:
"""The forward pass of the model for inference
Appropriate utility is called for the forward pass depending on the type of model parallelism used
Args:
inference_input (List): A list containg the inputs for the gpt model [tokens, position ids, attention mask]
inference_input (Dict[str, Any]): A dict containing the inputs for the gpt model [tokens, position ids, attention mask]
recv_buffer_seq_len (int): An optional sequence length for the pipeline parallel recv buffer.
Returns:
torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size]. The logits are returned only in the last pipeline stage for PP models.
"""
if self.model_is_pipeline_parallel:
tokens = inference_input[0]
current_batch_size, seq_len = tokens.shape
tokens = inference_input["tokens"]
current_batch_size, seq_len = self._get_batch_size_and_seq_len(
tokens, recv_buffer_seq_len
)
# If input batch is large, we need to split into micro batches and run the forward pass
if (
current_batch_size * seq_len
> self.inference_wrapper_config.inference_batch_times_seqlen_threshold
):
return self.forward_pass_with_pipeline_parallel_large_input_batch(inference_input)
return self.forward_pass_with_pipeline_parallel_large_input_batch(
inference_input, recv_buffer_seq_len
)
else:
# If input batch is very small we can do a simple forward pass on the entire global batch
return self.forward_pass_with_pipeline_parallel_small_input_batch(inference_input)
return self.forward_pass_with_pipeline_parallel_small_input_batch(
inference_input, recv_buffer_seq_len
)
else:
return self.forward_pass_without_pipeline_parallel(inference_input)
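# --- Illustrative sketch (not part of this commit) -----------------------------
# The pipeline-parallel dispatch rule above in isolation: the "large input batch"
# path is taken only when batch_size * seq_len exceeds the configured threshold.
# The threshold value here is assumed for illustration.
inference_batch_times_seqlen_threshold = 2560
for batch_size, seq_len in [(4, 128), (64, 128)]:
    use_large_batch_path = batch_size * seq_len > inference_batch_times_seqlen_threshold
    print(batch_size, seq_len, use_large_batch_path)  # False for (4, 128), True for (64, 128)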
File mode changed from 100755 to 100644
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from typing import List, Tuple
from typing import Any, Dict, Tuple
import torch
......@@ -27,19 +27,21 @@ class GPTInferenceWrapper(AbstractModelInferenceWrapper):
"""
super().__init__(model, inference_wrapper_config)
def prep_model_for_inference(self, prompts_tokens: torch.Tensor):
"""A utility function for preparing model for inference
This function is called before the forward pass. It puts the model in eval mode, builds position ids, and creates attention masks so that required slices can be extracted during the forward pass.
def prep_inference_input(self, prompts_tokens: torch.Tensor) -> Dict[str, Any]:
"""Prepares the inference input data.
Args:
prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_seq_len]
"""
super().prep_model_for_inference(prompts_tokens=prompts_tokens)
self.attention_mask, self.position_ids = self._build_attention_mask_and_position_ids(
prompts_tokens
)
Returns:
A dict with all the inference input needed for the batch.
"""
attention_mask, position_ids = self._build_attention_mask_and_position_ids(prompts_tokens)
return {
"tokens": prompts_tokens,
"attention_mask": attention_mask,
"position_ids": position_ids,
}
def _build_attention_mask_and_position_ids(
self, prompts_tokens: torch.Tensor
......@@ -68,23 +70,33 @@ class GPTInferenceWrapper(AbstractModelInferenceWrapper):
return attention_mask, position_ids
def get_batch_for_context_window(
self, context_start_position: int, context_end_position: int
) -> List:
self,
inference_input: Dict[str, Any],
context_start_position: int,
context_end_position: int,
) -> Dict[str, Any]:
"""Returns the inference data given context window
This function gets called iteratively in a loop. Given the start and end context positions, it extracts the appropriate data.
Args:
inference_input (Dict[str, Any]): The inference input for the batch.
context_start_position (int): Start of the context window. During the first inference step it is mostly 0
context_end_position (int): End of the context window. During the last inference step it will mostly be the max generated sequence length.
Returns:
List: A list of inputs that will be used by your model in the forward step
Dict[str, Any]: A dict of inputs that will be used by your model in the forward step
"""
tokens2use = self.prompts_tokens[:, context_start_position:context_end_position]
positions2use = self.position_ids[:, context_start_position:context_end_position]
attention_mask2use = self.attention_mask[
tokens = inference_input["tokens"]
position_ids = inference_input["position_ids"]
attention_mask = inference_input["attention_mask"]
tokens2use = tokens[:, context_start_position:context_end_position]
positions2use = position_ids[:, context_start_position:context_end_position]
attention_mask2use = attention_mask[
..., context_start_position:context_end_position, :context_end_position
]
data_at_step_idx = [tokens2use, positions2use, attention_mask2use]
return data_at_step_idx
return {
"tokens": tokens2use,
"position_ids": positions2use,
"attention_mask": attention_mask2use,
}
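# --- Illustrative sketch (not part of this commit) -----------------------------
# What the slicing in get_batch_for_context_window does for a toy batch: tokens
# and position ids are cut on the sequence dimension, the causal mask on its last
# two dimensions. Shapes only; a plain lower-triangular mask stands in for the
# real mask convention here.
import torch

batch_size, max_seq_len = 2, 8
tokens = torch.randint(0, 100, (batch_size, max_seq_len))
position_ids = torch.arange(max_seq_len).unsqueeze(0).expand(batch_size, -1)
attention_mask = torch.tril(torch.ones(1, 1, max_seq_len, max_seq_len)).bool()

start, end = 0, 5  # the first step consumes the whole prompt prefix
tokens2use = tokens[:, start:end]                          # shape [2, 5]
positions2use = position_ids[:, start:end]                 # shape [2, 5]
attention_mask2use = attention_mask[..., start:end, :end]  # shape [1, 1, 5, 5]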
......@@ -25,6 +25,12 @@ class InferenceWrapperConfig:
"""The final padded vocab size (Padded to make it divisible by
--make-vocab-size-divisible-by value)"""
inference_max_requests: int = 8
""" Maximum number of requests for inference (prefill & decode). Necessary for CUDA graphs. """
inference_max_seq_length: int = 2560
""" Maximum sequence length for inference (prefill & decode). Necessary for CUDA graphs. """
fp32_residual_connection: bool = False
"""Move residual connections to fp32. Obtained from arguments.py"""
......
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from typing import Any, Dict
import torch
from megatron.core import parallel_state
from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
GPTInferenceWrapper,
)
from megatron.core.inference_params import InferenceParams
# pylint: disable=line-too-long
class VLMInferenceWrapper(GPTInferenceWrapper):
"""Inference wrapper for VLMs"""
def prep_model_for_inference(self, prompts_tokens: torch.Tensor):
"""A utility function for preparing model for inference
The function gets called once before the auto regressive inference loop.
It puts the model in eval mode.
Args:
prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_seq_len]
"""
super().prep_model_for_inference(prompts_tokens)
# For TP only model both is_pp_first_stage and _is_pp_last_stage returns True
self.model_is_pipeline_parallel = not (
parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage()
)
self._recv_only_vision_embeds = False
pp_rank = parallel_state.get_pipeline_model_parallel_rank()
# Checks if the previous stage only has a vision encoder, and that the current stage
# has part of the LM decoder. In this case, the current stage should only receive
# vision embeddings.
if pp_rank > 0:
self._recv_only_vision_embeds = (
parallel_state.is_inside_encoder(pp_rank - 1)
and (not parallel_state.is_inside_decoder(pp_rank - 1))
and parallel_state.is_inside_decoder()
)
# Checks if the current stage only has a vision encoder
self._encoder_only = (
parallel_state.is_inside_encoder() and not parallel_state.is_inside_decoder()
)
# For TP only model both is_pp_first_stage and _is_pp_last_stage returns True
self.model_is_pipeline_parallel = not (
parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage()
)
def prep_inference_input(
self,
prompts_tokens: torch.Tensor,
num_img_embeddings_per_tile: int,
images: torch.Tensor,
num_tiles: torch.Tensor,
decoder_seq_length: int,
):
"""Prepares the inference input data.
Args:
prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_seq_len]
num_img_embeddings_per_tile (int): The number of image embeddings per tile
images (torch.Tensor): The image embeddings
num_tiles (torch.Tensor): The number of tiles for each input image
decoder_seq_length (int): The decoder sequence length
"""
inference_input = super().prep_inference_input(prompts_tokens)
total_num_tiles = torch.sum(num_tiles).item()
num_img_embeddings = num_img_embeddings_per_tile * total_num_tiles
batch_size, max_sequence_length = prompts_tokens.shape
self.inference_params = InferenceParams(
batch_size, max_sequence_length + num_img_embeddings
)
inference_input["images"] = images
inference_input["num_tiles"] = num_tiles
inference_input["num_img_embeddings"] = num_img_embeddings
inference_input["decoder_seq_length"] = decoder_seq_length
return inference_input
def get_batch_for_context_window(
self,
inference_input: Dict[str, Any],
context_start_position: int,
context_end_position: int,
) -> Dict[str, Any]:
"""Returns the inference data given context window
This function gets called iteratively in a loop. Given the start and end context positions, it extracts the appropriate data.
Args:
inference_input (Dict[str, Any]): The inference input for the batch.
context_start_position (int): Start of the context window. During the first inference step it is mostly 0
context_end_position (int): End of the context window. During the last inference step it will mostly be the max generated sequence length.
Returns:
Dict[str, Any]: A dict of inputs that will be used by your model in the forward step
"""
tokens = inference_input["tokens"]
position_ids = inference_input["position_ids"]
images = inference_input["images"]
num_tiles = inference_input["num_tiles"]
num_img_embeddings = inference_input["num_img_embeddings"]
decoder_seq_length = inference_input["decoder_seq_length"]
tokens2use = tokens[:, context_start_position:context_end_position]
positions2use = position_ids[:, context_start_position:context_end_position]
return {
"tokens": tokens2use,
"position_ids": positions2use,
"images": images,
"num_tiles": num_tiles,
"num_img_embeddings": num_img_embeddings,
"decoder_seq_length": decoder_seq_length,
}
def _forward(self, inference_input: Dict[str, Any]):
"""Runs a forward pass of the model.
Args:
inference_input(Dict[str, Any]): The input data.
Returns:
The model output logits.
"""
images = inference_input["images"]
tokens = inference_input["tokens"]
position_ids = inference_input["position_ids"]
num_image_tiles = inference_input["num_tiles"]
output = self.model(
images,
tokens,
position_ids=position_ids,
attention_mask=None,
inference_params=self.inference_params,
num_image_tiles=num_image_tiles,
runtime_gather_output=True,
)
if isinstance(output, tuple):
logits, _ = output
else:
logits = output
return logits
def run_one_forward_step(self, inference_input: Dict[str, Any]) -> torch.Tensor:
tokens = inference_input["tokens"]
num_image_tokens = (tokens == self.model.module.image_token_index).sum().item()
num_img_embeddings = inference_input["num_img_embeddings"]
decoder_seq_length = inference_input["decoder_seq_length"]
num_tokens = tokens.size(1)
recv_buffer_seq_len = None
if num_image_tokens > 0:
# When there are image tokens and this stage only receives vision embeddings,
# adjust the recv buffer seq length to match the image embeddings sequence length.
# If there are image tokens and this stage receives full embeddings, make sure we
# compensate for expansion of image tokens.
# Note that this will set a recv_buffer_seq_len for the encoder stage,
# this length is irrelevant since that recv buffer is never allocated.
if self._recv_only_vision_embeds:
recv_buffer_seq_len = num_img_embeddings
else:
recv_buffer_seq_len = min(
num_img_embeddings + num_tokens - num_image_tokens, decoder_seq_length
)
elif self._recv_only_vision_embeds:
# If this stage only receives vision embeddings and there are no image tokens
# we won't run the encoder and therefore shouldn't try to recv.
recv_buffer_seq_len = 0
# If the pipeline stage only has a vision encoder, then it only needs to
# run when there are image tokens
if not (self._encoder_only and num_image_tokens == 0):
output = super().run_one_forward_step(
inference_input, recv_buffer_seq_len=recv_buffer_seq_len
)
else:
output = None
logits = output
# On the first inference iteration, we compute image tokens.
# On every PP stage(although inference params should only matter for decoder),
# update the sequence length offset by the number of image tokens.
if num_tokens > 1 and num_image_tokens > 0:
if "image_tokens_count" not in self.inference_params.key_value_memory_dict:
self.inference_params.key_value_memory_dict["image_tokens_count"] = (
num_img_embeddings
)
if num_img_embeddings + num_tokens - num_image_tokens > decoder_seq_length:
self.inference_params.sequence_len_offset += decoder_seq_length - num_tokens
else:
self.inference_params.sequence_len_offset += (
self.inference_params.key_value_memory_dict["image_tokens_count"]
- num_image_tokens
)
return logits
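# --- Illustrative sketch (not part of this commit) -----------------------------
# The recv-buffer sizing logic above, pulled out as a plain (hypothetical) helper
# so the branches are easier to read; it mirrors run_one_forward_step but is not
# part of the wrapper itself.
from typing import Optional


def _vlm_recv_buffer_seq_len(
    num_image_tokens: int,
    num_img_embeddings: int,
    num_tokens: int,
    decoder_seq_length: int,
    recv_only_vision_embeds: bool,
) -> Optional[int]:
    if num_image_tokens > 0:
        if recv_only_vision_embeds:
            # This stage only receives vision embeddings.
            return num_img_embeddings
        # Full embeddings: compensate for the expansion of image tokens.
        return min(num_img_embeddings + num_tokens - num_image_tokens, decoder_seq_length)
    if recv_only_vision_embeds:
        # No image tokens, so the encoder is skipped and nothing is received.
        return 0
    return None  # default: buffer sized from the token sequence length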
File mode changed from 100755 to 100644
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from collections import deque
from typing import Any, List, Tuple
from typing import Any, Dict, List, Optional
import numpy
import torch
......@@ -14,6 +14,7 @@ from megatron.core.inference.model_inference_wrappers.inference_wrapper_config i
InferenceWrapperConfig,
)
from megatron.core.models.T5 import T5Model
from megatron.core.utils import get_attr_wrapped_model
# pylint: disable=line-too-long
......@@ -39,54 +40,57 @@ class T5InferenceWrapper(AbstractModelInferenceWrapper):
super().__init__(model, inference_wrapper_config)
self.use_local = use_local
def prep_model_for_inference(
self, prompts_tokens: torch.Tensor, encoder_prompts: List[str] = None, tokenizer: Any = None
):
"""A utility function for preparing model for inference
This function is called before the forward pass. It puts the model in eval mode, builds
position ids, and creates attention masks so that required slices can be extracted during
the forward pass.
def prep_inference_input(
self,
prompts_tokens: torch.Tensor,
encoder_prompts: Optional[List[str]] = None,
tokenizer: Any = None,
) -> Dict[str, Any]:
"""Prepares the inference input data.
Args:
prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length]
prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_seq_len]
encoder_prompts (List[str]): A list of encoder input prompt strings
tokenizer (Any): Tokenizer used for tokenizing and detokenizing text
"""
super().prep_model_for_inference(prompts_tokens=prompts_tokens)
Returns:
A dict with all the inference input needed for the batch.
"""
# get max_sequence_length
if hasattr(self.model, "module"): # if self.model is Float16Module
max_sequence_length = self.model.module.max_sequence_length
else:
max_sequence_length = self.model.max_sequence_length
max_sequence_length = get_attr_wrapped_model(self.model, "max_sequence_length")
encoder_prompts_tokens_list = [
self.tokenize_encoder_prompt(encoder_prompt, tokenizer)
for encoder_prompt in encoder_prompts
]
self.batch_encoder_prompts_tokens = self.pad_encoder_prompts_tokens(
batch_encoder_prompts_tokens = self.pad_encoder_prompts_tokens(
encoder_prompts_tokens_list, max_sequence_length, tokenizer
)
# create batch mask for encoder_prompt (self.batch_input_tokens) and
# decoder_input (self.prompts_tokens), similar to megatron/core/datasets/t5_dataset.py
decoder_prompts_tokens = self.prompts_tokens.cpu().numpy()
encoder_prompts_tokens = self.batch_encoder_prompts_tokens.cpu().numpy()
self.batch_mask_encoder = []
self.batch_mask_decoder = []
for i in range(len(self.prompts_tokens)):
mask_encoder = encoder_prompts_tokens[i] == tokenizer.pad
mask_decoder = decoder_prompts_tokens[i] == tokenizer.pad
self.batch_mask_encoder.append(mask_encoder)
self.batch_mask_decoder.append(mask_decoder)
self.batch_mask_encoder = torch.tensor(numpy.array(self.batch_mask_encoder)).cuda()
self.batch_mask_decoder = torch.tensor(numpy.array(self.batch_mask_decoder)).cuda()
def tokenize_encoder_prompt(
self, encoder_prompt: str, tokenizer
) -> Tuple[torch.Tensor, torch.Tensor]:
# decoder_input (prompts_tokens), similar to megatron/core/datasets/t5_dataset.py
decoder_prompts_tokens = prompts_tokens
encoder_prompts_tokens = batch_encoder_prompts_tokens
decoder_prompts_tokens_numpy = decoder_prompts_tokens.cpu().numpy()
encoder_prompts_tokens_numpy = encoder_prompts_tokens.cpu().numpy()
batch_mask_encoder = []
batch_mask_decoder = []
for i in range(len(prompts_tokens)):
mask_encoder = encoder_prompts_tokens_numpy[i] == tokenizer.pad
mask_decoder = decoder_prompts_tokens_numpy[i] == tokenizer.pad
batch_mask_encoder.append(mask_encoder)
batch_mask_decoder.append(mask_decoder)
batch_mask_encoder = torch.tensor(numpy.array(batch_mask_encoder)).cuda()
batch_mask_decoder = torch.tensor(numpy.array(batch_mask_decoder)).cuda()
return {
"encoder_tokens": encoder_prompts_tokens,
"decoder_tokens": decoder_prompts_tokens,
"encoder_mask": batch_mask_encoder,
"decoder_mask": batch_mask_decoder,
}
def tokenize_encoder_prompt(self, encoder_prompt: str, tokenizer) -> torch.Tensor:
"""Utility to tokenize the encoder_prompt
Args:
......@@ -138,28 +142,32 @@ class T5InferenceWrapper(AbstractModelInferenceWrapper):
return torch.tensor(encoder_prompts_tokens_list).cuda()
def get_batch_for_context_window(
self, context_start_position: int, context_end_position: int
) -> List:
self,
inference_input: Dict[str, Any],
context_start_position: int,
context_end_position: int,
) -> Dict[str, Any]:
"""Returns the inference data given context window
This function gets called iteratively in a loop. Given the start and end context
positions, it extracts the appropriate data.
Args:
inference_input (Dict[str, Any]): The inference input for the batch.
context_start_position (int): Start of the context window. During
the first inference step it is mostly 0
context_end_position (int): End of the context window. During the
last inference step it will mostly be the max generated sequence length.
Returns:
List: A list of inputs that will be used by your model in the forward step
Dict: A dict of inputs that will be used by your model in the forward step
"""
# T5 inference does not yet support kv_cache
encoder_tokens2use = self.batch_encoder_prompts_tokens
decoder_tokens2use = self.prompts_tokens[:, :context_end_position]
encoder_mask2use = self.batch_mask_encoder
decoder_mask2use = self.batch_mask_decoder[:, :context_end_position]
encoder_tokens2use = inference_input["encoder_tokens"]
decoder_tokens2use = inference_input["decoder_tokens"][:, :context_end_position]
encoder_mask2use = inference_input["encoder_mask"]
decoder_mask2use = inference_input["decoder_mask"][:, :context_end_position]
# Configure attention mask based on different conditions
# (e.g., transformer-impl, TE versions, TE backends)
......@@ -173,32 +181,34 @@ class T5InferenceWrapper(AbstractModelInferenceWrapper):
)
)
data_at_step_idx = [
encoder_tokens2use,
decoder_tokens2use,
encoder_mask2use,
decoder_mask2use,
encoder_decoder_mask2use,
]
return data_at_step_idx
return {
"encoder_tokens": encoder_tokens2use,
"decoder_tokens": decoder_tokens2use,
"encoder_mask": encoder_mask2use,
"decoder_mask": decoder_mask2use,
"encoder_decoder_mask": encoder_decoder_mask2use,
}
def forward_pass_without_pipeline_parallel(self, inference_input: List) -> torch.Tensor:
def forward_pass_without_pipeline_parallel(
self, inference_input: Dict[str, Any]
) -> torch.Tensor:
"""Utility to carry out simple forward pass for TP or no model parallel models
Runs a very simple forward pass for the model. Used in the case of models without
any parallelism or only tensor parallelism.
Args:
inference_input (List): A list containing the inputs for the gpt
inference_input (Dict[str, Any]): A dict containing the inputs for the gpt
model [tokens, position ids, attention mask]
Returns:
torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size]
"""
[encoder_tokens, decoder_tokens, encoder_mask, decoder_mask, encoder_decoder_mask] = (
inference_input
)
encoder_tokens = inference_input["encoder_tokens"]
decoder_tokens = inference_input["decoder_tokens"]
encoder_mask = inference_input["encoder_mask"]
decoder_mask = inference_input["decoder_mask"]
encoder_decoder_mask = inference_input["encoder_decoder_mask"]
tokens = decoder_tokens
# T5 inference does not yet support kv_cache
......
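# --- Illustrative sketch (not part of this commit) -----------------------------
# The pad-mask construction from prep_inference_input above, on small CPU tensors
# for clarity (the real code round-trips through numpy and moves the masks to
# CUDA). The pad id of 0 is assumed for illustration.
import torch

pad = 0  # assumed pad token id
encoder_tokens = torch.tensor([[5, 7, 9, 0, 0], [3, 0, 0, 0, 0]])
decoder_tokens = torch.tensor([[1, 4, 0, 0, 0], [1, 2, 6, 0, 0]])

# True marks padded positions, matching `tokens == tokenizer.pad` in the wrapper.
encoder_mask = encoder_tokens == pad
decoder_mask = decoder_tokens == pad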
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Integrations with NVIDIA TensorRT Model Optimizer (referred as ModelOpt).
ModelOpt is a library comprising state-of-the-art model optimization techniques including quantization and sparsity to
compress model for efficient inference on NVIDIA GPUs. ModelOpt is integrated with Megatron-core to provide a seamless
experience for users to optimize their Megatron-core models for inference. More details on ModelOpt including
installation and usage can be found at https://github.com/NVIDIA/TensorRT-Model-Optimizer.
ModelOpt is a library comprising state-of-the-art model optimization techniques
including quantization and sparsity to compress model for efficient inference on
NVIDIA GPUs. ModelOpt is integrated with Megatron-core to provide a seamless
experience for users to optimize their Megatron-core models for inference.
More details on ModelOpt including installation and usage can be found at
https://github.com/NVIDIA/TensorRT-Model-Optimizer.
"""
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import warnings
warnings.warn(
"`megatron.core.inference.modelopt_support` will be deprecated in a"
"future release. Use `megatron.core.post_training.modelopt` instead."
)
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from typing import Optional
from megatron.core.extensions.transformer_engine import TEDotProductAttention, TENorm
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.models.gpt.gpt_layer_specs import _get_mlp_module_spec
from megatron.core.models.gpt.gpt_layer_specs import get_mlp_module_spec
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.spec_utils import ModuleSpec
......@@ -13,7 +16,8 @@ from megatron.core.transformer.transformer_layer import TransformerLayer, Transf
# Use this spec for ModelOpt PTQ and TensorRT-LLM export
def get_gpt_layer_modelopt_spec(
num_experts: int = None,
num_experts: Optional[int] = None,
local_core_attention: bool = False,
moe_grouped_gemm: bool = False,
remap_te_layernorm: bool = False,
qk_layernorm: bool = False,
......@@ -24,7 +28,8 @@ def get_gpt_layer_modelopt_spec(
is using TENorm from Transformer-Engine. The issue is that FusedLayerNorm from apex
has stopped supporting RMSNorm needed by llama.
"""
mlp = _get_mlp_module_spec(
core_attention = DotProductAttention if local_core_attention else TEDotProductAttention
mlp = get_mlp_module_spec(
use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm, fp8=False
)
sharded_state_dict_keys_map = {}
......@@ -47,7 +52,7 @@ def get_gpt_layer_modelopt_spec(
params={"attn_mask_type": AttnMaskType.causal},
submodules=SelfAttentionSubmodules(
linear_qkv=ColumnParallelLinear,
core_attention=TEDotProductAttention,
core_attention=core_attention,
linear_proj=RowParallelLinear,
q_layernorm=TENorm if qk_layernorm else IdentityOp,
k_layernorm=TENorm if qk_layernorm else IdentityOp,
......
File mode changed from 100755 to 100644
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
from megatron.core.extensions.transformer_engine import TEDotProductAttention, TENorm
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.ssm.mamba_block import MambaStack, MambaStackSubmodules
from megatron.core.ssm.mamba_layer import MambaLayer, MambaLayerSubmodules
from megatron.core.ssm.mamba_mixer import MambaMixer, MambaMixerSubmodules
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
# Use this spec for ModelOpt PTQ and TensorRT-LLM export
def get_mamba_stack_modelopt_spec(
local_core_attention: bool = False, remap_te_layernorm: bool = False
) -> ModuleSpec:
"""Mix the native spec with TENorm.
This is essentially the native local spec, except that the layernorm implementation
uses TENorm from Transformer-Engine.
"""
mamba_state_dict_keys_map = {}
transformer_state_dict_keys_map = {}
if remap_te_layernorm:
mamba_state_dict_keys_map = {'norm.': 'mixer.in_proj.layer_norm_'}
transformer_state_dict_keys_map = {
'input_layernorm.': 'self_attention.linear_qkv.layer_norm_',
'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_',
}
mamba_layer = ModuleSpec(
module=MambaLayer,
submodules=MambaLayerSubmodules(
norm=TENorm,
mixer=ModuleSpec(
module=MambaMixer,
submodules=MambaMixerSubmodules(
in_proj=ColumnParallelLinear, out_proj=RowParallelLinear
),
),
mamba_bda=get_bias_dropout_add,
sharded_state_dict_keys_map=mamba_state_dict_keys_map,
),
)
core_attention = DotProductAttention if local_core_attention else TEDotProductAttention
attention_layer = ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
input_layernorm=TENorm,
self_attention=ModuleSpec(
module=SelfAttention,
params={"attn_mask_type": AttnMaskType.causal},
submodules=SelfAttentionSubmodules(
linear_qkv=ColumnParallelLinear,
core_attention=core_attention,
linear_proj=RowParallelLinear,
),
),
self_attn_bda=get_bias_dropout_add,
sharded_state_dict_keys_map=transformer_state_dict_keys_map,
),
)
mlp_layer = ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
pre_mlp_layernorm=TENorm,
mlp=ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear
),
),
mlp_bda=get_bias_dropout_add,
sharded_state_dict_keys_map=transformer_state_dict_keys_map,
),
)
return ModuleSpec(
module=MambaStack,
submodules=MambaStackSubmodules(
mamba_layer=mamba_layer, attention_layer=attention_layer, mlp_layer=mlp_layer
),
)
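# --- Illustrative usage sketch (not part of this commit) -----------------------
# Selecting the local DotProductAttention backend and remapping TE layernorm
# state-dict keys via the new flags; whether these flags suit a given checkpoint
# depends on how it was trained and exported.
spec = get_mamba_stack_modelopt_spec(
    local_core_attention=True,  # megatron-core DotProductAttention instead of TEDotProductAttention
    remap_te_layernorm=True,    # remap sharded state dict keys for fused TE layernorms
)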
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from dataclasses import dataclass
@dataclass
class SamplingParams:
"""Inference parameters sent along with the prompts.
This class contains request-level attributes that control the sampling techniques used when
generating text. This is distinct from megatron.core.InferenceParams, which sets model-level
inference attributes such as the maximum sequence length, and contains the KV cache.
For an explanation of these parameters refer to this blog
https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-
temperature-parameters-ed6a31313910
"""
temperature: float = 1.0
top_k: int = 0
top_p: float = 0.0
return_log_probs: bool = False
return_segments: bool = False # Whether to return individually detokenized tokens
num_tokens_to_generate: int = 30
def add_attributes(self, attribute_value_pair: dict):
"""Utility to add more attributes to sampling params
Use this method to pass in a custom dictionary to add more sampling parameter attributes.
c = SamplingParams()
c.add_attributes({'min_length':4, 'eod_id':153})
Args:
attribute_value_pair (dict): A dictionary containing attributes as the key names and
their values as the values.
"""
for key, value in attribute_value_pair.items():
setattr(self, key, value)
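# --- Illustrative usage sketch (not part of this commit) -----------------------
# Combining the dataclass fields with add_attributes, per the docstring above.
# `min_length` and `eod_id` are example keys only; SamplingParams itself does not
# interpret them.
params = SamplingParams(temperature=0.7, top_k=50, return_log_probs=True)
params.add_attributes({'min_length': 4, 'eod_id': 153})
assert params.top_k == 50 and params.eod_id == 153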