Commit 0d99ae1f authored by silencealiang

add

parent c271aaae
Pipeline #2498 canceled with stages
......@@ -3,12 +3,12 @@ from typing import Dict, List
import torch
from megatron.core.inference.common_inference_params import CommonInferenceParams
from megatron.core.inference.engines.abstract_engine import AbstractEngine
from megatron.core.inference.inference_request import InferenceRequest
from megatron.core.inference.sampling_params import SamplingParams
from megatron.core.inference.scheduler import Scheduler
from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import (
SimpleTextGenerationController,
from megatron.core.inference.text_generation_controllers.text_generation_controller import (
TextGenerationController,
)
......@@ -19,7 +19,7 @@ class MCoreEngine(AbstractEngine):
Supports any model that is callable (accepts the inputs and returns the output tensor)
Args:
text_generation_controller (SimpleTextGenerationController): A text generation
text_generation_controller (TextGenerationController): A text generation
controller that will be used to define how to preprocess prompts, generate
outputs and detokenize the output tokens.
max_batch_size : The maximum number of requests to process at once
......@@ -29,7 +29,7 @@ class MCoreEngine(AbstractEngine):
def __init__(
self,
text_generation_controller: SimpleTextGenerationController,
text_generation_controller: TextGenerationController,
max_batch_size,
random_seed: int = None,
):
......@@ -42,7 +42,8 @@ class MCoreEngine(AbstractEngine):
prompts: List[str],
add_BOS: bool = False,
encoder_prompts: List[str] = None,
common_inference_params: CommonInferenceParams = None,
common_inference_params: SamplingParams = None,
sampling_params: SamplingParams = None,
) -> dict:
"""The megatron core inference backend generate function
......@@ -54,13 +55,19 @@ class MCoreEngine(AbstractEngine):
prompts (List[str]): All the prompts as a list of strings
add_BOS (bool): Whether to add BOS token to beginning of prompts
encoder_prompts (List[str]): All the encoder prompts as a list of strings
common_inference_params (CommonInferenceParams): The inference parameters
common_inference_params: Deprecated. Only used for backward compatibility with
MCore <= 0.9.0. Use `sampling_params` going forward.
sampling_params (SamplingParams): The request-level sampling parameters
Returns:
List[InferenceRequest]: The output is a list of inference requests containing the
generated tokens, texts and log probs if required
"""
# TODO :M core- get rng state tracker
if common_inference_params:
sampling_params = common_inference_params
if self.random_seed:
torch.random.manual_seed(self.random_seed)
......@@ -73,7 +80,7 @@ class MCoreEngine(AbstractEngine):
prompt=prompt,
prompt_tokens=prompt_tokens,
encoder_prompt=encoder_prompt,
inference_parameters=common_inference_params,
inference_parameters=sampling_params,
)
self.run_engine()
......
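With this change, callers pass request-level settings through `sampling_params`; `common_inference_params` is kept only as a deprecated alias that the engine copies into `sampling_params` before scheduling. A minimal sketch of the updated call site, assuming an `MCoreEngine` instance named `engine` has already been constructed:

from megatron.core.inference.sampling_params import SamplingParams

# New-style call: request-level sampling settings go through `sampling_params`.
results = engine.generate(
    prompts=["Hello, my name is"],
    sampling_params=SamplingParams(temperature=0.8, top_p=0.9, num_tokens_to_generate=64),
)

# Legacy call sites that still pass `common_inference_params=...` keep working:
# the engine reassigns that value to `sampling_params` before adding requests.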
......@@ -5,7 +5,7 @@ from typing import List
import torch
from megatron.core.inference.common_inference_params import CommonInferenceParams
from megatron.core.inference.sampling_params import SamplingParams
# class syntax
......@@ -28,7 +28,7 @@ class InferenceRequest:
request_id: str
prompt: str
inference_parameters: CommonInferenceParams
inference_parameters: SamplingParams
prompt_tokens: List[int]
arrival_time: float
status: Status
......
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from dataclasses import dataclass
@dataclass
class SamplingParams:
"""Inference parameters sent along with the prompts.
This class contains request-level attributes that control the sampling techniques used when
generating text. This is distinct from megatron.core.InferenceParams, which sets model-level
inference attributes such as the maximum sequence length, and contains the KV cache.
For an explanation of these parameters refer to this blog
https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-
temperature-parameters-ed6a31313910
"""
temperature: float = 1.0
top_k: int = 0
top_p: float = 0.0
return_log_probs: bool = False
num_tokens_to_generate: int = 30
def add_attributes(self, attribute_value_pair: dict):
"""Utility to add more attributes to sampling params
Use this method to pass in a custom dictionary to add more sampling parameter attributes.
c = SamplingParams()
c.add_attributes({'min_length':4, 'eod_id':153})
Args:
attribute_value_pair (dict): A dictionary containing attributes as the key names and
their values as the values.
"""
for key, value in attribute_value_pair.items():
setattr(self, key, value)
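A small usage sketch for the dataclass above; the extra keys passed to `add_attributes` (`min_length`, `eod_id`) mirror the docstring example and are illustrative, not required fields:

from megatron.core.inference.sampling_params import SamplingParams

params = SamplingParams(temperature=0.7, top_k=40, num_tokens_to_generate=128)

# add_attributes attaches extra request-level fields beyond the declared ones.
# Note that it should be called on an instance, not on the class itself.
params.add_attributes({'min_length': 4, 'eod_id': 153})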
......@@ -6,8 +6,8 @@ from typing import Dict
import torch
from megatron.core.inference.common_inference_params import CommonInferenceParams
from megatron.core.inference.inference_request import InferenceRequest, Status
from megatron.core.inference.sampling_params import SamplingParams
from megatron.core.inference.utils import Counter
......@@ -33,7 +33,7 @@ class Scheduler:
prompt: str,
prompt_tokens: torch.Tensor,
encoder_prompt: str = None,
inference_parameters: CommonInferenceParams = None,
inference_parameters: SamplingParams = None,
arrival_time: float = None,
):
"""Add an incoming request
......@@ -45,7 +45,7 @@ class Scheduler:
prompt (str): Input prompt string
prompt_tokens (torch.Tensor): A torch tensor having the input prompts tokenized
encoder_prompt (str): Encoder input string
inference_parameters (CommonInferenceParams): The inference parameters
inference_parameters (SamplingParams): The inference parameters
arrival_time (float, optional): The incoming request time. Defaults to None.
"""
request_id = str(next(self.request_counter))
......
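For reference, a hedged sketch of how a request would now be queued with the renamed parameter class; `scheduler` and `prompt_tokens` are assumed to exist and are illustrative only:

from megatron.core.inference.sampling_params import SamplingParams

scheduler.add_request(
    prompt="Translate to French: cheese",
    prompt_tokens=prompt_tokens,  # tokenized prompt tensor produced elsewhere
    inference_parameters=SamplingParams(top_k=1, num_tokens_to_generate=32),
)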
File mode changed from 100755 to 100644
......@@ -4,15 +4,15 @@ from typing import OrderedDict
import torch
from megatron.core.inference.inference_request import InferenceRequest
from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import (
SimpleTextGenerationController,
from megatron.core.inference.text_generation_controllers.text_generation_controller import (
TextGenerationController,
)
class EncoderDecoderTextGenerationController(SimpleTextGenerationController):
class EncoderDecoderTextGenerationController(TextGenerationController):
"""The text generation controller for encoder-decoder architecture
This class ingherits from SimpleTextGenerationController, adding features
This class inherits from TextGenerationController, adding features
relating to the encoder input encoder_prompt
"""
......
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from typing import List, OrderedDict, Tuple
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import torch
import torch.nn.functional as F
from megatron.core import parallel_state
from megatron.core.inference.common_inference_params import CommonInferenceParams
from megatron.core.inference.communication_utils import broadcast_from_last_pipeline_stage
from megatron.core.inference.inference_request import InferenceRequest, Status
from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import (
AbstractModelInferenceWrapper,
from megatron.core.inference.text_generation_controllers.text_generation_controller import ( # noqa: F401 # pylint: disable=unused-import
TextGenerationController as SimpleTextGenerationController,
)
class SimpleTextGenerationController:
"""The basic text generation controller
This class is responsible for tokenizing the input, running the inference, sampling,
and also detokenizing the output
Args:
inference_wrapped_model (AbstractModelInferenceWrapper): A model that
is wrapped using the specs given in the abstract_model_inference_wrapper.py
tokenizer (_type_): Tokenizer used for tokenizing and detokenizing the prompts
"""
def __init__(self, inference_wrapped_model: AbstractModelInferenceWrapper, tokenizer):
self.inference_wrapped_model = inference_wrapped_model
self.tokenizer = tokenizer
# For models without pipeline parallelism, is_first_stage and is_last_stage return True
self.model_is_pipeline_parallel = not (
parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage()
)
def tokenize_prompt(
self, prompt: str, add_BOS: bool = False
) -> Tuple[torch.Tensor, torch.Tensor]:
"""Utility to tokenize the input prompts
Args:
prompt (str): The input prompt
Returns:
torch.Tensor: Returns the tokenized prompt
"""
prompt_tokens = self.tokenizer.tokenize(prompt)
if add_BOS:
prompt_tokens = [self.tokenizer.bos] + prompt_tokens
return prompt_tokens
def detokenize_generations(self, prompt_tokens_with_generated_tokens: torch.Tensor) -> str:
"""Detokenize the output generations
Args:
prompt_tokens_with_generated_tokens (torch.Tensor): The input prompt
tokens plus the generated tokens
Returns:
str: The detokenized output
"""
tokens = prompt_tokens_with_generated_tokens.cpu().numpy().tolist()
return self.tokenizer.detokenize(tokens)
def sample_from_logits(
self,
last_token_logits: torch.Tensor,
common_inference_params: CommonInferenceParams,
vocab_size: int = None,
) -> torch.Tensor:
"""Samples the logits to generate outputs
Given the logits of the last token, this function samples it
according to the parameters defined in common_inference_params
and returns the samples
Args:
last_token_logits (torch.Tensor): The last token logits. A tensor of
size [batch_size, vocab_size]
common_inference_params (CommonInferenceParams): The parameters to use
for inference
vocab_size (int): Obtained from the tokenizer. Defaults to None
Returns:
torch.Tensor: 1D tensor of the sampled logits with [batch_size] elements
"""
top_p = common_inference_params.top_p
top_k = common_inference_params.top_k
temperature = common_inference_params.temperature
assert not (top_k > 0 and top_p > 0), 'Cannot have top-p and top-k both greater than zero'
assert top_p <= 1.0, 'top-p should be in (0,1]'
def modify_logits_for_top_k_filtering(logits, top_k):
"""Set the logits for none top-k values to -inf."""
filter_ = logits < torch.topk(logits, top_k)[0][..., -1, None]
logits.masked_fill_(filter_, float('-Inf'))
def modify_logits_for_top_p_filtering(logits, top_p):
"""Set the logits for none top-p values to -inf."""
# First sort and calculate cumulative sum of probabilities.
sorted_logits, sorted_indices = torch.sort(logits, descending=True)
cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
# Filtering based on the cumulative sum.
filter_ = cumulative_probs > top_p
# This shift by 1 is weird and I cannot justify it. This existed
# in the original implementation:
# https://github.com/ari-holtzman/degen/blob/master/gen.py
# and I guess it is needed so keeping it for now.
filter_[:, 1:] = filter_[:, :-1].clone()
# Make sure we at least have one token to select from.
filter_[..., 0] = 0
# Fill in the filtered part
filter_ = filter_.scatter(1, sorted_indices, filter_)
logits.masked_fill_(filter_, float('-Inf'))
# Greedy sampling
if top_k == 1:
sampled_logits = torch.argmax(last_token_logits, dim=-1)
else:
last_token_logits = last_token_logits.clone()
if temperature != 1.0:
last_token_logits.div_(temperature)
if top_k > 1:
assert top_k <= last_token_logits.size(1), 'top-k is larger than logit size.'
if vocab_size:
assert top_k < vocab_size, 'top-k is larger than vocab size.'
modify_logits_for_top_k_filtering(last_token_logits, top_k)
elif top_p > 0.0:
modify_logits_for_top_p_filtering(last_token_logits, top_p)
# After filtering, we need to recalculate the distribution.
probabilities = last_token_logits.softmax(dim=-1)
sampled_logits = torch.multinomial(probabilities, num_samples=1).view(-1)
# If vocab size is provided, make sure the samples are in the range [0, vocab-size).
if vocab_size:
sampled_logits = torch.clamp(sampled_logits, min=0, max=(vocab_size - 1))
return sampled_logits
def update_generation_status(
self,
updated_prompts_tokens: torch.Tensor,
generation_started: torch.Tensor,
current_context_end_position: int,
is_generation_done_tensor: torch.Tensor,
generated_sequence_lengths: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""Checks which prompts have reached an end condition
We check which prompts have reached an end condition and set the corresponding
flags of the is_generation_done_tensor to True. The generated sequence lengths
increase as we keep generating, until that prompts hits an end condition. The
generation_started tensor determines which prompts have started generating.
Args:
updated_prompts_tokens (torch.Tensor): The prompts tokens updated with the latest
generated tokens. A tensor of shape [batch_size, max_seq_len]
(i.e max_seq_len = max_prompt_len + tokens_to_generate)
generation_started (torch.Tensor): A boolean tensor of shape [batch_size]. True
indicates the prompt at that index has started generating tokens.
current_context_end_position (int): An integer indicating which position to
extract from the prompts tokens to get the latest generated tokens.
is_generation_done_tensor (torch.Tensor): A boolean tensor of shape [batch_size].
True indicates the prompt at that index has reached end condition.
generated_sequence_lengths (torch.Tensor): A int tensor of shape [batch_size].
Each value represents the generated sequence lengths for that prompt.
Returns:
Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Returns the boolean
is_generation_done_tensor and the generated_sequence_lengths after updating it
"""
latest_samples = updated_prompts_tokens[:, current_context_end_position]
# Make sure we are checking eod criterion only for prompts that have started generating
# (i.e) We only look at the generated tokens and not the input tokens.
reached_eod = (latest_samples == self.tokenizer.eod) & generation_started
is_generation_done_tensor = is_generation_done_tensor | reached_eod
# We increment generated sequence lengths when that prompt has not hit the
# EOD and generation has started
generated_sequence_lengths += ~is_generation_done_tensor & generation_started
return is_generation_done_tensor, generated_sequence_lengths
def pad_input_prompt_tokens(
self,
batch_prompt_tokens_list: List[List[int]],
max_prompt_length_in_batch: int,
num_tokens_to_generate: int,
) -> torch.Tensor:
"""Method to pad input prompts
Given a list of prompts, pad them all to uniform length
Args:
batch_prompt_tokens_list (List[List[int]]): A list containing the prompt tokens
max_prompt_length_in_batch (int): Maximum of the length of the input prompt tokens
num_tokens_to_generate (int): The number of tokens to generate for each prompt
Returns:
torch.Tensor: A torch tensor of shape [bs, max_seq_len] (i.e)
max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate,
with extra indices for each tensor padded with mask id.
"""
max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate
for prompt_tokens in batch_prompt_tokens_list:
padding_size = max_seq_len - len(prompt_tokens)
prompt_tokens.extend([self.tokenizer.eod] * padding_size)
return torch.tensor(batch_prompt_tokens_list).cuda()
def generate_output_tokens_dynamic_batch(
self, active_requests: OrderedDict[int, InferenceRequest]
) -> OrderedDict[int, InferenceRequest]:
"""Utility to generate the output tokens and probabilities for the prompts
This utility generates the output tokens for a dynamic batch. It will run one forward step
at a time, and pass control back to the engine, which will update the request pool and call
this method again.
Args:
active_requests (OrderedDict[int, InferenceRequest]): The input active requests.
Returns:
OrderedDict[int, InferenceRequest]: The result for each of the incoming requests
after running one forward step.
"""
raise Exception("Not implemented yet")
def generate_all_output_tokens_static_batch(
self, active_requests: OrderedDict[int, InferenceRequest]
) -> OrderedDict[int, InferenceRequest]:
"""Utility to generate the all the output tokens and probabilities for the prompts .
This utility generates the output tokens for a static batch. It runs the forward steps till
all prompts complete generation, updates the status of these requests to completed, adds
the generated result and returns these requests
Args:
active_requests (OrderedDict[int, InferenceRequest]): The input active requests.
Returns:
OrderedDict[int, InferenceRequest]: The result for each of the incoming requests
"""
batch_prompt_tokens_list = list(
map(lambda request: request.prompt_tokens, active_requests.values())
)
prompt_lengths_in_batch = torch.tensor(
[len(prompt_tokens) for prompt_tokens in batch_prompt_tokens_list]
).cuda()
max_prompt_length_in_batch = max(prompt_lengths_in_batch)
min_prompt_length_in_batch = min(prompt_lengths_in_batch)
# For batch inference the inference params are the same for all requests
common_inference_params: CommonInferenceParams = list(active_requests.values())[
0
].inference_parameters
# max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate
batch_prompt_tokens = self.pad_input_prompt_tokens(
batch_prompt_tokens_list,
max_prompt_length_in_batch=max_prompt_length_in_batch,
num_tokens_to_generate=common_inference_params.num_tokens_to_generate,
)
batch_size, max_sequence_length = batch_prompt_tokens.shape
# Pre allocate log probs tensor
output_log_probs = None
if common_inference_params.return_log_probs:
output_log_probs = torch.empty(
(batch_size, max_sequence_length - 1), dtype=torch.float32
).cuda()
# An array to check which of the prompts have reached end of generation condition
is_generation_done_tensor = torch.zeros(batch_size, dtype=torch.bool).cuda()
# An array to act as a counter to keep track of generated sequence lengths
generated_sequence_lengths = torch.zeros(batch_size).cuda()
with torch.no_grad():
self.prep_model_for_inference(
prompts_tokens=batch_prompt_tokens, active_requests=active_requests
)
context_start_position = 0
# Pick the context window that we need to pass through the network.
for context_end_position in range(min_prompt_length_in_batch, max_sequence_length):
inference_input = self.inference_wrapped_model.get_batch_for_context_window(
context_start_position, context_end_position
)
# Returns the final logits of shape [batch_size, context_length, vocab_size]
# Note: This is returned in all TP ranks or last PP stage in PP models
logits = self.inference_wrapped_model.run_one_forward_step(inference_input)
if self.model_is_pipeline_parallel:
context_length = context_end_position - context_start_position
logits = broadcast_from_last_pipeline_stage(
[batch_size, context_length, self.tokenizer.vocab_size],
dtype=self.inference_wrapped_model.inference_wrapper_config.params_dtype,
tensor=logits,
)
# Indicates which of the input prompts have started generating tokens.
# A 1D boolean tensor with [batch_size] elements (i.e) The shortest
# prompts will start generating first and so on
generation_started = prompt_lengths_in_batch <= context_end_position
last_token_logits = logits[:, -1, :]
sampled_logits = self.sample_from_logits(
last_token_logits, common_inference_params, self.tokenizer.vocab_size
)
# Substitute the sampled logits only for the prompts that
# have started generating tokens
batch_prompt_tokens[generation_started, context_end_position] = sampled_logits[
generation_started
]
if common_inference_params.return_log_probs:
log_probs = F.log_softmax(logits, dim=2)
indices = torch.unsqueeze(
batch_prompt_tokens[
:, (context_start_position + 1) : (context_end_position + 1)
],
2,
)
# Get the log probabilities for only the prompt tokens
output_log_probs[:, context_start_position:context_end_position] = torch.gather(
log_probs, 2, indices
).squeeze(2)
context_start_position = context_end_position
# Check end of generation status for each tensor
# and update generated sequence lengths
(is_generation_done_tensor, generated_sequence_lengths) = (
self.update_generation_status(
updated_prompts_tokens=batch_prompt_tokens,
generation_started=generation_started,
current_context_end_position=context_end_position,
is_generation_done_tensor=is_generation_done_tensor,
generated_sequence_lengths=generated_sequence_lengths,
)
)
# Boolean flag indicating if all prompts are finished
all_prompts_done = torch.all(is_generation_done_tensor)
if all_prompts_done:
break
# Include all the generated tokens
batch_prompt_tokens_with_generations = batch_prompt_tokens[:, : (context_end_position + 1)]
if common_inference_params.return_log_probs:
output_log_probs = output_log_probs[:, :context_end_position]
generated_sequence_lengths[
generated_sequence_lengths > common_inference_params.num_tokens_to_generate
] = common_inference_params.num_tokens_to_generate
for idx, request in enumerate(active_requests.values()):
input_prompt_length = int(prompt_lengths_in_batch[idx])
# Shorter prompts might have generated more than required tokens. So we trim them down
required_sequence_length = int(
min(generated_sequence_lengths[idx], common_inference_params.num_tokens_to_generate)
)
# Extract only the generated tokens
required_result_tokens = batch_prompt_tokens_with_generations[
idx, input_prompt_length : (input_prompt_length + required_sequence_length)
]
request.generated_length = required_sequence_length
request.generated_tokens = required_result_tokens
request.generated_log_probs = (
None
if output_log_probs is None
else output_log_probs[idx, input_prompt_length:required_sequence_length]
)
request.status = Status.COMPLETED
request.generated_text = self.detokenize_generations(required_result_tokens)
return active_requests
def prep_model_for_inference(
self, prompts_tokens: torch.Tensor, active_requests: OrderedDict[int, InferenceRequest]
):
"""Preparing batch for inference, using respective wrapper's prep_model_for_inference method
Args:
prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length]
active_requests (OrderedDict[int, InferenceRequest]): The input active requests
"""
self.inference_wrapped_model.prep_model_for_inference(prompts_tokens=prompts_tokens)
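Because simple_text_generation_controller.py now just re-exports the renamed class (see the aliased import with the noqa marker above), existing import paths keep resolving. A small sketch of the compatibility this buys:

# Old path: still valid, resolved through the backward-compatibility alias.
from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import (
    SimpleTextGenerationController,
)

# New path: preferred going forward.
from megatron.core.inference.text_generation_controllers.text_generation_controller import (
    TextGenerationController,
)

assert SimpleTextGenerationController is TextGenerationController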
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from typing import List, OrderedDict, Tuple
import torch
import torch.nn.functional as F
from megatron.core import parallel_state
from megatron.core.inference.communication_utils import broadcast_from_last_pipeline_stage
from megatron.core.inference.inference_request import InferenceRequest, Status
from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import (
AbstractModelInferenceWrapper,
)
from megatron.core.inference.sampling_params import SamplingParams
class TextGenerationController:
"""The text generation controller (the main sampling loop)
This class tokenizes the input, runs inference, samples from logits, and detokenizes the output.
Args:
inference_wrapped_model (AbstractModelInferenceWrapper): A model that
is wrapped using the specs given in the abstract_model_inference_wrapper.py
tokenizer (_type_): Tokenizer used for tokenizing and detokenizing the prompts
"""
def __init__(self, inference_wrapped_model: AbstractModelInferenceWrapper, tokenizer):
self.inference_wrapped_model = inference_wrapped_model
self.tokenizer = tokenizer
# For models without pipeline parallelism, is_first_stage and is_last_stage return True
self.model_is_pipeline_parallel = not (
parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage()
)
def tokenize_prompt(
self, prompt: str, add_BOS: bool = False
) -> Tuple[torch.Tensor, torch.Tensor]:
"""Utility to tokenize the input prompts
Args:
prompt (str): The input prompt
Returns:
torch.Tensor: Returns the tokenized prompt
"""
prompt_tokens = self.tokenizer.tokenize(prompt)
if add_BOS:
prompt_tokens = [self.tokenizer.bos] + prompt_tokens
return prompt_tokens
def detokenize_generations(self, prompt_tokens_with_generated_tokens: torch.Tensor) -> str:
"""Detokenize the output generations
Args:
prompt_tokens_with_generated_tokens (torch.Tensor): The input prompt
tokens plus the generated tokens
Returns:
str: The detokenized output
"""
tokens = prompt_tokens_with_generated_tokens.cpu().numpy().tolist()
return self.tokenizer.detokenize(tokens)
def sample_from_logits(
self,
last_token_logits: torch.Tensor,
sampling_params: SamplingParams = None,
vocab_size: int = None,
**kwargs
) -> torch.Tensor:
"""Samples the logits to generate outputs
Given the logits of the last token, this function samples it
according to the parameters defined in sampling_params
and returns the samples
Args:
last_token_logits (torch.Tensor): The last token logits. A tensor of
size [batch_size, vocab_size]
sampling_params (SamplingParams): The parameters to use for inference.
vocab_size (int): Obtained from the tokenizer. Defaults to None
Returns:
torch.Tensor: 1D tensor of the sampled logits with [batch_size] elements
"""
if kwargs.get('common_inference_params'):
sampling_params = kwargs['common_inference_params']
top_p = sampling_params.top_p
top_k = sampling_params.top_k
temperature = sampling_params.temperature
assert not (top_k > 0 and top_p > 0), 'Cannot have top-p and top-k both greater than zero'
assert top_p <= 1.0, 'top-p should be in (0,1]'
def modify_logits_for_top_k_filtering(logits, top_k):
"""Set the logits for none top-k values to -inf."""
filter_ = logits < torch.topk(logits, top_k)[0][..., -1, None]
logits.masked_fill_(filter_, float('-Inf'))
def modify_logits_for_top_p_filtering(logits, top_p):
"""Set the logits for none top-p values to -inf."""
# First sort and calculate cumulative sum of probabilities.
sorted_logits, sorted_indices = torch.sort(logits, descending=True)
cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
# Filtering based on the cumulative sum.
filter_ = cumulative_probs > top_p
# This shift by 1 is weird and I cannot justify it. This existed
# in the original implementation:
# https://github.com/ari-holtzman/degen/blob/master/gen.py
# and I guess it is needed so keeping it for now.
filter_[:, 1:] = filter_[:, :-1].clone()
# Make sure we at least have one token to select from.
filter_[..., 0] = 0
# Fill in the filtered part
filter_ = filter_.scatter(1, sorted_indices, filter_)
logits.masked_fill_(filter_, float('-Inf'))
# Greedy sampling
if top_k == 1:
sampled_logits = torch.argmax(last_token_logits, dim=-1)
else:
last_token_logits = last_token_logits.clone()
if temperature != 1.0:
last_token_logits.div_(temperature)
if top_k > 1:
assert top_k <= last_token_logits.size(1), 'top-k is larger than logit size.'
if vocab_size:
assert top_k < vocab_size, 'top-k is larger than vocab size.'
modify_logits_for_top_k_filtering(last_token_logits, top_k)
elif top_p > 0.0:
modify_logits_for_top_p_filtering(last_token_logits, top_p)
# After filtering, we need to recalculate the distribution.
probabilities = last_token_logits.softmax(dim=-1)
sampled_logits = torch.multinomial(probabilities, num_samples=1).view(-1)
# If vocab size is provided, make sure the samples are in the range [0, vocab-size).
if vocab_size:
sampled_logits = torch.clamp(sampled_logits, min=0, max=(vocab_size - 1))
return sampled_logits
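A minimal illustration of the sampling modes this method supports, assuming a TextGenerationController instance named `controller`; the logits are random and only demonstrate shapes and parameter combinations:

import torch
from megatron.core.inference.sampling_params import SamplingParams

# Dummy last-token logits for a batch of 4 prompts over a 32,000-token vocabulary.
last_token_logits = torch.randn(4, 32000).cuda()

# Greedy decoding: top_k == 1 takes the argmax token for each row.
greedy_tokens = controller.sample_from_logits(last_token_logits, SamplingParams(top_k=1))

# Nucleus sampling: top_p > 0 (with top_k left at 0) keeps the smallest set of tokens
# whose cumulative probability exceeds 0.9, then samples from the renormalized set.
nucleus_tokens = controller.sample_from_logits(
    last_token_logits, SamplingParams(top_p=0.9, temperature=0.8), vocab_size=32000
)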
def update_generation_status(
self,
updated_prompts_tokens: torch.Tensor,
generation_started: torch.Tensor,
current_context_end_position: int,
is_generation_done_tensor: torch.Tensor,
generated_sequence_lengths: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""Checks which prompts have reached an end condition
We check which prompts have reached an end condition and set the corresponding
flags of the is_generation_done_tensor to True. The generated sequence lengths
increase as we keep generating, until that prompts hits an end condition. The
generation_started tensor determines which prompts have started generating.
Args:
updated_prompts_tokens (torch.Tensor): The prompts tokens updated with the latest
generated tokens. A tensor of shape [batch_size, max_seq_len]
(i.e max_seq_len = max_prompt_len + tokens_to_generate)
generation_started (torch.Tensor): A boolean tensor of shape [batch_size]. True
indicates the prompt at that index has started generating tokens.
current_context_end_position (int): An integer indicating which position to
extract from the prompts tokens to get the latest generated tokens.
is_generation_done_tensor (torch.Tensor): A boolean tensor of shape [batch_size].
True indicates the prompt at that index has reached end condition.
generated_sequence_lengths (torch.Tensor): A int tensor of shape [batch_size].
Each value represents the generated sequence lengths for that prompt.
Returns:
Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Returns the boolean
is_generation_done_tensor and the generated_sequence_lengths after updating it
"""
latest_samples = updated_prompts_tokens[:, current_context_end_position]
# Make sure we are checking eod criterion only for prompts that have started generating
# (i.e) We only look at the generated tokens and not the input tokens.
reached_eod = (latest_samples == self.tokenizer.eod) & generation_started
is_generation_done_tensor = is_generation_done_tensor | reached_eod
# We increment generated sequence lengths when that prompt has not hit the
# EOD and generation has started
generated_sequence_lengths += ~is_generation_done_tensor & generation_started
return is_generation_done_tensor, generated_sequence_lengths
def pad_input_prompt_tokens(
self,
batch_prompt_tokens_list: List[List[int]],
max_prompt_length_in_batch: int,
num_tokens_to_generate: int,
) -> torch.Tensor:
"""Method to pad input prompts
Given a list of prompts, pad them all to uniform length
Args:
batch_prompt_tokens_list (List[List[int]]): A list containing the prompt tokens
max_prompt_length_in_batch (int): Maximum of the length of the input prompt tokens
num_tokens_to_generate (int): The number of tokens to generate for each prompt
Returns:
torch.Tensor: A torch tensor of shape [bs, max_seq_len] (i.e)
max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate,
with extra indices for each tensor padded with mask id.
"""
max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate
for prompt_tokens in batch_prompt_tokens_list:
padding_size = max_seq_len - len(prompt_tokens)
prompt_tokens.extend([self.tokenizer.eod] * padding_size)
return torch.tensor(batch_prompt_tokens_list).cuda()
def generate_output_tokens_dynamic_batch(
self, active_requests: OrderedDict[int, InferenceRequest]
) -> OrderedDict[int, InferenceRequest]:
"""Utility to generate the output tokens and probabilities for the prompts
This utility generates the output tokens for a dynamic batch. It will run one forward step
at a time, and pass control back to the engine, which will update the request pool and call
this method again.
Args:
active_requests (OrderedDict[int, InferenceRequest]): The input active requests.
Returns:
OrderedDict[int, InferenceRequest]: The result for each of the incoming requests
after running one forward step.
"""
raise Exception("Not implemented yet")
def generate_all_output_tokens_static_batch(
self, active_requests: OrderedDict[int, InferenceRequest]
) -> OrderedDict[int, InferenceRequest]:
"""Utility to generate the all the output tokens and probabilities for the prompts .
This utility generates the output tokens for a static batch. It runs the forward steps till
all prompts complete generation, updates the status of these requests to completed, adds
the generated result and returns these requests
Args:
active_requests (OrderedDict[int, InferenceRequest]): The input active requests.
Returns:
OrderedDict[int, InferenceRequest]: The result for each of the incoming requests
"""
batch_prompt_tokens_list = list(
map(lambda request: request.prompt_tokens, active_requests.values())
)
prompt_lengths_in_batch = torch.tensor(
[len(prompt_tokens) for prompt_tokens in batch_prompt_tokens_list]
).cuda()
max_prompt_length_in_batch = max(prompt_lengths_in_batch)
min_prompt_length_in_batch = min(prompt_lengths_in_batch)
# For batch inference the inference params are the same for all requests
sampling_params: SamplingParams = list(active_requests.values())[0].inference_parameters
# max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate
batch_prompt_tokens = self.pad_input_prompt_tokens(
batch_prompt_tokens_list,
max_prompt_length_in_batch=max_prompt_length_in_batch,
num_tokens_to_generate=sampling_params.num_tokens_to_generate,
)
batch_size, max_sequence_length = batch_prompt_tokens.shape
# Pre allocate log probs tensor
output_log_probs = None
if sampling_params.return_log_probs:
output_log_probs = torch.empty(
(batch_size, max_sequence_length - 1), dtype=torch.float32
).cuda()
# An array to check which of the prompts have reached end of generation condition
is_generation_done_tensor = torch.zeros(batch_size, dtype=torch.bool).cuda()
# An array to act as a counter to keep track of generated sequence lengths
generated_sequence_lengths = torch.zeros(batch_size).cuda()
with torch.no_grad():
self.prep_model_for_inference(
prompts_tokens=batch_prompt_tokens, active_requests=active_requests
)
context_start_position = 0
# Pick the context window that we need to pass through the network.
for context_end_position in range(min_prompt_length_in_batch, max_sequence_length):
inference_input = self.inference_wrapped_model.get_batch_for_context_window(
context_start_position, context_end_position
)
# Returns the final logits of shape [batch_size, context_length, vocab_size]
# Note: This is returned in all TP ranks or last PP stage in PP models
logits = self.inference_wrapped_model.run_one_forward_step(inference_input)
if self.model_is_pipeline_parallel:
context_length = context_end_position - context_start_position
logits = broadcast_from_last_pipeline_stage(
[batch_size, context_length, self.tokenizer.vocab_size],
dtype=self.inference_wrapped_model.inference_wrapper_config.params_dtype,
tensor=logits,
)
# Indicates which of the input prompts have started generating tokens.
# A 1D boolean tensor with [batch_size] elements (i.e) The shortest
# prompts will start generating first and so on
generation_started = prompt_lengths_in_batch <= context_end_position
last_token_logits = logits[:, -1, :]
sampled_logits = self.sample_from_logits(
last_token_logits, sampling_params, self.tokenizer.vocab_size
)
# Substitute the sampled logits only for the prompts that
# have started generating tokens
batch_prompt_tokens[generation_started, context_end_position] = sampled_logits[
generation_started
]
if sampling_params.return_log_probs:
log_probs = F.log_softmax(logits, dim=2)
indices = torch.unsqueeze(
batch_prompt_tokens[
:, (context_start_position + 1) : (context_end_position + 1)
],
2,
)
# Get the log probabilities for only the prompt tokens
output_log_probs[:, context_start_position:context_end_position] = torch.gather(
log_probs, 2, indices
).squeeze(2)
context_start_position = context_end_position
# Check end of generation status for each tensor
# and update generated sequence lengths
(is_generation_done_tensor, generated_sequence_lengths) = (
self.update_generation_status(
updated_prompts_tokens=batch_prompt_tokens,
generation_started=generation_started,
current_context_end_position=context_end_position,
is_generation_done_tensor=is_generation_done_tensor,
generated_sequence_lengths=generated_sequence_lengths,
)
)
# Boolean flag indicating if all prompts are finished
all_prompts_done = torch.all(is_generation_done_tensor)
if all_prompts_done:
break
# Include all the generated tokens
batch_prompt_tokens_with_generations = batch_prompt_tokens[:, : (context_end_position + 1)]
if sampling_params.return_log_probs:
output_log_probs = output_log_probs[:, :context_end_position]
generated_sequence_lengths[
generated_sequence_lengths > sampling_params.num_tokens_to_generate
] = sampling_params.num_tokens_to_generate
for idx, request in enumerate(active_requests.values()):
input_prompt_length = int(prompt_lengths_in_batch[idx])
# Shorter prompts might have generated more than required tokens. So we trim them down
required_sequence_length = int(
min(generated_sequence_lengths[idx], sampling_params.num_tokens_to_generate)
)
# Extract only the generated tokens
required_result_tokens = batch_prompt_tokens_with_generations[
idx, input_prompt_length : (input_prompt_length + required_sequence_length)
]
request.generated_length = required_sequence_length
request.generated_tokens = required_result_tokens
request.generated_log_probs = (
None
if output_log_probs is None
else output_log_probs[idx, input_prompt_length:required_sequence_length]
)
request.status = Status.COMPLETED
request.generated_text = self.detokenize_generations(required_result_tokens)
return active_requests
def prep_model_for_inference(
self, prompts_tokens: torch.Tensor, active_requests: OrderedDict[int, InferenceRequest]
):
"""Preparing batch for inference, using respective wrapper's prep_model_for_inference method
Args:
prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length]
active_requests (OrderedDict[int, InferenceRequest]): The input active requests
"""
self.inference_wrapped_model.prep_model_for_inference(prompts_tokens=prompts_tokens)
File mode changed from 100755 to 100644
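Putting the renamed pieces together, a hedged end-to-end sketch: `inference_wrapped_model` and `tokenizer` are assumed to have been built elsewhere (for example via a GPT inference wrapper), and the `MCoreEngine` import path is assumed from the surrounding package layout.

from megatron.core.inference.engines.mcore_engine import MCoreEngine
from megatron.core.inference.sampling_params import SamplingParams
from megatron.core.inference.text_generation_controllers.text_generation_controller import (
    TextGenerationController,
)

# `inference_wrapped_model` and `tokenizer` are assumed to exist already.
controller = TextGenerationController(
    inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer
)
engine = MCoreEngine(text_generation_controller=controller, max_batch_size=8)

results = engine.generate(
    prompts=["The capital of France is"],
    sampling_params=SamplingParams(top_k=1, num_tokens_to_generate=16),
)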