Commit 160bf237 authored by wangxj

Update 0.12

parent b01809dd
Pipeline #2448 failed with stages
......@@ -14,6 +14,10 @@ def broadcast_from_last_pipeline_stage(size, dtype, tensor=None):
"""Broadcast a tensor from last pipeline stage to all ranks."""
if parallel_state.is_pipeline_last_stage():
assert size == list(
tensor.shape
), f"Expected tensor of shape {size} but got {list(tensor.shape)}"
assert dtype == tensor.dtype, f"Expected tensor of type {dtype} but got {tensor.dtype}"
_is_cuda(tensor)
assert tensor.is_contiguous()
else:
......
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from typing import Dict, List
import asyncio
import warnings
from collections import OrderedDict
from typing import AsyncGenerator, Dict, List, Optional, Union
import torch
from megatron.core.inference.common_inference_params import CommonInferenceParams
from megatron.core.inference.async_stream import AsyncStream
from megatron.core.inference.engines.abstract_engine import AbstractEngine
from megatron.core.inference.inference_request import InferenceRequest
from megatron.core.inference.sampling_params import SamplingParams
from megatron.core.inference.scheduler import Scheduler
from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import (
SimpleTextGenerationController,
from megatron.core.inference.text_generation_controllers.text_generation_controller import (
TextGenerationController,
)
......@@ -19,31 +23,105 @@ class MCoreEngine(AbstractEngine):
Supports any model that is callable (accepts the inputs and outputs the tensor)
Args:
text_generation_controller (SimpleTextGenerationController): A text generation
text_generation_controller (TextGenerationController): A text generation
controller that will be used to define how to preprocess prompts, generate
outputs, and detokenize the output tokens.
max_batch_size : The maximum number of requests to process at once
max_batch_size (int, optional): The maximum number of requests to process at once.
Will be set from the InferenceWrapperConfig in `text_generation_controller` by
default.
random_seed (int, optional): Use a random seed if you want deterministic
results. Defaults to None.
"""
def __init__(
self,
text_generation_controller: SimpleTextGenerationController,
max_batch_size,
random_seed: int = None,
text_generation_controller: TextGenerationController,
max_batch_size: Optional[int] = None,
random_seed: Optional[int] = None,
):
inference_wrapper_config = (
text_generation_controller.inference_wrapped_model.inference_wrapper_config
)
inference_max_batch_size = inference_wrapper_config.inference_max_requests
if max_batch_size is None:
max_batch_size = inference_max_batch_size
elif max_batch_size > inference_max_batch_size:
warnings.warn(
f"Engine `max_batch_size` ({max_batch_size}) > "
f"`inference_max_requests` in `inference_wrapper_config` "
f"({inference_max_batch_size}); setting `max_batch_size` to "
f"{inference_max_batch_size}",
UserWarning,
)
max_batch_size = inference_max_batch_size
self.text_generation_controller = text_generation_controller
self.random_seed = random_seed
self.scheduler = Scheduler(max_batch_size=max_batch_size)
def get_new_request_id(self) -> str:
"""Gets a new request id from the scheduler"""
return self.scheduler.get_new_request_id()
def add_request(
self,
prompt: Optional[str] = None,
add_BOS: bool = False,
encoder_prompt: Optional[str] = None,
inference_parameters: Optional[SamplingParams] = None,
streaming: bool = False,
inference_request: Optional[InferenceRequest] = None,
) -> str:
"""
Adds a request to the scheduler and returns the request ID.
Args:
prompt (str): A prompt string
add_BOS (bool): Whether to add BOS token to beginning of the prompt
encoder_prompt (str): The encoder prompt string
inference_parameters (SamplingParams): The inference parameters
streaming (bool): Whether to stream incremental outputs for this request
inference_request (InferenceRequest, optional): A fully constructed request.
Defaults to None.
Returns:
The newly created request ID.
"""
assert (
prompt is not None or inference_request is not None
), f"At least one of `prompt` or `inference_request` must be specified"
if inference_request is None:
prompt_tokens = self.text_generation_controller.tokenize_prompt(prompt, add_BOS)
else:
prompt_tokens = inference_request.prompt_tokens
return self.scheduler.add_request(
prompt=prompt,
prompt_tokens=prompt_tokens,
encoder_prompt=encoder_prompt,
inference_parameters=inference_parameters,
streaming=streaming,
inference_request=inference_request,
)
def get_stream_generator(
self, request_id: str
) -> Union[AsyncGenerator[InferenceRequest, None], None]:
"""Returns the stream generator for the given request ID if it exists."""
stream = self.scheduler.streams.get(request_id, None)
if stream is not None:
return stream.generator()
return None
def generate(
self,
prompts: List[str],
prompts: Optional[List[str]] = None,
add_BOS: bool = False,
encoder_prompts: List[str] = None,
common_inference_params: CommonInferenceParams = None,
) -> dict:
encoder_prompts: Optional[List[str]] = None,
common_inference_params: Optional[SamplingParams] = None,
sampling_params: Optional[SamplingParams] = None,
inference_requests: Optional[List[InferenceRequest]] = None,
) -> List[InferenceRequest]:
"""The megatron core inference backend generate function
This backend returns the output generations as a list of inference requests.
......@@ -54,31 +132,47 @@ class MCoreEngine(AbstractEngine):
prompts (List[str]): All the prompts as a list of strings
add_BOS (bool): Whether to add BOS token to beginning of prompts
encoder_prompts (List[str]): All the encoder prompts as a list of strings
common_inference_params (CommonInferenceParams): The inference parameters
common_inference_params: Deprecated. Only used for backward compatibility with
MCore <= 0.9.0. Use `sampling_params` going forward.
sampling_params (SamplingParams): The request-level sampling parameters
inference_requests (List[InferenceRequest]): A pre-populated list of inference requests
Returns:
List[InferenceRequest]: The output is list of inference requests containing the
generated tokens, texts and log probs if required
"""
# TODO (mcore): get rng state tracker
request_ids: List[str] = []
if self.random_seed:
torch.random.manual_seed(self.random_seed)
for i in range(len(prompts)):
prompt = prompts[i]
encoder_prompt = encoder_prompts[i] if encoder_prompts is not None else None
prompt_tokens = self.text_generation_controller.tokenize_prompt(prompt, add_BOS)
if inference_requests is None:
assert prompts is not None
self.scheduler.add_request(
prompt=prompt,
prompt_tokens=prompt_tokens,
encoder_prompt=encoder_prompt,
inference_parameters=common_inference_params,
)
if common_inference_params:
sampling_params = common_inference_params
for i in range(len(prompts)):
prompt = prompts[i]
encoder_prompt = encoder_prompts[i] if encoder_prompts is not None else None
request_id = self.add_request(
prompt=prompt,
encoder_prompt=encoder_prompt,
inference_parameters=sampling_params,
)
request_ids.append(request_id)
else:
for inference_request in inference_requests:
request_ids.append(inference_request.request_id)
self.scheduler.add_request(inference_request=inference_request)
self.run_engine()
result: List[InferenceRequest] = self.scheduler.completed_request_pool.values()
result: List[InferenceRequest] = [
self.scheduler.completed_request_pool[request_id] for request_id in request_ids
]
return result
def run_engine(self):
......@@ -92,10 +186,15 @@ class MCoreEngine(AbstractEngine):
Defaults to False.
"""
while self.scheduler.have_requests_pending():
active_requests: Dict[int, InferenceRequest] = self.scheduler.active_request_pool.copy()
result_dict: Dict[int, InferenceRequest] = (
active_requests: Dict[str, InferenceRequest] = self.scheduler.active_request_pool.copy()
active_streams: Dict[str, AsyncStream] = OrderedDict()
for request_id in active_requests:
if (stream := self.scheduler.streams.get(request_id, None)) is not None:
assert isinstance(stream, AsyncStream), stream
active_streams[request_id] = stream
result_dict: Dict[str, InferenceRequest] = (
self.text_generation_controller.generate_all_output_tokens_static_batch(
active_requests
active_requests, active_streams
)
)
......@@ -105,9 +204,25 @@ class MCoreEngine(AbstractEngine):
"""
if dynamic_batching:
result_dict: Dict[
int, InferenceRequest
str, InferenceRequest
] = self.text_generation_controller.generate_output_tokens_one_step_dynamic_batch(
active_requests
)
self.scheduler.update_requests_pools(result_dict=result_dict)
"""
def _wrapped_run_engine(self, cuda_device):
"""
Explicitly sets the CUDA device before running the engine.
This is to ensure that the CUDA device is correctly propagated when running
in a new thread context.
"""
torch.cuda.set_device(cuda_device)
self.run_engine()
async def run_engine_async(self):
"""Runs the engine asynchronously using asyncio"""
loop = asyncio.get_running_loop()
await loop.run_in_executor(None, self._wrapped_run_engine, torch.cuda.current_device())
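# --- Illustrative usage sketch (not part of this commit) ----------------------
# A minimal, hedged example of driving the engine API above: queue a streaming
# request, consume its async stream, and run the blocking engine loop in a
# worker thread via run_engine_async(). `build_text_generation_controller()` is
# a hypothetical helper standing in for however the controller is constructed;
# everything else uses only names that appear in this diff.
import asyncio

from megatron.core.inference.engines.mcore_engine import MCoreEngine
from megatron.core.inference.sampling_params import SamplingParams


async def stream_one_prompt(controller) -> None:
    engine = MCoreEngine(text_generation_controller=controller)
    request_id = engine.add_request(
        prompt="Hello, world!",
        inference_parameters=SamplingParams(temperature=0.7, num_tokens_to_generate=64),
        streaming=True,
    )
    stream = engine.get_stream_generator(request_id)

    # Run the engine loop off the event loop thread while consuming the stream.
    engine_task = asyncio.create_task(engine.run_engine_async())
    async for partial_request in stream:
        print(partial_request.generated_text)
    await engine_task


# asyncio.run(stream_one_prompt(build_text_generation_controller()))  # hypothetical helper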
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from dataclasses import dataclass
from enum import Enum
from typing import List
from typing import List, Optional
import torch
from megatron.core.inference.common_inference_params import CommonInferenceParams
from megatron.core.inference.sampling_params import SamplingParams
# class syntax
......@@ -18,7 +18,7 @@ class Status(Enum):
COMPLETED = 4
@dataclass
@dataclass(kw_only=True)
class InferenceRequest:
"""Class for one inference request
......@@ -28,12 +28,25 @@ class InferenceRequest:
request_id: str
prompt: str
inference_parameters: CommonInferenceParams
prompt_tokens: List[int]
arrival_time: float
status: Status
encoder_prompt: str = None
generated_text: str = None
generated_tokens: torch.Tensor = None
generated_log_probs: torch.Tensor = None
generated_length: int = 0
inference_parameters: Optional[SamplingParams] = None
prompt_tokens: Optional[List[int]] = None
arrival_time: Optional[float] = None
status: Optional[Status] = None
encoder_prompt: Optional[str] = None
generated_text: Optional[str] = None
segments: Optional[List[str]] = None
generated_segments: Optional[List[str]] = None
generated_sequence_lengths: Optional[List[int]] = None
generated_tokens: Optional[torch.Tensor] = None
generated_log_probs: Optional[torch.Tensor] = None
generated_length: Optional[int] = None
@dataclass(kw_only=True)
class VLMInferenceRequest(InferenceRequest):
"""Class for a VLM inference request"""
num_img_embeddings_per_tile: int
imgs: torch.Tensor
num_tiles: torch.Tensor
decoder_seq_length: int
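# --- Illustrative sketch (not part of this commit) -----------------------------
# With `@dataclass(kw_only=True)` and the new Optional defaults above, a request
# is built with keyword arguments only (positional construction raises TypeError),
# and everything beyond request_id/prompt may be filled in later by the engine.
from megatron.core.inference.inference_request import InferenceRequest
from megatron.core.inference.sampling_params import SamplingParams

request = InferenceRequest(
    request_id="0",
    prompt="Hello, world!",
    inference_parameters=SamplingParams(num_tokens_to_generate=16),
)
assert request.prompt_tokens is None  # tokenized later by the controller/scheduler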
File mode changed from 100755 to 100644
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import abc
import math
from typing import Iterable, List, Union
from typing import Any, Dict, Iterable, Optional, Union
import torch
......@@ -26,7 +26,7 @@ class AbstractModelInferenceWrapper(abc.ABC):
def __init__(
self,
model: Union['LegacyGPTModel', GPTModel],
model: Union['LegacyGPTModel', GPTModel], # type: ignore[name-defined]
inference_wrapper_config: InferenceWrapperConfig,
):
"""Constructor for the model inference wrapper
......@@ -48,10 +48,15 @@ class AbstractModelInferenceWrapper(abc.ABC):
else self.inference_wrapper_config.params_dtype
)
max_batch_size = self.inference_wrapper_config.inference_max_requests
max_sequence_length = self.inference_wrapper_config.inference_max_seq_length
self.inference_params = InferenceParams(max_batch_size, max_sequence_length)
def prep_model_for_inference(self, prompts_tokens: torch.Tensor):
"""A utility function for preparing model for inference
The function gets called once before the auto regressive inference loop. It puts the model in eval mode , and gets some model and inference data parameters. Extend this to build position ids ,attention mask etc, so that required slices can be extracted during the forward pass.
The function gets called once before the auto regressive inference loop.
It puts the model in eval mode.
Args:
prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_seq_len]
......@@ -63,38 +68,64 @@ class AbstractModelInferenceWrapper(abc.ABC):
self.model_is_pipeline_parallel = not (
parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage()
)
self.prompts_tokens = prompts_tokens
batch_size, max_sequence_length = self.prompts_tokens.shape
self.inference_params = InferenceParams(batch_size, max_sequence_length)
self.inference_params.reset()
@abc.abstractmethod
def prep_inference_input(self, prompts_tokens: torch.Tensor) -> Dict[str, Any]:
"""Prepares the inference input data.
Args:
prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_seq_len]
Returns:
A dict with all the inference input needed for the batch.
"""
raise NotImplementedError()
@abc.abstractmethod
def get_batch_for_context_window(self) -> List:
def get_batch_for_context_window(self, *args, **kwargs) -> Dict[str, Any]:
"""Returns the input data for inference
This function gets called iteratively in the inference loop. It can be used to extract relevant input from the prompt tokens, attention mask, etc., required for each step in inference.
"""
pass
def forward_pass_without_pipeline_parallel(self, inference_input: List) -> torch.Tensor:
"""Utility to carry out simple forward pass for TP or no model parallel models
raise NotImplementedError()
Runs a very simple forward pass for model. Used in the case of models without any parallelism or only tensor parallelism.
def _forward(self, inference_input):
"""Runs a forward pass of the model.
Args:
inference_input (List): A list containing the inputs for the gpt model [tokens, position ids, attention mask]
inference_input(Dict[str, Any]): The input data.
inference_params(InferenceParams): The inference parameters.
Returns:
torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size]
The model output logits.
"""
tokens, position_ids, attention_mask = inference_input
logits = self.model(
tokens = inference_input["tokens"]
position_ids = inference_input["position_ids"]
attention_mask = inference_input["attention_mask"]
return self.model(
tokens, position_ids, attention_mask, inference_params=self.inference_params
)
logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits)
self.inference_params.sequence_len_offset += tokens.size(1)
return logits
def _get_batch_size_and_seq_len(
self, tokens: torch.Tensor, recv_buffer_seq_len: Optional[int] = None
):
"""
Returns the batch size and sequence length based on the tokens tensor and recv_buffer_seq_len.
Args:
tokens (torch.Tensor): The input tensor of shape (batch_size, seq_len).
recv_buffer_seq_len (int, optional): An optional recv buffer sequence length.
Returns:
tuple: A tuple (batch_size, seq_len), where batch_size is the first dimension of tokens
and seq_len is either the second dimension or recv_buffer_seq_len.
"""
batch_size = tokens.shape[0]
seq_len = recv_buffer_seq_len if recv_buffer_seq_len is not None else tokens.shape[1]
return batch_size, seq_len
def _allocate_recv_buffer(self, batch_size, seq_len):
"""Receive happens between the layers with size [seq_len, batch_size, hidden_size]."""
......@@ -103,30 +134,51 @@ class AbstractModelInferenceWrapper(abc.ABC):
recv_size, dtype=self.pipeline_communication_dtype, device=torch.cuda.current_device()
)
def forward_pass_without_pipeline_parallel(
self, inference_input: Dict[str, Any]
) -> torch.Tensor:
"""Utility to carry out simple forward pass for TP or no model parallel models
Runs a very simple forward pass for the model. Used in the case of models without any parallelism or only tensor parallelism.
Args:
inference_input (Dict[str, Any]): A dict containing the inputs for the gpt model [tokens, position ids, attention mask]
Returns:
torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size]
"""
tokens = inference_input["tokens"]
logits = self._forward(inference_input)
logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits)
self.inference_params.sequence_len_offset += tokens.size(1)
return logits
def forward_pass_with_pipeline_parallel_small_input_batch(
self, inference_input: List
self, inference_input: Dict[str, Any], recv_buffer_seq_len: Optional[int] = None
) -> torch.Tensor:
"""Utility to carry out forward pass for PP models with very small inputs
If a model is pipeline parallel yet the input global batch is very small, we compute a forward pass on the entire global batch rather than splitting it up into micro batches and doing something more complex as in the forward_pass_with_pipeline_parallel_large_input_batch method
Args:
inference_input (List): A list containing the inputs for the gpt model [tokens, position ids, attention mask]
inference_input (Dict[str, Any]): A dict containing the inputs for the gpt model [tokens, position ids, attention mask]
recv_buffer_seq_len (int): An optional sequence length for the pipeline parallel recv buffer.
Returns:
torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size]
"""
tokens, position_ids, attention_mask = inference_input
batch_size, seq_len = tokens.shape
tokens = inference_input["tokens"]
position_ids = inference_input["position_ids"]
attention_mask = inference_input["attention_mask"]
batch_size, seq_len = self._get_batch_size_and_seq_len(tokens, recv_buffer_seq_len)
recv_buffer = None
if not parallel_state.is_pipeline_first_stage():
recv_buffer = self._allocate_recv_buffer(batch_size, seq_len)
recv_from_prev_pipeline_rank_(recv_buffer)
self.model.set_input_tensor(recv_buffer)
output_tensor = self.model(
tokens, position_ids, attention_mask, inference_params=self.inference_params
)
output_tensor = self._forward(inference_input)
if not parallel_state.is_pipeline_last_stage():
send_to_next_pipeline_rank(output_tensor.type(dtype=self.pipeline_communication_dtype))
......@@ -138,27 +190,35 @@ class AbstractModelInferenceWrapper(abc.ABC):
logits = output_tensor
logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits)
# Explicitly cast logits to expected dtype
logits = logits.to(self.inference_wrapper_config.params_dtype)
return logits
def forward_pass_with_pipeline_parallel_large_input_batch(
self, inference_input: List
self, inference_input: Dict[str, Any], recv_buffer_seq_len=None
) -> torch.Tensor:
"""Utility to carry out forward pass PP models.
Runs the forward pass for models which are pipeline parallel. This is more complex than forward_pass_with_pipeline_parallel_small_input_batch coz this splits the global batch into small micro batches and runs them through the model.
Runs the forward pass for models which are pipeline parallel.
This is more complex than forward_pass_with_pipeline_parallel_small_input_batch because
this splits the global batch into small micro batches and runs them through the model.
Args:
inference_input (List): A list containg the inputs for the gpt model [tokens, position ids, attention mask]
inference_input (Dict[str, Any]): A dict containing the inputs for the gpt model [tokens, position ids, attention mask]
recv_buffer_seq_len (int): An optional sequence length for the pipeline parallel recv buffer.
Returns:
torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size]
"""
tokens, position_ids, attention_mask = inference_input
tokens = inference_input["tokens"]
position_ids = inference_input["position_ids"]
attention_mask = inference_input["attention_mask"]
micro_batch_size = max(
1,
self.inference_wrapper_config.inference_batch_times_seqlen_threshold // tokens.size(1),
)
batch_size, seq_len = tokens.shape
batch_size, seq_len = self._get_batch_size_and_seq_len(tokens, recv_buffer_seq_len)
# Round up to account for the last partial micro batch if present
num_micro_batches = math.ceil(batch_size / micro_batch_size)
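# --- Illustrative sketch (not part of this commit) -----------------------------
# The micro-batch split above in plain numbers, with an assumed threshold of
# inference_batch_times_seqlen_threshold=2560, seq_len=128 and batch_size=50.
import math

inference_batch_times_seqlen_threshold = 2560  # assumed value for illustration
seq_len, batch_size = 128, 50

micro_batch_size = max(1, inference_batch_times_seqlen_threshold // seq_len)  # -> 20
num_micro_batches = math.ceil(batch_size / micro_batch_size)  # -> 3, last one partial
print(micro_batch_size, num_micro_batches)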
......@@ -167,7 +227,7 @@ class AbstractModelInferenceWrapper(abc.ABC):
if parallel_state.is_pipeline_last_stage():
logits = torch.empty(
(batch_size, seq_len, self.inference_wrapper_config.padded_vocab_size),
dtype=torch.float32,
dtype=self.pipeline_communication_dtype,
device=torch.cuda.current_device(),
)
......@@ -189,8 +249,12 @@ class AbstractModelInferenceWrapper(abc.ABC):
recv_from_prev_pipeline_rank_(recv_buffer)
self.model.set_input_tensor(recv_buffer)
output_tensor = self.model(
tokens2use, position_ids2use, attention_mask, inference_params=self.inference_params
output_tensor = self._forward(
{
"tokens": tokens2use,
"position_ids": position_ids2use,
"attention_mask": attention_mask,
}
)
if not parallel_state.is_pipeline_last_stage():
......@@ -202,8 +266,12 @@ class AbstractModelInferenceWrapper(abc.ABC):
output_tensor = tensor_parallel.gather_from_tensor_model_parallel_region(
output_tensor
)
assert logits is not None
logits[start:end, ...] = output_tensor
# Explicitly cast logits to expected dtype
logits = logits.to(self.inference_wrapper_config.params_dtype)
# Once done with all micro batches, we reset batch size offset and seq len offset
self.inference_params.sequence_len_offset += seq_len
self.inference_params.batch_size_offset = 0
......@@ -211,28 +279,37 @@ class AbstractModelInferenceWrapper(abc.ABC):
# NOTE: Only returns the logits on the last pipeline stage
return logits
def run_one_forward_step(self, inference_input: List) -> torch.Tensor:
def run_one_forward_step(
self, inference_input: Dict[str, Any], recv_buffer_seq_len: Optional[int] = None
) -> torch.Tensor:
"""The forward pass of the model for inference
Appropriate utility is called for the forward pass depending on the type of model parallelism used
Args:
inference_input (List): A list containg the inputs for the gpt model [tokens, position ids, attention mask]
inference_input (Dict[str, Any]): A dict containing the inputs for the gpt model [tokens, position ids, attention mask]
recv_buffer_seq_len (int): An optional sequence length for the pipeline parallel recv buffer.
Returns:
torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size]. The logits are returned only in the last pipeline stage for PP models.
"""
if self.model_is_pipeline_parallel:
tokens = inference_input[0]
current_batch_size, seq_len = tokens.shape
tokens = inference_input["tokens"]
current_batch_size, seq_len = self._get_batch_size_and_seq_len(
tokens, recv_buffer_seq_len
)
# If input batch is large, we need to split into micro batches and run the forward pass
if (
current_batch_size * seq_len
> self.inference_wrapper_config.inference_batch_times_seqlen_threshold
):
return self.forward_pass_with_pipeline_parallel_large_input_batch(inference_input)
return self.forward_pass_with_pipeline_parallel_large_input_batch(
inference_input, recv_buffer_seq_len
)
else:
# If input batch is very small we can do a simple forward pass on the entire global batch
return self.forward_pass_with_pipeline_parallel_small_input_batch(inference_input)
return self.forward_pass_with_pipeline_parallel_small_input_batch(
inference_input, recv_buffer_seq_len
)
else:
return self.forward_pass_without_pipeline_parallel(inference_input)
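# --- Illustrative sketch (not part of this commit) -----------------------------
# The pipeline-parallel dispatch rule above in isolation: the "large input batch"
# path is taken only when batch_size * seq_len exceeds the configured threshold.
# The threshold value here is assumed for illustration.
inference_batch_times_seqlen_threshold = 2560
for batch_size, seq_len in [(4, 128), (64, 128)]:
    use_large_batch_path = batch_size * seq_len > inference_batch_times_seqlen_threshold
    print(batch_size, seq_len, use_large_batch_path)  # False for (4, 128), True for (64, 128)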
File mode changed from 100755 to 100644
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from typing import List, Tuple
from typing import Any, Dict, Tuple
import torch
......@@ -27,19 +27,21 @@ class GPTInferenceWrapper(AbstractModelInferenceWrapper):
"""
super().__init__(model, inference_wrapper_config)
def prep_model_for_inference(self, prompts_tokens: torch.Tensor):
"""A utility function for preparing model for inference
This function is called before the forward pass. It puts the model in eval mode, builds position ids, and creates attention masks so that required slices can be extracted during the forward pass.
def prep_inference_input(self, prompts_tokens: torch.Tensor) -> Dict[str, Any]:
"""Prepares the inference input data.
Args:
prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_seq_len]
"""
super().prep_model_for_inference(prompts_tokens=prompts_tokens)
self.attention_mask, self.position_ids = self._build_attention_mask_and_position_ids(
prompts_tokens
)
Returns:
A dict with all the inference input needed for the batch.
"""
attention_mask, position_ids = self._build_attention_mask_and_position_ids(prompts_tokens)
return {
"tokens": prompts_tokens,
"attention_mask": attention_mask,
"position_ids": position_ids,
}
def _build_attention_mask_and_position_ids(
self, prompts_tokens: torch.Tensor
......@@ -68,23 +70,33 @@ class GPTInferenceWrapper(AbstractModelInferenceWrapper):
return attention_mask, position_ids
def get_batch_for_context_window(
self, context_start_position: int, context_end_position: int
) -> List:
self,
inference_input: Dict[str, Any],
context_start_position: int,
context_end_position: int,
) -> Dict[str, Any]:
"""Returns the inference data given context window
This function gets called iteratively in a loop. Given the start and end context positions, it extracts the appropriate data.
Args:
inference_input (Dict[str, Any]): The inference input for the batch.
context_start_position (int): Start of the context window. During the first inference step it is mostly 0
context_end_position (int): End of the context window. During the last inference step it will mostly be the max generated sequence length.
Returns:
List: A list of inputs that will be used by your model in the forward step
Dict[str, Any]: A dict of inputs that will be used by your model in the forward step
"""
tokens2use = self.prompts_tokens[:, context_start_position:context_end_position]
positions2use = self.position_ids[:, context_start_position:context_end_position]
attention_mask2use = self.attention_mask[
tokens = inference_input["tokens"]
position_ids = inference_input["position_ids"]
attention_mask = inference_input["attention_mask"]
tokens2use = tokens[:, context_start_position:context_end_position]
positions2use = position_ids[:, context_start_position:context_end_position]
attention_mask2use = attention_mask[
..., context_start_position:context_end_position, :context_end_position
]
data_at_step_idx = [tokens2use, positions2use, attention_mask2use]
return data_at_step_idx
return {
"tokens": tokens2use,
"position_ids": positions2use,
"attention_mask": attention_mask2use,
}
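# --- Illustrative sketch (not part of this commit) -----------------------------
# What the slicing in get_batch_for_context_window does for a toy batch: tokens
# and position ids are cut on the sequence dimension, the causal mask on its last
# two dimensions. Shapes only; a plain lower-triangular mask stands in for the
# real mask convention here.
import torch

batch_size, max_seq_len = 2, 8
tokens = torch.randint(0, 100, (batch_size, max_seq_len))
position_ids = torch.arange(max_seq_len).unsqueeze(0).expand(batch_size, -1)
attention_mask = torch.tril(torch.ones(1, 1, max_seq_len, max_seq_len)).bool()

start, end = 0, 5  # the first step consumes the whole prompt prefix
tokens2use = tokens[:, start:end]                          # shape [2, 5]
positions2use = position_ids[:, start:end]                 # shape [2, 5]
attention_mask2use = attention_mask[..., start:end, :end]  # shape [1, 1, 5, 5]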
......@@ -25,6 +25,12 @@ class InferenceWrapperConfig:
"""The final padded vocab size (Padded to make it divisible by
--make-vocab-size-divisible-by value)"""
inference_max_requests: int = 8
""" Maximum number of requests for inference (prefill & decode). Necessary for CUDA graphs. """
inference_max_seq_length: int = 2560
""" Maximum sequence length for inference (prefill & decode). Necessary for CUDA graphs. """
fp32_residual_connection: bool = False
"""Move residual connections to fp32. Obtained from arguments.py"""
......
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from typing import Any, Dict
import torch
from megatron.core import parallel_state
from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
GPTInferenceWrapper,
)
from megatron.core.inference_params import InferenceParams
# pylint: disable=line-too-long
class VLMInferenceWrapper(GPTInferenceWrapper):
"""Inference wrapper for VLMs"""
def prep_model_for_inference(self, prompts_tokens: torch.Tensor):
"""A utility function for preparing model for inference
The function gets called once before the auto regressive inference loop.
It puts the model in eval mode.
Args:
prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_seq_len]
"""
super().prep_model_for_inference(prompts_tokens)
# For TP only model both is_pp_first_stage and _is_pp_last_stage returns True
self.model_is_pipeline_parallel = not (
parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage()
)
self._recv_only_vision_embeds = False
pp_rank = parallel_state.get_pipeline_model_parallel_rank()
# Checks if the previous stage only has a vision encoder, and that the current stage
# has part of the LM decoder. In this case, the current stage should only receive
# vision embeddings.
if pp_rank > 0:
self._recv_only_vision_embeds = (
parallel_state.is_inside_encoder(pp_rank - 1)
and (not parallel_state.is_inside_decoder(pp_rank - 1))
and parallel_state.is_inside_decoder()
)
# Checks if the current stage only has a vision encoder
self._encoder_only = (
parallel_state.is_inside_encoder() and not parallel_state.is_inside_decoder()
)
# For TP only model both is_pp_first_stage and _is_pp_last_stage returns True
self.model_is_pipeline_parallel = not (
parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage()
)
def prep_inference_input(
self,
prompts_tokens: torch.Tensor,
num_img_embeddings_per_tile: int,
images: torch.Tensor,
num_tiles: torch.Tensor,
decoder_seq_length: int,
):
"""Prepares the inference input data.
Args:
prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_seq_len]
num_img_embeddings_per_tile (int): The number of image embeddings per tile
images (torch.Tensor): The image embeddings
num_tiles (torch.Tensor): The number of tiles for each input image
decoder_seq_length (int): The decoder sequence length
"""
inference_input = super().prep_inference_input(prompts_tokens)
total_num_tiles = torch.sum(num_tiles).item()
num_img_embeddings = num_img_embeddings_per_tile * total_num_tiles
batch_size, max_sequence_length = prompts_tokens.shape
self.inference_params = InferenceParams(
batch_size, max_sequence_length + num_img_embeddings
)
inference_input["images"] = images
inference_input["num_tiles"] = num_tiles
inference_input["num_img_embeddings"] = num_img_embeddings
inference_input["decoder_seq_length"] = decoder_seq_length
return inference_input
def get_batch_for_context_window(
self,
inference_input: Dict[str, Any],
context_start_position: int,
context_end_position: int,
) -> Dict[str, Any]:
"""Returns the inference data given context window
This function gets called iteratively in a loop. Given the start and end context positions, it extracts the appropriate data.
Args:
inference_input (Dict[str, Any]): The inference input for the batch.
context_start_position (int): Start of the context window. During the first inference step it is mostly 0
context_end_position (int): End of the context window. During the last inference step it will mostly be the max generated sequence length.
Returns:
Dict[str, Any]: A dict of inputs that will be used by your model in the forward step
"""
tokens = inference_input["tokens"]
position_ids = inference_input["position_ids"]
images = inference_input["images"]
num_tiles = inference_input["num_tiles"]
num_img_embeddings = inference_input["num_img_embeddings"]
decoder_seq_length = inference_input["decoder_seq_length"]
tokens2use = tokens[:, context_start_position:context_end_position]
positions2use = position_ids[:, context_start_position:context_end_position]
return {
"tokens": tokens2use,
"position_ids": positions2use,
"images": images,
"num_tiles": num_tiles,
"num_img_embeddings": num_img_embeddings,
"decoder_seq_length": decoder_seq_length,
}
def _forward(self, inference_input: Dict[str, Any]):
"""Runs a forward pass of the model.
Args:
inference_input(Dict[str, Any]): The input data.
Returns:
The model output logits.
"""
images = inference_input["images"]
tokens = inference_input["tokens"]
position_ids = inference_input["position_ids"]
num_image_tiles = inference_input["num_tiles"]
output = self.model(
images,
tokens,
position_ids=position_ids,
attention_mask=None,
inference_params=self.inference_params,
num_image_tiles=num_image_tiles,
runtime_gather_output=True,
)
if isinstance(output, tuple):
logits, _ = output
else:
logits = output
return logits
def run_one_forward_step(self, inference_input: Dict[str, Any]) -> torch.Tensor:
tokens = inference_input["tokens"]
num_image_tokens = (tokens == self.model.module.image_token_index).sum().item()
num_img_embeddings = inference_input["num_img_embeddings"]
decoder_seq_length = inference_input["decoder_seq_length"]
num_tokens = tokens.size(1)
recv_buffer_seq_len = None
if num_image_tokens > 0:
# When there are image tokens and this stage only receives vision embeddings,
# adjust the recv buffer seq length to match the image embeddings sequence length.
# If there are image tokens and this stage receives full embeddings, make sure we
# compensate for expansion of image tokens.
# Note that this will set a recv_buffer_seq_len for the encoder stage,
# this length is irrelevant since that recv buffer is never allocated.
if self._recv_only_vision_embeds:
recv_buffer_seq_len = num_img_embeddings
else:
recv_buffer_seq_len = min(
num_img_embeddings + num_tokens - num_image_tokens, decoder_seq_length
)
elif self._recv_only_vision_embeds:
# If this stage only receives vision embeddings and there are no image tokens
# we won't run the encoder and therefore shouldn't try to recv.
recv_buffer_seq_len = 0
# If the pipeline stage only has a vision encoder, then it only needs to
# run when there are image tokens
if not (self._encoder_only and num_image_tokens == 0):
output = super().run_one_forward_step(
inference_input, recv_buffer_seq_len=recv_buffer_seq_len
)
else:
output = None
logits = output
# On the first inference iteration, we compute image tokens.
# On every PP stage(although inference params should only matter for decoder),
# update the sequence length offset by the number of image tokens.
if num_tokens > 1 and num_image_tokens > 0:
if "image_tokens_count" not in self.inference_params.key_value_memory_dict:
self.inference_params.key_value_memory_dict["image_tokens_count"] = (
num_img_embeddings
)
if num_img_embeddings + num_tokens - num_image_tokens > decoder_seq_length:
self.inference_params.sequence_len_offset += decoder_seq_length - num_tokens
else:
self.inference_params.sequence_len_offset += (
self.inference_params.key_value_memory_dict["image_tokens_count"]
- num_image_tokens
)
return logits
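# --- Illustrative sketch (not part of this commit) -----------------------------
# The recv-buffer sizing logic above, pulled out as a plain (hypothetical) helper
# so the branches are easier to read; it mirrors run_one_forward_step but is not
# part of the wrapper itself.
from typing import Optional


def _vlm_recv_buffer_seq_len(
    num_image_tokens: int,
    num_img_embeddings: int,
    num_tokens: int,
    decoder_seq_length: int,
    recv_only_vision_embeds: bool,
) -> Optional[int]:
    if num_image_tokens > 0:
        if recv_only_vision_embeds:
            # This stage only receives vision embeddings.
            return num_img_embeddings
        # Full embeddings: compensate for the expansion of image tokens.
        return min(num_img_embeddings + num_tokens - num_image_tokens, decoder_seq_length)
    if recv_only_vision_embeds:
        # No image tokens, so the encoder is skipped and nothing is received.
        return 0
    return None  # default: buffer sized from the token sequence length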
File mode changed from 100755 to 100644
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from collections import deque
from typing import Any, List, Tuple
from typing import Any, Dict, List, Optional
import numpy
import torch
......@@ -14,6 +14,7 @@ from megatron.core.inference.model_inference_wrappers.inference_wrapper_config i
InferenceWrapperConfig,
)
from megatron.core.models.T5 import T5Model
from megatron.core.utils import get_attr_wrapped_model
# pylint: disable=line-too-long
......@@ -39,54 +40,57 @@ class T5InferenceWrapper(AbstractModelInferenceWrapper):
super().__init__(model, inference_wrapper_config)
self.use_local = use_local
def prep_model_for_inference(
self, prompts_tokens: torch.Tensor, encoder_prompts: List[str] = None, tokenizer: Any = None
):
"""A utility function for preparing model for inference
This function is called before the forward pass. It puts the model in eval mode, builds
position ids, and creates attention masks so that required slices can be extracted during
the forward pass.
def prep_inference_input(
self,
prompts_tokens: torch.Tensor,
encoder_prompts: Optional[List[str]] = None,
tokenizer: Any = None,
) -> Dict[str, Any]:
"""Prepares the inference input data.
Args:
prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length]
prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_seq_len]
encoder_prompts (List[str]): A list of encoder input prompt strings
tokenizer (Any): Tokenizer used for tokenizing and detokenizing text
"""
super().prep_model_for_inference(prompts_tokens=prompts_tokens)
Returns:
A dict with all the inference input needed for the batch.
"""
# get max_sequence_length
if hasattr(self.model, "module"): # if self.model is Float16Module
max_sequence_length = self.model.module.max_sequence_length
else:
max_sequence_length = self.model.max_sequence_length
max_sequence_length = get_attr_wrapped_model(self.model, "max_sequence_length")
encoder_prompts_tokens_list = [
self.tokenize_encoder_prompt(encoder_prompt, tokenizer)
for encoder_prompt in encoder_prompts
]
self.batch_encoder_prompts_tokens = self.pad_encoder_prompts_tokens(
batch_encoder_prompts_tokens = self.pad_encoder_prompts_tokens(
encoder_prompts_tokens_list, max_sequence_length, tokenizer
)
# create batch mask for encoder_prompt (self.batch_input_tokens) and
# decoder_input (self.prompts_tokens), similar to megatron/core/datasets/t5_dataset.py
decoder_prompts_tokens = self.prompts_tokens.cpu().numpy()
encoder_prompts_tokens = self.batch_encoder_prompts_tokens.cpu().numpy()
self.batch_mask_encoder = []
self.batch_mask_decoder = []
for i in range(len(self.prompts_tokens)):
mask_encoder = encoder_prompts_tokens[i] == tokenizer.pad
mask_decoder = decoder_prompts_tokens[i] == tokenizer.pad
self.batch_mask_encoder.append(mask_encoder)
self.batch_mask_decoder.append(mask_decoder)
self.batch_mask_encoder = torch.tensor(numpy.array(self.batch_mask_encoder)).cuda()
self.batch_mask_decoder = torch.tensor(numpy.array(self.batch_mask_decoder)).cuda()
def tokenize_encoder_prompt(
self, encoder_prompt: str, tokenizer
) -> Tuple[torch.Tensor, torch.Tensor]:
# decoder_input (prompts_tokens), similar to megatron/core/datasets/t5_dataset.py
decoder_prompts_tokens = prompts_tokens
encoder_prompts_tokens = batch_encoder_prompts_tokens
decoder_prompts_tokens_numpy = decoder_prompts_tokens.cpu().numpy()
encoder_prompts_tokens_numpy = encoder_prompts_tokens.cpu().numpy()
batch_mask_encoder = []
batch_mask_decoder = []
for i in range(len(prompts_tokens)):
mask_encoder = encoder_prompts_tokens_numpy[i] == tokenizer.pad
mask_decoder = decoder_prompts_tokens_numpy[i] == tokenizer.pad
batch_mask_encoder.append(mask_encoder)
batch_mask_decoder.append(mask_decoder)
batch_mask_encoder = torch.tensor(numpy.array(batch_mask_encoder)).cuda()
batch_mask_decoder = torch.tensor(numpy.array(batch_mask_decoder)).cuda()
return {
"encoder_tokens": encoder_prompts_tokens,
"decoder_tokens": decoder_prompts_tokens,
"encoder_mask": batch_mask_encoder,
"decoder_mask": batch_mask_decoder,
}
def tokenize_encoder_prompt(self, encoder_prompt: str, tokenizer) -> torch.Tensor:
"""Utility to tokenize the encoder_prompt
Args:
......@@ -138,28 +142,32 @@ class T5InferenceWrapper(AbstractModelInferenceWrapper):
return torch.tensor(encoder_prompts_tokens_list).cuda()
def get_batch_for_context_window(
self, context_start_position: int, context_end_position: int
) -> List:
self,
inference_input: Dict[str, Any],
context_start_position: int,
context_end_position: int,
) -> Dict[str, Any]:
"""Returns the inference data given context window
This function gets called iteratively in a loop. Given the start and end context
positions, it extracts the appropriate data.
Args:
inference_input (Dict[str, Any]): The inference input for the batch.
context_start_position (int): Start of the context window. During
the first inference step it is mostly 0
context_end_position (int): End of the context window. During the
last inference step it will mostly be the max generated sequence length.
Returns:
List: A list of inputs that will be used by your model in the forward step
Dict: A dict of inputs that will be used by your model in the forward step
"""
# T5 inference does not yet support kv_cache
encoder_tokens2use = self.batch_encoder_prompts_tokens
decoder_tokens2use = self.prompts_tokens[:, :context_end_position]
encoder_mask2use = self.batch_mask_encoder
decoder_mask2use = self.batch_mask_decoder[:, :context_end_position]
encoder_tokens2use = inference_input["encoder_tokens"]
decoder_tokens2use = inference_input["decoder_tokens"][:, :context_end_position]
encoder_mask2use = inference_input["encoder_mask"]
decoder_mask2use = inference_input["decoder_mask"][:, :context_end_position]
# Configure attention mask based on different conditions
# (e.g., transformer-impl, TE versions, TE backends)
......@@ -173,32 +181,34 @@ class T5InferenceWrapper(AbstractModelInferenceWrapper):
)
)
data_at_step_idx = [
encoder_tokens2use,
decoder_tokens2use,
encoder_mask2use,
decoder_mask2use,
encoder_decoder_mask2use,
]
return data_at_step_idx
return {
"encoder_tokens": encoder_tokens2use,
"decoder_tokens": decoder_tokens2use,
"encoder_mask": encoder_mask2use,
"decoder_mask": decoder_mask2use,
"encoder_decoder_mask": encoder_decoder_mask2use,
}
def forward_pass_without_pipeline_parallel(self, inference_input: List) -> torch.Tensor:
def forward_pass_without_pipeline_parallel(
self, inference_input: Dict[str, Any]
) -> torch.Tensor:
"""Utility to carry out simple forward pass for TP or no model parallel models
Runs a very simple forward pass for the model. Used in the case of models without
any parallelism or only tensor parallelism.
Args:
inference_input (List): A list containing the inputs for the gpt
inference_input (Dict[str, Any]): A dict containing the inputs for the gpt
model [tokens, position ids, attention mask]
Returns:
torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size]
"""
[encoder_tokens, decoder_tokens, encoder_mask, decoder_mask, encoder_decoder_mask] = (
inference_input
)
encoder_tokens = inference_input["encoder_tokens"]
decoder_tokens = inference_input["decoder_tokens"]
encoder_mask = inference_input["encoder_mask"]
decoder_mask = inference_input["decoder_mask"]
encoder_decoder_mask = inference_input["encoder_decoder_mask"]
tokens = decoder_tokens
# T5 inference does not yet support kv_cache
......
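# --- Illustrative sketch (not part of this commit) -----------------------------
# The pad-mask construction from prep_inference_input above, on small CPU tensors
# for clarity (the real code round-trips through numpy and moves the masks to
# CUDA). The pad id of 0 is assumed for illustration.
import torch

pad = 0  # assumed pad token id
encoder_tokens = torch.tensor([[5, 7, 9, 0, 0], [3, 0, 0, 0, 0]])
decoder_tokens = torch.tensor([[1, 4, 0, 0, 0], [1, 2, 6, 0, 0]])

# True marks padded positions, matching `tokens == tokenizer.pad` in the wrapper.
encoder_mask = encoder_tokens == pad
decoder_mask = decoder_tokens == pad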
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Integrations with NVIDIA TensorRT Model Optimizer (referred as ModelOpt).
ModelOpt is a library comprising state-of-the-art model optimization techniques including quantization and sparsity to
compress model for efficient inference on NVIDIA GPUs. ModelOpt is integrated with Megatron-core to provide a seamless
experience for users to optimize their Megatron-core models for inference. More details on ModelOpt including
installation and usage can be found at https://github.com/NVIDIA/TensorRT-Model-Optimizer.
ModelOpt is a library comprising state-of-the-art model optimization techniques
including quantization and sparsity to compress model for efficient inference on
NVIDIA GPUs. ModelOpt is integrated with Megatron-core to provide a seamless
experience for users to optimize their Megatron-core models for inference.
More details on ModelOpt including installation and usage can be found at
https://github.com/NVIDIA/TensorRT-Model-Optimizer.
"""
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import warnings
warnings.warn(
"`megatron.core.inference.modelopt_support` will be deprecated in a"
"future release. Use `megatron.core.post_training.modelopt` instead."
)
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from typing import Optional
from megatron.core.extensions.transformer_engine import TEDotProductAttention, TENorm
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.models.gpt.gpt_layer_specs import _get_mlp_module_spec
from megatron.core.models.gpt.gpt_layer_specs import get_mlp_module_spec
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.spec_utils import ModuleSpec
......@@ -13,7 +16,8 @@ from megatron.core.transformer.transformer_layer import TransformerLayer, Transf
# Use this spec for ModelOpt PTQ and TensorRT-LLM export
def get_gpt_layer_modelopt_spec(
num_experts: int = None,
num_experts: Optional[int] = None,
local_core_attention: bool = False,
moe_grouped_gemm: bool = False,
remap_te_layernorm: bool = False,
qk_layernorm: bool = False,
......@@ -24,7 +28,8 @@ def get_gpt_layer_modelopt_spec(
is using TENorm from Transformer-Engine. The issue is that FusedLayerNorm from apex
has stopped supporting RMSNorm needed by llama.
"""
mlp = _get_mlp_module_spec(
core_attention = DotProductAttention if local_core_attention else TEDotProductAttention
mlp = get_mlp_module_spec(
use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm, fp8=False
)
sharded_state_dict_keys_map = {}
......@@ -47,7 +52,7 @@ def get_gpt_layer_modelopt_spec(
params={"attn_mask_type": AttnMaskType.causal},
submodules=SelfAttentionSubmodules(
linear_qkv=ColumnParallelLinear,
core_attention=TEDotProductAttention,
core_attention=core_attention,
linear_proj=RowParallelLinear,
q_layernorm=TENorm if qk_layernorm else IdentityOp,
k_layernorm=TENorm if qk_layernorm else IdentityOp,
......
File mode changed from 100755 to 100644
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
from megatron.core.extensions.transformer_engine import TEDotProductAttention, TENorm
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.ssm.mamba_block import MambaStack, MambaStackSubmodules
from megatron.core.ssm.mamba_layer import MambaLayer, MambaLayerSubmodules
from megatron.core.ssm.mamba_mixer import MambaMixer, MambaMixerSubmodules
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
# Use this spec for ModelOpt PTQ and TensorRT-LLM export
def get_mamba_stack_modelopt_spec(
local_core_attention: bool = False, remap_te_layernorm: bool = False
) -> ModuleSpec:
"""Mix the native spec with TENorm.
This is essentially the native local spec, except that the layernorm implementation
uses TENorm from Transformer-Engine.
"""
mamba_state_dict_keys_map = {}
transformer_state_dict_keys_map = {}
if remap_te_layernorm:
mamba_state_dict_keys_map = {'norm.': 'mixer.in_proj.layer_norm_'}
transformer_state_dict_keys_map = {
'input_layernorm.': 'self_attention.linear_qkv.layer_norm_',
'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_',
}
mamba_layer = ModuleSpec(
module=MambaLayer,
submodules=MambaLayerSubmodules(
norm=TENorm,
mixer=ModuleSpec(
module=MambaMixer,
submodules=MambaMixerSubmodules(
in_proj=ColumnParallelLinear, out_proj=RowParallelLinear
),
),
mamba_bda=get_bias_dropout_add,
sharded_state_dict_keys_map=mamba_state_dict_keys_map,
),
)
core_attention = DotProductAttention if local_core_attention else TEDotProductAttention
attention_layer = ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
input_layernorm=TENorm,
self_attention=ModuleSpec(
module=SelfAttention,
params={"attn_mask_type": AttnMaskType.causal},
submodules=SelfAttentionSubmodules(
linear_qkv=ColumnParallelLinear,
core_attention=core_attention,
linear_proj=RowParallelLinear,
),
),
self_attn_bda=get_bias_dropout_add,
sharded_state_dict_keys_map=transformer_state_dict_keys_map,
),
)
mlp_layer = ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
pre_mlp_layernorm=TENorm,
mlp=ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear
),
),
mlp_bda=get_bias_dropout_add,
sharded_state_dict_keys_map=transformer_state_dict_keys_map,
),
)
return ModuleSpec(
module=MambaStack,
submodules=MambaStackSubmodules(
mamba_layer=mamba_layer, attention_layer=attention_layer, mlp_layer=mlp_layer
),
)
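# --- Illustrative usage sketch (not part of this commit) -----------------------
# Selecting the local DotProductAttention backend and remapping TE layernorm
# state-dict keys via the new flags; whether these flags suit a given checkpoint
# depends on how it was trained and exported.
spec = get_mamba_stack_modelopt_spec(
    local_core_attention=True,  # megatron-core DotProductAttention instead of TEDotProductAttention
    remap_te_layernorm=True,    # remap sharded state dict keys for fused TE layernorms
)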
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from dataclasses import dataclass
@dataclass
class SamplingParams:
"""Inference parameters sent along with the prompts.
This class contains request-level attributes that control the sampling techniques used when
generating text. This is distinct from megatron.core.InferenceParams, which sets model-level
inference attributes such as the maximum sequence length, and contains the KV cache.
For an explanation of these parameters refer to this blog
https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-
temperature-parameters-ed6a31313910
"""
temperature: float = 1.0
top_k: int = 0
top_p: float = 0.0
return_log_probs: bool = False
return_segments: bool = False # Whether to return individually detokenized tokens
num_tokens_to_generate: int = 30
def add_attributes(self, attribute_value_pair: dict):
"""Utility to add more attributes to sampling params
Use this method to pass in a custom dictionary to add more sampling parameter attributes.
c = SamplingParams()
c.add_attributes({'min_length':4, 'eod_id':153})
Args:
attribute_value_pair (dict): A dictionary containing attributes as the key names and
their values as the values.
"""
for key, value in attribute_value_pair.items():
setattr(self, key, value)
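# --- Illustrative usage sketch (not part of this commit) -----------------------
# Combining the dataclass fields with add_attributes, per the docstring above.
# `min_length` and `eod_id` are example keys only; SamplingParams itself does not
# interpret them.
params = SamplingParams(temperature=0.7, top_k=50, return_log_probs=True)
params.add_attributes({'min_length': 4, 'eod_id': 153})
assert params.top_k == 50 and params.eod_id == 153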