Unverified commit 7a64d24a, authored by Cyrus Leung, committed by GitHub
Browse files

[Core] Support image processor (#4197)

parent dfbe60dc
from .base import MultiModalData, MultiModalPlugin
from .registry import MULTIMODAL_REGISTRY, MultiModalRegistry
# Public API of the package: the two base classes from ``.base`` plus the
# registry class and its global instance from ``.registry``.
__all__ = [
"MultiModalData", "MultiModalPlugin", "MULTIMODAL_REGISTRY",
"MultiModalRegistry"
]
from abc import ABC, abstractmethod
from typing import (TYPE_CHECKING, Callable, Dict, Generic, Optional, Type,
TypeVar)
from vllm.config import ModelConfig, VisionLanguageConfig
from vllm.logger import init_logger
if TYPE_CHECKING:
import torch
from torch import nn
logger = init_logger(__name__)
class MultiModalData:
    """
    Common base class for every kind of multi-modal data.

    Steps to support a new modality:

    1. Create a new file under the ``multimodal`` directory.
    2. In that file, subclass both :class:`~MultiModalData` and
       :class:`~MultiModalPlugin`.
    3. Register the new plugin to
       :const:`vllm.multimodal.MULTIMODAL_REGISTRY`.

    Once registered, models can call
    :meth:`MultiModalRegistry.register_input` for the new modality.
    """
# Type variable for the concrete multi-modal data class handled by a plugin.
D = TypeVar("D", bound=MultiModalData)
# Type variable for a model class (``Type[nn.Module]``); lets the registration
# decorator declare that it returns the same class it was given.
N = TypeVar("N", bound=Type["nn.Module"])
# Signature of an input processor: (data, model config, VLM config) -> tensors
# keyed by keyword-argument name.
MultiModalInputProcessor = Callable[[D, ModelConfig, VisionLanguageConfig],
Dict[str, "torch.Tensor"]]
"""Return a dictionary to be passed as keyword arguments to
:meth:`torch.nn.Module.forward`. This is similar in concept to tokenizers
and processors in HuggingFace Transformers."""
class MultiModalPlugin(ABC, Generic[D]):
    """
    Base class holding the data processing logic of a single modality.

    Each plugin keeps a per-model-class registry of input processors, because
    different models may process the same data differently.
    :class:`~MultiModalRegistry` sits one level above and dispatches to the
    right plugin according to the modality of the data.
    """

    @classmethod
    def get_model_cls(cls, model_config: ModelConfig) -> Type["nn.Module"]:
        """Resolve the model class described by ``model_config``."""
        # Imported lazily to avoid a circular import
        from vllm.model_executor.model_loader import get_model_architecture

        architecture = get_model_architecture(model_config)
        return architecture[0]

    def __init__(self) -> None:
        # Maps a model class to the processor registered for it.
        self._input_processors: Dict[Type["nn.Module"],
                                     MultiModalInputProcessor[D]] = {}

    @abstractmethod
    def get_data_type(self) -> Type[D]:
        """
        Return the modality (a subclass of :class:`~MultiModalData`) that
        this plugin serves.
        """
        raise NotImplementedError

    @abstractmethod
    def _default_input_processor(
            self, data: D, model_config: ModelConfig,
            vlm_config: VisionLanguageConfig) -> Dict[str, "torch.Tensor"]:
        """
        Build the dictionary that is passed as keyword arguments to
        :meth:`torch.nn.Module.forward`.

        Conceptually similar to tokenizers and processors in HuggingFace
        Transformers. Used when a model registers without its own processor.
        """
        raise NotImplementedError

    def register_input_processor(self,
                                 processor: Optional[
                                     MultiModalInputProcessor[D]] = None):
        """
        Return a class decorator that registers an input processor for the
        decorated model class.

        Whenever the model receives input data of the modality served by this
        plugin (see :meth:`get_data_type`), the registered processor is used
        to preprocess it. Passing `None` selects the default input processor.
        A previous registration for the same model class is overwritten, with
        a warning.
        """

        def wrapper(model_cls: N) -> N:
            registered = self._input_processors
            if model_cls in registered:
                logger.warning(
                    "Model class %s already has an input processor "
                    "registered to %s. It is overwritten by the new one.",
                    model_cls, self)

            if processor:
                registered[model_cls] = processor
            else:
                registered[model_cls] = self._default_input_processor

            return model_cls

        return wrapper

    def process_input(
            self, data: D, model_config: ModelConfig,
            vlm_config: VisionLanguageConfig) -> Dict[str, "torch.Tensor"]:
        """
        Run the registered input processor on a :class:`~MultiModalData`
        instance passed to the model.

        ``model_config`` identifies the model. ``vlm_config`` exists for
        compatibility purposes and may be merged into ``model_config``
        in the near future.

        Raises:
            KeyError: If no input processor is registered for the model class.
        """
        model_cls = self.get_model_cls(model_config)
        processor = self._input_processors.get(model_cls)
        if processor is not None:
            return processor(data, model_config, vlm_config)

        raise KeyError(f"No input processor in {self} is registered for "
                       f"model class {model_cls.__name__}.")
from typing import Dict, Tuple, Type, Union
import torch
from PIL import Image
from vllm.config import ModelConfig, VisionLanguageConfig
from vllm.logger import init_logger
from vllm.sequence import SequenceData
from vllm.transformers_utils.image_processor import cached_get_image_processor
from .base import MultiModalData, MultiModalPlugin
logger = init_logger(__name__)
def _get_dummy_seq_data(seq_len: int,
                        vlm_config: VisionLanguageConfig) -> SequenceData:
    """Build dummy sequence data: image tokens followed by zero padding."""
    # NOTE: We assume that <image> token is repeated `image_feature_size` times
    # and then concatenated with the text prompt
    # TODO: Enable other ways of inserting the image into the prompt
    num_image_tokens = vlm_config.image_feature_size
    image_tokens = [vlm_config.image_token_id] * num_image_tokens
    # Fill the rest of the sequence with token id 0.
    text_padding = [0] * (seq_len - num_image_tokens)
    return SequenceData(image_tokens + text_padding)
def _get_dummy_values(vlm_config: VisionLanguageConfig) -> torch.Tensor:
    """Return an all-zero tensor of shape ``vlm_config.image_input_shape``.

    The dtype is ``float16`` when no image processor is configured and
    ``uint8`` otherwise.
    """
    has_image_processor = vlm_config.image_processor is not None
    values_dtype = torch.uint8 if has_image_processor else torch.float16
    return torch.zeros(vlm_config.image_input_shape, dtype=values_dtype)
def get_dummy_image_data(
    seq_len: int,
    model_config: ModelConfig,
    vlm_config: VisionLanguageConfig,
) -> Tuple[SequenceData, MultiModalData]:
    """Standard dummy data factory for image data (to be used in
    :meth:`vllm.multimodal.MultiModalRegistry.register_dummy_data`).

    Args:
        seq_len: Total length of the dummy token sequence.
        model_config: Not read by this factory; it is part of the signature
            so that the function matches the dummy-factory call shape
            ``(seq_len, model_config, vlm_config)``.
        vlm_config: Supplies the image token id, feature size, input shape
            and the image input type that selects the data wrapper.

    Returns:
        A tuple of the dummy sequence data and the dummy multi-modal data
        (:class:`ImagePixelData` or :class:`ImageFeatureData`).

    Raises:
        NotImplementedError: If ``vlm_config.image_input_type`` is neither
            ``PIXEL_VALUES`` nor ``IMAGE_FEATURES``.
    """
    seq_data = _get_dummy_seq_data(seq_len, vlm_config)
    values = _get_dummy_values(vlm_config)

    config_input_type = vlm_config.image_input_type
    ImageInputType = VisionLanguageConfig.ImageInputType

    fake_mm_data: MultiModalData
    if config_input_type == ImageInputType.PIXEL_VALUES:
        fake_mm_data = ImagePixelData(values)
    elif config_input_type == ImageInputType.IMAGE_FEATURES:
        fake_mm_data = ImageFeatureData(values)
    else:
        # Name the offending value so the failure is diagnosable.
        raise NotImplementedError(
            f"Unsupported image input type: {config_input_type}")

    return seq_data, fake_mm_data
class ImagePixelData(MultiModalData):
    """
    Pixel data of a single image, given in one of two forms:

    - :class:`PIL.Image.Image`: An image object. A HuggingFace processor must
      be available to the model in order to handle it.
    - :class:`torch.Tensor`: Raw pixel data that is handed to the model
      without additional pre-processing.
    """

    def __init__(self, image: Union[Image.Image, torch.Tensor]) -> None:
        is_pil_image = isinstance(image, Image.Image)
        if is_pil_image:
            # Load eagerly so that this class can be created inside the
            # Image context manager
            image.load()

        self.image = image
class ImagePixelPlugin(MultiModalPlugin[ImagePixelData]):
    """Plugin that turns :class:`ImagePixelData` into model keyword inputs."""

    def get_data_type(self) -> Type[ImagePixelData]:
        """Return the data class served by this plugin."""
        return ImagePixelData

    def _get_hf_image_processor(self, model_config: ModelConfig,
                                vlm_config: VisionLanguageConfig):
        """Return the cached HuggingFace image processor, or ``None`` when
        ``vlm_config`` is missing or names no image processor."""
        if vlm_config is None or vlm_config.image_processor is None:
            return None

        return cached_get_image_processor(
            vlm_config.image_processor,
            trust_remote_code=model_config.trust_remote_code,
            revision=vlm_config.image_processor_revision,
        )

    def _default_input_processor(
            self, data: ImagePixelData, model_config: ModelConfig,
            vlm_config: VisionLanguageConfig) -> Dict[str, torch.Tensor]:
        """Convert the wrapped image into keyword arguments for the model.

        A PIL image is run through the HuggingFace image processor and the
        result is cast to ``model_config.dtype``. A tensor is only cast to
        ``model_config.dtype`` and returned under the ``"pixel_values"`` key.

        Raises:
            RuntimeError: If the image is a PIL image but no HuggingFace
                image processor is configured.
            TypeError: If the image is neither a PIL image nor a tensor.
        """
        image = data.image
        image_processor = self._get_hf_image_processor(model_config,
                                                       vlm_config)

        if isinstance(image, Image.Image):
            if image_processor is None:
                # The trailing space keeps the two adjacent literals from
                # fusing into "availableto".
                raise RuntimeError("No HuggingFace processor is available "
                                   "to process the image object")
            try:
                return image_processor.preprocess(image, return_tensors="pt") \
                    .to(model_config.dtype).data
            except Exception:
                # Log which image failed, then let the original error
                # propagate unchanged.
                logger.error("Failed to process image (%s)", image)
                raise
        elif isinstance(image, torch.Tensor):
            pixel_values = image.to(model_config.dtype)

            return {"pixel_values": pixel_values}

        raise TypeError(f"Invalid image type: {type(image)}")
class ImageFeatureData(MultiModalData):
    """
    Feature vector of an image, which is passed directly to the model.

    It is expected to be the output of the vision tower.
    """

    def __init__(self, image_features: torch.Tensor) -> None:
        # Stored as given; no conversion happens at construction time.
        self.image_features = image_features
class ImageFeaturePlugin(MultiModalPlugin[ImageFeatureData]):
    """Plugin that turns :class:`ImageFeatureData` into model keyword
    inputs."""

    def get_data_type(self) -> Type[ImageFeatureData]:
        """Return the data class served by this plugin."""
        return ImageFeatureData

    def _default_input_processor(
            self, data: ImageFeatureData, model_config: ModelConfig,
            vlm_config: VisionLanguageConfig) -> Dict[str, torch.Tensor]:
        """Cast the features to ``model_config.dtype`` and return them under
        the ``"image_features"`` key."""
        target_dtype = model_config.dtype
        return {"image_features": data.image_features.to(target_dtype)}
import functools
from typing import (TYPE_CHECKING, Any, Callable, Dict, Optional, Sequence,
Tuple, Type, TypeVar)
from vllm.config import ModelConfig, VisionLanguageConfig
from vllm.logger import init_logger
from .base import MultiModalData, MultiModalPlugin
from .image import (ImageFeatureData, ImageFeaturePlugin, ImagePixelData,
ImagePixelPlugin)
if TYPE_CHECKING:
import torch
from torch import nn
from vllm.sequence import SequenceData
logger = init_logger(__name__)
# Type variable for the concrete multi-modal data class being registered.
D = TypeVar("D", bound=MultiModalData)
# Type variable for a model class (``Type[nn.Module]``); lets the registration
# decorators declare that they return the same class they were given.
N = TypeVar("N", bound=Type["nn.Module"])
# Signature of an input processor: (data, model config, VLM config) -> tensors
# keyed by keyword-argument name.
MultiModalInputProcessor = Callable[[D, ModelConfig, VisionLanguageConfig],
Dict[str, "torch.Tensor"]]
# Signature of a dummy data factory: (sequence length, model config,
# VLM config) -> (dummy sequence data, dummy multi-modal data).
MultiModalDummyFactory = Callable[[int, ModelConfig, VisionLanguageConfig],
Tuple["SequenceData", MultiModalData]]
class MultiModalRegistry:
    """
    Registry that model runners use to dispatch data processing, based on
    the modality of the data and on the target model.
    """

    DEFAULT_PLUGINS = (ImageFeaturePlugin(), ImagePixelPlugin())

    def __init__(self,
                 *,
                 plugins: Sequence[MultiModalPlugin[Any]] = DEFAULT_PLUGINS
                 ) -> None:
        # Index the plugins by the data type each one serves.
        plugins_by_data_type = {}
        for plugin in plugins:
            plugins_by_data_type[plugin.get_data_type()] = plugin
        self._plugins_by_data_type = plugins_by_data_type

        # Maps a model class to the factory that builds its profiling data.
        self._dummy_factories_by_model_type: Dict[
            Type["nn.Module"], MultiModalDummyFactory] = {}

    def register_plugin(self, plugin: MultiModalPlugin[Any]) -> None:
        """Add ``plugin``, replacing (with a warning) any plugin already
        registered for the same data type."""
        data_type = plugin.get_data_type()
        already_registered = data_type in self._plugins_by_data_type

        if already_registered:
            logger.warning(
                "A plugin is already registered for data type %s, "
                "and will be overwritten by the new plugin %s.", data_type,
                plugin)

        self._plugins_by_data_type[data_type] = plugin

    def _get_plugin_for_data_type(self, data_type: Type[MultiModalData]):
        """Find the plugin for ``data_type``, walking up its MRO so that
        subclasses resolve to the plugin of their nearest registered base."""
        for candidate in data_type.mro():
            plugin = self._plugins_by_data_type.get(candidate)
            if plugin is not None:
                return plugin

        raise NotImplementedError(
            f"Unknown multi-modal data type: {data_type}")

    def register_dummy_data(self, factory: MultiModalDummyFactory):
        """
        Return a class decorator that registers a dummy data factory for the
        decorated model class.

        The factory is invoked during memory profiling to create dummy data
        to be inputted into the model. The modality and shape of that data
        should be an upper bound of what the model would receive at inference
        time.
        """

        def wrapper(model_cls: N) -> N:
            factories = self._dummy_factories_by_model_type
            if model_cls in factories:
                logger.warning(
                    "Model class %s already has dummy data "
                    "registered to %s. It is overwritten by the new one.",
                    model_cls, self)

            factories[model_cls] = factory

            return model_cls

        return wrapper

    def dummy_data_for_profiling(self, seq_len: int, model_config: ModelConfig,
                                 vlm_config: VisionLanguageConfig):
        """Create dummy data for memory profiling.

        Raises:
            NotImplementedError: If the model class has no registered dummy
                data factory.
        """
        model_cls = MultiModalPlugin.get_model_cls(model_config)
        dummy_factory = self._dummy_factories_by_model_type.get(model_cls)
        if dummy_factory is not None:
            return dummy_factory(seq_len, model_config, vlm_config)

        raise NotImplementedError(
            f"No dummy data defined for model class: {model_cls}")

    def register_input(
            self,
            data_type: Type[D],
            processor: Optional[MultiModalInputProcessor[D]] = None):
        """
        Register an input processor for a specific modality to a model class.

        See :meth:`MultiModalPlugin.register_input_processor` for more details.
        """
        plugin = self._get_plugin_for_data_type(data_type)
        return plugin.register_input_processor(processor)

    def register_image_pixel_input(
            self,
            processor: Optional[
                MultiModalInputProcessor[ImagePixelData]] = None):
        """
        Register an input processor for image pixel data to a model class.

        See :meth:`MultiModalPlugin.register_input_processor` for more details.
        """
        return self.register_input(ImagePixelData, processor)

    def register_image_feature_input(
            self,
            processor: Optional[
                MultiModalInputProcessor[ImageFeatureData]] = None):
        """
        Register an input processor for image feature data to a model class.

        See :meth:`MultiModalPlugin.register_input_processor` for more details.
        """
        return self.register_input(ImageFeatureData, processor)

    def process_input(self, data: MultiModalData, model_config: ModelConfig,
                      vlm_config: VisionLanguageConfig):
        """
        Apply an input processor to a :class:`~MultiModalData` instance passed
        to the model.

        See :meth:`MultiModalPlugin.process_input` for more details.
        """
        plugin = self._get_plugin_for_data_type(type(data))
        return plugin.process_input(data, model_config, vlm_config)

    def create_input_processor(self, model_config: ModelConfig,
                               vlm_config: VisionLanguageConfig):
        """
        Create an input processor (see :meth:`process_input`) for a
        specific model.

        The returned callable has both configs pre-bound, so callers only
        supply the data.
        """
        bound_configs = {
            "model_config": model_config,
            "vlm_config": vlm_config,
        }
        return functools.partial(self.process_input, **bound_configs)
# Built with no arguments, so it starts with ``DEFAULT_PLUGINS``.
MULTIMODAL_REGISTRY = MultiModalRegistry()
"""The global :class:`~MultiModalRegistry` which is used by model runners."""
...@@ -5,6 +5,8 @@ from abc import ABC, abstractmethod ...@@ -5,6 +5,8 @@ from abc import ABC, abstractmethod
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
import torch
from vllm.block import LogicalTokenBlock from vllm.block import LogicalTokenBlock
from vllm.inputs import LLMInputs from vllm.inputs import LLMInputs
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
...@@ -12,8 +14,7 @@ from vllm.pooling_params import PoolingParams ...@@ -12,8 +14,7 @@ from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
if TYPE_CHECKING: if TYPE_CHECKING:
import torch from vllm.multimodal import MultiModalData
from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics
...@@ -398,25 +399,6 @@ class SequenceGroupState: ...@@ -398,25 +399,6 @@ class SequenceGroupState:
generator: Optional = None # type: ignore generator: Optional = None # type: ignore
class MultiModalData:
"""Multi modal request.
Args:
type: The data type.
data: The actual data.
The required shape and semantic meaning of it depends on the vision
language config of the hosted model.
See `VisionLanguageConfig` in `config.py`.
"""
class Type(enum.Enum):
IMAGE = enum.auto()
def __init__(self, type: Type, data: "torch.Tensor"):
self.type = type
self.data = data
class SequenceGroup: class SequenceGroup:
"""A group of sequences that are generated from the same prompt. """A group of sequences that are generated from the same prompt.
...@@ -473,7 +455,7 @@ class SequenceGroup: ...@@ -473,7 +455,7 @@ class SequenceGroup:
return next(iter(self.seqs_dict.values())).prompt_token_ids return next(iter(self.seqs_dict.values())).prompt_token_ids
@property @property
def multi_modal_data(self) -> Optional[MultiModalData]: def multi_modal_data(self) -> Optional["MultiModalData"]:
# All sequences in the group should have the same multi-modal data. # All sequences in the group should have the same multi-modal data.
# We use the multi-modal data of an arbitrary sequence. # We use the multi-modal data of an arbitrary sequence.
return next(iter(self.seqs_dict.values())).multi_modal_data return next(iter(self.seqs_dict.values())).multi_modal_data
...@@ -655,7 +637,7 @@ class SequenceGroupMetadata: ...@@ -655,7 +637,7 @@ class SequenceGroupMetadata:
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
computed_block_nums: Optional[List[int]] = None, computed_block_nums: Optional[List[int]] = None,
state: Optional[SequenceGroupState] = None, state: Optional[SequenceGroupState] = None,
multi_modal_data: Optional[MultiModalData] = None, multi_modal_data: Optional["MultiModalData"] = None,
encoder_seq_data: Optional[SequenceData] = None, encoder_seq_data: Optional[SequenceData] = None,
cross_block_table: Optional[List[int]] = None, cross_block_table: Optional[List[int]] = None,
) -> None: ) -> None:
...@@ -798,13 +780,13 @@ class SamplerOutput: ...@@ -798,13 +780,13 @@ class SamplerOutput:
outputs: List[CompletionSequenceGroupOutput] outputs: List[CompletionSequenceGroupOutput]
# On-device tensor containing probabilities of each token. # On-device tensor containing probabilities of each token.
sampled_token_probs: Optional["torch.Tensor"] = None sampled_token_probs: Optional[torch.Tensor] = None
# On-device tensor containing the logprobs of each token. # On-device tensor containing the logprobs of each token.
logprobs: Optional["torch.Tensor"] = None logprobs: Optional["torch.Tensor"] = None
# On-device tensor containing the sampled token ids. # On-device tensor containing the sampled token ids.
sampled_token_ids: Optional["torch.Tensor"] = None sampled_token_ids: Optional[torch.Tensor] = None
# Spec decode metrics populated by workers. # Spec decode metrics populated by workers.
spec_decode_worker_metrics: Optional["SpecDecodeWorkerMetrics"] = None spec_decode_worker_metrics: Optional["SpecDecodeWorkerMetrics"] = None
......
from functools import lru_cache
from typing import Optional
from transformers import AutoImageProcessor
from transformers.image_processing_utils import BaseImageProcessor
from vllm.logger import init_logger
logger = init_logger(__name__)
def get_image_processor(
    processor_name: str,
    *args,
    trust_remote_code: bool = False,
    revision: Optional[str] = None,
    **kwargs,
) -> BaseImageProcessor:
    """Load the image processor for the given model name via HuggingFace.

    Extra positional and keyword arguments are forwarded to
    ``AutoImageProcessor.from_pretrained``.

    Raises:
        RuntimeError: If loading raises ``ValueError`` while
            ``trust_remote_code`` is off; the message suggests enabling it.
        ValueError: Re-raised unchanged when ``trust_remote_code`` is on.
    """
    try:
        return AutoImageProcessor.from_pretrained(
            processor_name,
            *args,
            trust_remote_code=trust_remote_code,
            revision=revision,
            **kwargs)
    except ValueError as e:
        # Remote code was already allowed, so the hint below would not help.
        if trust_remote_code:
            raise e

        # If the error pertains to the processor class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        # Unlike AutoTokenizer, AutoImageProcessor does not separate such errors
        err_msg = (
            "Failed to load the image processor. If the image processor is "
            "a custom processor not yet available in the HuggingFace "
            "transformers library, consider setting "
            "`trust_remote_code=True` in LLM or using the "
            "`--trust-remote-code` flag in the CLI.")
        raise RuntimeError(err_msg) from e
# Memoized variant of ``get_image_processor``: repeated calls with the same
# arguments reuse the cached processor (``lru_cache`` with default settings).
cached_get_image_processor = lru_cache(get_image_processor)
from typing import List, Optional, Tuple from collections import defaultdict
from typing import Dict, List, Optional, Tuple
import torch import torch
from torch import nn from torch import nn
...@@ -11,6 +12,7 @@ from vllm.distributed import broadcast_tensor_dict ...@@ -11,6 +12,7 @@ from vllm.distributed import broadcast_tensor_dict
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor import SamplingMetadata from vllm.model_executor import SamplingMetadata
from vllm.model_executor.model_loader import get_model from vllm.model_executor.model_loader import get_model
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.sequence import SamplerOutput, SequenceGroupMetadata
from vllm.utils import make_tensor_with_pad from vllm.utils import make_tensor_with_pad
...@@ -63,6 +65,16 @@ class CPUModelRunner: ...@@ -63,6 +65,16 @@ class CPUModelRunner:
self.block_size, self.block_size,
) )
# Create processor for multi-modal data
if self.vision_language_config is not None:
self.multi_modal_input_processor = MULTIMODAL_REGISTRY \
.create_input_processor(
self.model_config,
self.vision_language_config,
)
else:
self.multi_modal_input_processor = None
# Lazy initialization. # Lazy initialization.
self.model: nn.Module # Set after init_Model self.model: nn.Module # Set after init_Model
...@@ -80,14 +92,15 @@ class CPUModelRunner: ...@@ -80,14 +92,15 @@ class CPUModelRunner:
def _prepare_prompt( def _prepare_prompt(
self, self,
seq_group_metadata_list: List[SequenceGroupMetadata], seq_group_metadata_list: List[SequenceGroupMetadata],
) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int], ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int], Dict[
Optional[torch.Tensor]]: str, torch.Tensor]]:
assert len(seq_group_metadata_list) > 0 assert len(seq_group_metadata_list) > 0
input_tokens: List[int] = [] input_tokens: List[int] = []
input_positions: List[int] = [] input_positions: List[int] = []
slot_mapping: List[int] = [] slot_mapping: List[int] = []
seq_lens: List[int] = [] seq_lens: List[int] = []
multi_modal_input_list: List[torch.Tensor] = [] multi_modal_kwargs_list: Dict[str,
List[torch.Tensor]] = defaultdict(list)
for seq_group_metadata in seq_group_metadata_list: for seq_group_metadata in seq_group_metadata_list:
assert seq_group_metadata.is_prompt assert seq_group_metadata.is_prompt
...@@ -108,9 +121,17 @@ class CPUModelRunner: ...@@ -108,9 +121,17 @@ class CPUModelRunner:
# is always the first token in the sequence. # is always the first token in the sequence.
input_positions.extend(list(range(computed_len, seq_len))) input_positions.extend(list(range(computed_len, seq_len)))
if seq_group_metadata.multi_modal_data: mm_data = seq_group_metadata.multi_modal_data
multi_modal_input_list.append( if mm_data is not None:
seq_group_metadata.multi_modal_data.data) # Process multi-modal data
if self.multi_modal_input_processor is None:
raise ValueError(
"Multi-modal inputs are only supported by "
"vision language models.")
mm_kwargs = self.multi_modal_input_processor(mm_data)
for k, v in mm_kwargs.items():
multi_modal_kwargs_list[k].append(v)
# Compute the slot mapping. # Compute the slot mapping.
block_table = seq_group_metadata.block_tables[seq_id] block_table = seq_group_metadata.block_tables[seq_id]
...@@ -134,14 +155,10 @@ class CPUModelRunner: ...@@ -134,14 +155,10 @@ class CPUModelRunner:
slot = block_number * self.block_size + block_offset slot = block_number * self.block_size + block_offset
slot_mapping.append(slot) slot_mapping.append(slot)
if multi_modal_input_list: multi_modal_kwargs = {
assert self.vision_language_config, ( k: torch.cat(v, dim=0).to(self.device)
"Multi-modal inputs are only supported by " for k, v in multi_modal_kwargs_list.items()
"vision language models.") }
multi_modal_input = torch.cat(multi_modal_input_list,
dim=0).to(self.device)
else:
multi_modal_input = None
num_prompt_tokens = len(input_tokens) num_prompt_tokens = len(input_tokens)
...@@ -167,7 +184,7 @@ class CPUModelRunner: ...@@ -167,7 +184,7 @@ class CPUModelRunner:
slot_mapping=slot_mapping, slot_mapping=slot_mapping,
) )
return (input_tokens, input_positions, attn_metadata, seq_lens, return (input_tokens, input_positions, attn_metadata, seq_lens,
multi_modal_input) multi_modal_kwargs)
def _prepare_decode( def _prepare_decode(
self, self,
...@@ -257,8 +274,8 @@ class CPUModelRunner: ...@@ -257,8 +274,8 @@ class CPUModelRunner:
self, self,
seq_group_metadata_list: List[SequenceGroupMetadata], seq_group_metadata_list: List[SequenceGroupMetadata],
) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata,
Optional[torch.Tensor]]: Optional[Dict[str, torch.Tensor]]]:
multi_modal_input = None multi_modal_kwargs = None
if self.is_driver_worker: if self.is_driver_worker:
# NOTE: We assume that all sequences in the group are all prompts or # NOTE: We assume that all sequences in the group are all prompts or
# all decodes. # all decodes.
...@@ -266,7 +283,7 @@ class CPUModelRunner: ...@@ -266,7 +283,7 @@ class CPUModelRunner:
# Prepare input tensors. # Prepare input tensors.
if is_prompt: if is_prompt:
(input_tokens, input_positions, attn_metadata, seq_lens, (input_tokens, input_positions, attn_metadata, seq_lens,
multi_modal_input multi_modal_kwargs
) = self._prepare_prompt(seq_group_metadata_list) ) = self._prepare_prompt(seq_group_metadata_list)
else: else:
(input_tokens, input_positions, (input_tokens, input_positions,
...@@ -307,7 +324,7 @@ class CPUModelRunner: ...@@ -307,7 +324,7 @@ class CPUModelRunner:
) )
return (input_tokens, input_positions, attn_metadata, return (input_tokens, input_positions, attn_metadata,
sampling_metadata, multi_modal_input) sampling_metadata, multi_modal_kwargs)
@torch.inference_mode() @torch.inference_mode()
def execute_model( def execute_model(
......
...@@ -90,7 +90,7 @@ class EmbeddingModelRunner(ModelRunner): ...@@ -90,7 +90,7 @@ class EmbeddingModelRunner(ModelRunner):
self, self,
seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, PoolingMetadata, ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, PoolingMetadata,
Set[LoRARequest], LoRAMapping, torch.Tensor]: Set[LoRARequest], LoRAMapping, Dict[str, torch.Tensor]]:
if self.is_driver_worker: if self.is_driver_worker:
assert seq_group_metadata_list is not None assert seq_group_metadata_list is not None
# Prepare input tensors. # Prepare input tensors.
...@@ -102,7 +102,7 @@ class EmbeddingModelRunner(ModelRunner): ...@@ -102,7 +102,7 @@ class EmbeddingModelRunner(ModelRunner):
_, _,
lora_mapping, lora_mapping,
lora_requests, lora_requests,
multi_modal_input, multi_modal_kwargs,
slot_mapping, slot_mapping,
num_prefill_tokens, num_prefill_tokens,
num_decode_tokens, num_decode_tokens,
...@@ -117,7 +117,7 @@ class EmbeddingModelRunner(ModelRunner): ...@@ -117,7 +117,7 @@ class EmbeddingModelRunner(ModelRunner):
"input_positions": input_positions, "input_positions": input_positions,
"lora_requests": lora_requests, "lora_requests": lora_requests,
"lora_mapping": lora_mapping, "lora_mapping": lora_mapping,
"multi_modal_input": multi_modal_input, "multi_modal_kwargs": multi_modal_kwargs,
"num_prefill_tokens": num_prefill_tokens, "num_prefill_tokens": num_prefill_tokens,
"num_decode_tokens": num_decode_tokens, "num_decode_tokens": num_decode_tokens,
"slot_mapping": slot_mapping, "slot_mapping": slot_mapping,
...@@ -132,7 +132,7 @@ class EmbeddingModelRunner(ModelRunner): ...@@ -132,7 +132,7 @@ class EmbeddingModelRunner(ModelRunner):
input_positions = metadata_dict.pop("input_positions") input_positions = metadata_dict.pop("input_positions")
lora_mapping = metadata_dict.pop("lora_mapping") lora_mapping = metadata_dict.pop("lora_mapping")
lora_requests = metadata_dict.pop("lora_requests") lora_requests = metadata_dict.pop("lora_requests")
multi_modal_input = metadata_dict.pop("multi_modal_input") multi_modal_kwargs = metadata_dict.pop("multi_modal_kwargs")
if metadata_dict: if metadata_dict:
attn_metadata = self.attn_backend.make_metadata( attn_metadata = self.attn_backend.make_metadata(
**metadata_dict) **metadata_dict)
...@@ -143,7 +143,7 @@ class EmbeddingModelRunner(ModelRunner): ...@@ -143,7 +143,7 @@ class EmbeddingModelRunner(ModelRunner):
prompt_lens=None) prompt_lens=None)
return (input_tokens, input_positions, attn_metadata, pooling_metadata, return (input_tokens, input_positions, attn_metadata, pooling_metadata,
lora_requests, lora_mapping, multi_modal_input) lora_requests, lora_mapping, multi_modal_kwargs)
def _prepare_pooling( def _prepare_pooling(
self, self,
......
import time import time
import warnings import warnings
from collections import defaultdict
from typing import Dict, List, NamedTuple, Optional, Set, Tuple, Union from typing import Dict, List, NamedTuple, Optional, Set, Tuple, Union
import numpy as np import numpy as np
...@@ -18,9 +19,9 @@ from vllm.lora.request import LoRARequest ...@@ -18,9 +19,9 @@ from vllm.lora.request import LoRARequest
from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
from vllm.model_executor import SamplingMetadata from vllm.model_executor import SamplingMetadata
from vllm.model_executor.model_loader import get_model from vllm.model_executor.model_loader import get_model
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.sequence import (MultiModalData, SamplerOutput, SequenceData, from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
SequenceGroupMetadata)
from vllm.utils import (CudaMemoryProfiler, get_kv_cache_torch_dtype, is_hip, from vllm.utils import (CudaMemoryProfiler, get_kv_cache_torch_dtype, is_hip,
is_pin_memory_available, make_tensor_with_pad) is_pin_memory_available, make_tensor_with_pad)
...@@ -44,7 +45,7 @@ class ModelInput(NamedTuple): ...@@ -44,7 +45,7 @@ class ModelInput(NamedTuple):
query_lens: List[int] query_lens: List[int]
lora_mapping: Optional[LoRAMapping] lora_mapping: Optional[LoRAMapping]
lora_requests: Set[LoRARequest] lora_requests: Set[LoRARequest]
multi_modal_input: Optional[torch.Tensor] multi_modal_kwargs: Dict[str, torch.Tensor]
slot_mapping: torch.Tensor slot_mapping: torch.Tensor
num_prefill_tokens: int num_prefill_tokens: int
num_decode_tokens: int num_decode_tokens: int
...@@ -60,7 +61,7 @@ class ModelInput(NamedTuple): ...@@ -60,7 +61,7 @@ class ModelInput(NamedTuple):
query_lens=[], query_lens=[],
lora_mapping=None, lora_mapping=None,
lora_requests=set(), lora_requests=set(),
multi_modal_input=None, multi_modal_kwargs={},
slot_mapping=torch.empty(0, device=device), slot_mapping=torch.empty(0, device=device),
num_prefill_tokens=0, num_prefill_tokens=0,
num_decode_tokens=0, num_decode_tokens=0,
...@@ -122,6 +123,16 @@ class ModelRunner: ...@@ -122,6 +123,16 @@ class ModelRunner:
self.block_size, self.block_size,
) )
# Create processor for multi-modal data
if self.vision_language_config is not None:
self.multi_modal_input_processor = MULTIMODAL_REGISTRY \
.create_input_processor(
self.model_config,
self.vision_language_config,
)
else:
self.multi_modal_input_processor = None
# Lazy initialization # Lazy initialization
self.model: nn.Module # Set after load_model self.model: nn.Module # Set after load_model
# Set if the backend is flashinfer. # Set if the backend is flashinfer.
...@@ -242,7 +253,8 @@ class ModelRunner: ...@@ -242,7 +253,8 @@ class ModelRunner:
context_lens: List[int] = [] context_lens: List[int] = []
query_lens: List[int] = [] query_lens: List[int] = []
block_tables: List[List[int]] = [] block_tables: List[List[int]] = []
multi_modal_input_list: List[torch.Tensor] = [] multi_modal_kwargs_list: Dict[str,
List[torch.Tensor]] = defaultdict(list)
decode_only = True decode_only = True
num_prefills = 0 num_prefills = 0
num_prefill_tokens = 0 num_prefill_tokens = 0
...@@ -417,9 +429,17 @@ class ModelRunner: ...@@ -417,9 +429,17 @@ class ModelRunner:
and seq_group_metadata.sampling_params.prompt_logprobs and seq_group_metadata.sampling_params.prompt_logprobs
else 1)) else 1))
if seq_group_metadata.multi_modal_data: mm_data = seq_group_metadata.multi_modal_data
multi_modal_input_list.append( if mm_data is not None:
seq_group_metadata.multi_modal_data.data) # Process multi-modal data
if self.multi_modal_input_processor is None:
raise ValueError(
"Multi-modal inputs are only supported by "
"vision language models.")
mm_kwargs = self.multi_modal_input_processor(mm_data)
for k, v in mm_kwargs.items():
multi_modal_kwargs_list[k].append(v)
if _is_block_tables_empty(seq_group_metadata.block_tables): if _is_block_tables_empty(seq_group_metadata.block_tables):
# During memory profiling, the block tables are not # During memory profiling, the block tables are not
...@@ -508,16 +528,6 @@ class ModelRunner: ...@@ -508,16 +528,6 @@ class ModelRunner:
context_lens_tensor = torch.tensor(context_lens, context_lens_tensor = torch.tensor(context_lens,
dtype=torch.int, dtype=torch.int,
device=self.device) device=self.device)
if multi_modal_input_list:
assert self.vision_language_config, (
"Multi-modal inputs are only supported by "
"vision language models.")
multi_modal_input = torch.cat(multi_modal_input_list,
dim=0).to(self.device)
else:
multi_modal_input = None
query_lens_tensor = torch.tensor(query_lens, query_lens_tensor = torch.tensor(query_lens,
dtype=torch.long, dtype=torch.long,
device=self.device) device=self.device)
...@@ -614,6 +624,11 @@ class ModelRunner: ...@@ -614,6 +624,11 @@ class ModelRunner:
else: else:
lora_mapping = None lora_mapping = None
multi_modal_kwargs = {
k: torch.cat(v, dim=0).to(self.device)
for k, v in multi_modal_kwargs_list.items()
}
return ModelInput( return ModelInput(
input_tokens=input_tokens_tensor, input_tokens=input_tokens_tensor,
input_positions=input_positions_tensor, input_positions=input_positions_tensor,
...@@ -622,7 +637,7 @@ class ModelRunner: ...@@ -622,7 +637,7 @@ class ModelRunner:
query_lens=query_lens, query_lens=query_lens,
lora_mapping=lora_mapping, lora_mapping=lora_mapping,
lora_requests=lora_requests, lora_requests=lora_requests,
multi_modal_input=multi_modal_input, multi_modal_kwargs=multi_modal_kwargs,
slot_mapping=slot_mapping_tensor, slot_mapping=slot_mapping_tensor,
num_prefill_tokens=num_prefill_tokens, num_prefill_tokens=num_prefill_tokens,
num_decode_tokens=num_decode_tokens, num_decode_tokens=num_decode_tokens,
...@@ -633,7 +648,7 @@ class ModelRunner: ...@@ -633,7 +648,7 @@ class ModelRunner:
self, self,
seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata,
Set[LoRARequest], LoRAMapping, torch.Tensor]: Set[LoRARequest], LoRAMapping, Dict[str, torch.Tensor]]:
if self.is_driver_worker: if self.is_driver_worker:
assert seq_group_metadata_list is not None assert seq_group_metadata_list is not None
# Prepare input tensors. # Prepare input tensors.
...@@ -645,7 +660,7 @@ class ModelRunner: ...@@ -645,7 +660,7 @@ class ModelRunner:
query_lens, query_lens,
lora_mapping, lora_mapping,
lora_requests, lora_requests,
multi_modal_input, multi_modal_kwargs,
slot_mapping, slot_mapping,
num_prefill_tokens, num_prefill_tokens,
num_decode_tokens, num_decode_tokens,
...@@ -662,7 +677,7 @@ class ModelRunner: ...@@ -662,7 +677,7 @@ class ModelRunner:
sampling_metadata.selected_token_indices, sampling_metadata.selected_token_indices,
"lora_requests": lora_requests, "lora_requests": lora_requests,
"lora_mapping": lora_mapping, "lora_mapping": lora_mapping,
"multi_modal_input": multi_modal_input, "multi_modal_kwargs": multi_modal_kwargs,
"num_prefill_tokens": num_prefill_tokens, "num_prefill_tokens": num_prefill_tokens,
"num_decode_tokens": num_decode_tokens, "num_decode_tokens": num_decode_tokens,
"slot_mapping": slot_mapping, "slot_mapping": slot_mapping,
...@@ -679,7 +694,7 @@ class ModelRunner: ...@@ -679,7 +694,7 @@ class ModelRunner:
"selected_token_indices") "selected_token_indices")
lora_mapping = metadata_dict.pop("lora_mapping") lora_mapping = metadata_dict.pop("lora_mapping")
lora_requests = metadata_dict.pop("lora_requests") lora_requests = metadata_dict.pop("lora_requests")
multi_modal_input = metadata_dict.pop("multi_modal_input") multi_modal_kwargs = metadata_dict.pop("multi_modal_kwargs")
if metadata_dict: if metadata_dict:
attn_metadata = self.attn_backend.make_metadata( attn_metadata = self.attn_backend.make_metadata(
**metadata_dict) **metadata_dict)
...@@ -694,7 +709,7 @@ class ModelRunner: ...@@ -694,7 +709,7 @@ class ModelRunner:
return (input_tokens, input_positions, attn_metadata, return (input_tokens, input_positions, attn_metadata,
sampling_metadata, lora_requests, lora_mapping, sampling_metadata, lora_requests, lora_mapping,
multi_modal_input) multi_modal_kwargs)
@torch.inference_mode() @torch.inference_mode()
def execute_model( def execute_model(
...@@ -703,7 +718,7 @@ class ModelRunner: ...@@ -703,7 +718,7 @@ class ModelRunner:
kv_caches: List[torch.Tensor], kv_caches: List[torch.Tensor],
) -> Optional[SamplerOutput]: ) -> Optional[SamplerOutput]:
(input_tokens, input_positions, attn_metadata, sampling_metadata, (input_tokens, input_positions, attn_metadata, sampling_metadata,
lora_requests, lora_mapping, multi_modal_input lora_requests, lora_mapping, multi_modal_kwargs
) = self.prepare_input_tensors(seq_group_metadata_list) ) = self.prepare_input_tensors(seq_group_metadata_list)
if self.lora_config: if self.lora_config:
...@@ -717,15 +732,14 @@ class ModelRunner: ...@@ -717,15 +732,14 @@ class ModelRunner:
model_executable = self.graph_runners[graph_batch_size] model_executable = self.graph_runners[graph_batch_size]
else: else:
model_executable = self.model model_executable = self.model
execute_model_kwargs = {
"input_ids": input_tokens, hidden_states = model_executable(
"positions": input_positions, input_ids=input_tokens,
"kv_caches": kv_caches, positions=input_positions,
"attn_metadata": attn_metadata, kv_caches=kv_caches,
} attn_metadata=attn_metadata,
if self.vision_language_config: **multi_modal_kwargs,
execute_model_kwargs.update({"image_input": multi_modal_input}) )
hidden_states = model_executable(**execute_model_kwargs)
# Compute the logits. # Compute the logits.
logits = self.model.compute_logits(hidden_states, sampling_metadata) logits = self.model.compute_logits(hidden_states, sampling_metadata)
...@@ -781,16 +795,24 @@ class ModelRunner: ...@@ -781,16 +795,24 @@ class ModelRunner:
# To exercise the worst scenario for GPU memory consumption, # To exercise the worst scenario for GPU memory consumption,
# the number of seqs (batch_size) is chosen to maximize the number # the number of seqs (batch_size) is chosen to maximize the number
# of images processed. # of images processed.
if self.vision_language_config: model_config = self.model_config
vlm_config = self.vision_language_config
if vlm_config:
max_num_seqs = min( max_num_seqs = min(
max_num_seqs, max_num_seqs,
int(max_num_batched_tokens / int(max_num_batched_tokens / vlm_config.image_feature_size))
self.vision_language_config.image_feature_size))
for group_id in range(max_num_seqs): for group_id in range(max_num_seqs):
seq_len = (max_num_batched_tokens // max_num_seqs + seq_len = (max_num_batched_tokens // max_num_seqs +
(group_id < max_num_batched_tokens % max_num_seqs)) (group_id < max_num_batched_tokens % max_num_seqs))
seq_data, fake_multi_modal_input = _prepare_fake_inputs(
seq_len, self.vision_language_config) if vlm_config is None:
seq_data = SequenceData([0] * seq_len)
dummy_multi_modal_data = None
else:
seq_data, dummy_multi_modal_data = MULTIMODAL_REGISTRY \
.dummy_data_for_profiling(seq_len, model_config, vlm_config)
seq = SequenceGroupMetadata( seq = SequenceGroupMetadata(
request_id=str(group_id), request_id=str(group_id),
is_prompt=True, is_prompt=True,
...@@ -799,7 +821,7 @@ class ModelRunner: ...@@ -799,7 +821,7 @@ class ModelRunner:
block_tables=None, block_tables=None,
lora_request=dummy_lora_requests_per_seq[group_id] lora_request=dummy_lora_requests_per_seq[group_id]
if dummy_lora_requests_per_seq else None, if dummy_lora_requests_per_seq else None,
multi_modal_data=fake_multi_modal_input, multi_modal_data=dummy_multi_modal_data,
) )
seqs.append(seq) seqs.append(seq)
...@@ -1034,24 +1056,6 @@ def _get_graph_batch_size(batch_size: int) -> int: ...@@ -1034,24 +1056,6 @@ def _get_graph_batch_size(batch_size: int) -> int:
_BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT) _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT)
def _prepare_fake_inputs(
        seq_len: int, vision_language_config: Optional[VisionLanguageConfig]):
    """Build placeholder sequence data (and image input) for a profile run.

    Without a vision-language config the prompt is ``seq_len`` zero tokens
    and no multi-modal input is produced. With one, the prompt starts with
    ``image_feature_size`` copies of ``image_token_id``, is padded with zero
    tokens up to ``seq_len``, and is paired with an all-zero float16 tensor
    of shape ``image_input_shape``.

    Returns:
        A ``(SequenceData, multi-modal input or None)`` tuple.
    """
    vlm_cfg = vision_language_config

    # Text-only model: plain zero-token prompt, nothing multi-modal.
    if not vlm_cfg:
        return SequenceData([0] * seq_len), None

    num_image_tokens = vlm_cfg.image_feature_size
    image_part = [vlm_cfg.image_token_id] * num_image_tokens
    # A non-positive repeat count yields an empty list, so no padding is
    # added when seq_len does not exceed the image feature size.
    padding_part = [0] * (seq_len - num_image_tokens)
    token_ids = image_part + padding_part

    zero_pixels = torch.zeros(vlm_cfg.image_input_shape, dtype=torch.float16)
    placeholder_image = MultiModalData(type=MultiModalData.Type.IMAGE,
                                       data=zero_pixels)
    return SequenceData(token_ids), placeholder_image
def _is_block_tables_empty(block_tables: Union[None, Dict]): def _is_block_tables_empty(block_tables: Union[None, Dict]):
""" """
Check if block_tables is None or a dictionary with all None values. Check if block_tables is None or a dictionary with all None values.
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment