Commit 96ae75ad authored by zhuwenwen
Browse files

Merge tag 'v0.6.6.post1' into v0.6.6.post1-dev

parents f9f4a735 2339d59f
...@@ -19,45 +19,43 @@ ...@@ -19,45 +19,43 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Inference-only Qwen2-Audio model compatible with HuggingFace weights.""" """Inference-only Qwen2-Audio model compatible with HuggingFace weights."""
from functools import cached_property, lru_cache from functools import cached_property
from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, from typing import (Any, Iterable, List, Mapping, Optional, Set, Tuple,
Union) TypedDict, Union)
import librosa
import numpy as np import numpy as np
import torch import torch
import torch.nn as nn import torch.nn as nn
from transformers import Qwen2AudioEncoder from transformers import BatchFeature, ProcessorMixin
from transformers.models.qwen2_audio import (Qwen2AudioConfig,
Qwen2AudioEncoder,
Qwen2AudioProcessor)
from transformers.models.whisper import WhisperFeatureExtractor
from vllm.attention import AttentionMetadata from vllm.attention import AttentionMetadata
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, from vllm.inputs import InputContext
InputContext, token_inputs)
from vllm.logger import init_logger
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.inputs import NestedTensors
from vllm.multimodal.utils import consecutive_placeholder_ranges from vllm.multimodal.processing import (BaseMultiModalProcessor,
from vllm.sequence import IntermediateTensors, SequenceData MultiModalDataItems, ProcessorInputs,
PromptReplacement)
from vllm.sequence import IntermediateTensors
from .interfaces import SupportsMultiModal, SupportsPP from .interfaces import SupportsMultiModal, SupportsPP
from .utils import (AutoWeightsLoader, init_vllm_registered_model, from .utils import (AutoWeightsLoader, init_vllm_registered_model,
maybe_prefix, merge_multimodal_embeddings) maybe_prefix, merge_multimodal_embeddings)
logger = init_logger(__name__)
# # === Audio Inputs === # # # === Audio Inputs === #
class Qwen2AudioInputs(TypedDict): class Qwen2AudioInputs(TypedDict):
input_features: torch.Tensor input_features: torch.Tensor
"""Shape: """Shape: `(num_audios, num_mel_bins, 3000)`"""
`(num_audios, num_mel_bins, 3000)`
"""
feature_attention_mask: torch.Tensor feature_attention_mask: torch.Tensor
"""Shape: `(num_audios, 3000)` """Shape: `(num_audios, 3000)`"""
"""
# === Audio Encoder === # # === Audio Encoder === #
...@@ -74,187 +72,116 @@ class Qwen2AudioMultiModalProjector(nn.Module): ...@@ -74,187 +72,116 @@ class Qwen2AudioMultiModalProjector(nn.Module):
return hidden_states return hidden_states
def dummy_data_for_qwen2_audio(ctx: InputContext, seq_len: int, # From Qwen2AudioEncoder._get_feat_extract_output_lengths
mm_counts: Mapping[str, int]): def _get_feat_extract_output_lengths(input_lengths: torch.LongTensor):
num_audios = mm_counts["audio"] feat_lengths = (input_lengths - 1) // 2 + 1
max_tokens_per_audio = get_max_qwen2_audio_audio_tokens(ctx) output_lengths = (feat_lengths - 2) // 2 + 1
max_llm_audio_tokens = max_tokens_per_audio * num_audios return feat_lengths, output_lengths
if seq_len - max_llm_audio_tokens - 2 < 0:
raise RuntimeError(
f"Qwen2-Audio cannot process {num_audios} audios in a prompt, "
"please increase max_model_len or reduce audio limit by "
"--limit-mm-per-prompt.")
audio_token_index = ctx.model_config.hf_config.audio_token_index
dummy_seqdata = SequenceData.from_prompt_token_counts(
(audio_token_index, max_llm_audio_tokens),
(0, seq_len - max_llm_audio_tokens),
)
dummy_audio = np.full((max_llm_audio_tokens * 2 * 2 * 160, ), 0.)
return DummyData(
dummy_seqdata, {"audio": [(dummy_audio, 16000)] * num_audios}, {
"audio":
consecutive_placeholder_ranges(num_items=num_audios,
item_size=max_tokens_per_audio)
})
def get_processor(
processor_name: str,
*args,
trust_remote_code: bool = False,
**kwargs,
):
"""Gets a processor for the given model name via HuggingFace.
Derived from `vllm.transformers_utils.image_processor.get_image_processor`.
"""
# don't put this import at the top level
# it will call torch.cuda.device_count()
from transformers import AutoProcessor
try:
processor = AutoProcessor.from_pretrained(
processor_name,
*args,
trust_remote_code=trust_remote_code,
**kwargs)
except ValueError as e:
# If the error pertains to the processor class not existing or not
# currently being imported, suggest using the --trust-remote-code flag.
# Unlike AutoTokenizer, AutoProcessor does not separate such errors
if not trust_remote_code:
err_msg = (
"Failed to load the processor. If the processor is "
"a custom processor not yet available in the HuggingFace "
"transformers library, consider setting "
"`trust_remote_code=True` in LLM or using the "
"`--trust-remote-code` flag in the CLI.")
raise RuntimeError(err_msg) from e
else:
raise e
return processor
def get_max_qwen2_audio_audio_tokens(ctx: InputContext) -> int:
hf_config = ctx.get_hf_config(Qwen2AudioConfig)
max_source_position = hf_config.audio_config.max_source_positions
output_lengths = (max_source_position - 2) // 2 + 1
return output_lengths
cached_get_processor = lru_cache(get_processor)
class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor):
def _get_feat_extract_output_lengths(input_lengths: torch.LongTensor): def _get_hf_processor(self) -> Qwen2AudioProcessor:
""" return self.ctx.get_hf_processor(Qwen2AudioProcessor)
Computes the output length of the convolutional layers
and the output length of the audio encoder
"""
input_lengths = (input_lengths - 1) // 2 + 1
output_lengths = (input_lengths - 2) // 2 + 1
return input_lengths, output_lengths
def _get_feature_extractor(self) -> WhisperFeatureExtractor:
return self._get_hf_processor().feature_extractor # type: ignore
def get_max_qwen2_audio_audio_tokens(ctx: InputContext) -> int: def _get_processor_data(
max_source_position = ( self,
ctx.model_config.hf_config.audio_config.max_source_positions) mm_items: MultiModalDataItems,
output_lengths = (max_source_position - 2) // 2 + 1 ) -> tuple[dict[str, Any], dict[str, Any]]:
return output_lengths # resample audio to the model's sampling rate
feature_extractor = self._get_feature_extractor()
mm_items.resample_audios(feature_extractor.sampling_rate)
return super()._get_processor_data(mm_items)
def input_processor_for_qwen2_audio( def _call_hf_processor(
ctx: InputContext, inputs: DecoderOnlyInputs) -> DecoderOnlyInputs: self,
multi_modal_data = inputs.get("multi_modal_data") hf_processor: ProcessorMixin,
if multi_modal_data is None or "audio" not in multi_modal_data: prompt: str,
return inputs processor_data: Mapping[str, object],
mm_processor_kwargs: Mapping[str, object],
audios = multi_modal_data["audio"] ) -> BatchFeature:
if not isinstance(audios, list): processor_data = dict(processor_data)
audios = [audios] audios = processor_data.pop("audios", [])
if len(audios) == 0: if audios:
return inputs processor_data["audios"] = audios
processor = cached_get_processor(ctx.model_config.model) feature_extractor = self._get_feature_extractor()
resampled_audios = [ mm_processor_kwargs = dict(
librosa.resample(audio, **mm_processor_kwargs,
orig_sr=sampling_rate, sampling_rate=feature_extractor.sampling_rate,
target_sr=processor.feature_extractor.sampling_rate) )
for audio, sampling_rate in audios else:
] # NOTE: WhisperFeatureExtractor cannot handle empty list of audios
audio_input_lengths = np.array( pass
[min(3000, _.shape[0] // 160 + 1) for _ in resampled_audios])
return super()._call_hf_processor(
audio_feat_lengths, audio_output_lengths = _get_feat_extract_output_lengths( hf_processor,
audio_input_lengths) prompt=prompt,
processor_data=processor_data,
audio_token_index = ctx.model_config.hf_config.audio_token_index mm_processor_kwargs=mm_processor_kwargs,
)
input_ids = inputs['prompt_token_ids']
def _get_prompt_replacements(
new_input_ids = [] self,
audio_num = input_ids.count(audio_token_index) mm_items: MultiModalDataItems,
assert len(audio_input_lengths) == audio_num, \ hf_inputs: BatchFeature,
(f'The text input contains {audio_num} audio tokens, ' mm_processor_kwargs: Mapping[str, object],
f'but {len(audio_input_lengths)} audios provided') ) -> list[PromptReplacement]:
start = 0 hf_config = self.ctx.get_hf_config(Qwen2AudioConfig)
for audio_idx in range(audio_num): placeholder = hf_config.audio_token_index
end = input_ids.index(audio_token_index, start)
new_input_ids.extend(input_ids[start:end]) # text part feature_attention_mask = hf_inputs.get("feature_attention_mask")
if feature_attention_mask is None:
new_input_ids.extend([audio_token_index] * audio_output_lengths = []
audio_output_lengths[audio_idx]) else:
start = end + 1 _, audio_output_lengths = _get_feat_extract_output_lengths(
new_input_ids.extend(input_ids[start:]) feature_attention_mask.sum(-1))
return token_inputs( def get_replacement_qwen2_audio(item_idx: int):
prompt_token_ids=new_input_ids, return [placeholder] * audio_output_lengths[item_idx]
prompt=inputs.get("prompt"),
multi_modal_data=multi_modal_data, return [
) PromptReplacement(
modality="audio",
target=[placeholder],
def input_mapper_for_qwen2_audio( replacement=get_replacement_qwen2_audio,
ctx: InputContext, )
multi_modal_data: Union[np.ndarray, List[np.ndarray]],
) -> MultiModalKwargs:
"""Input mapper for Qwen2-Audio."""
if not isinstance(multi_modal_data, list):
multi_modal_data = [multi_modal_data]
if len(multi_modal_data) == 0:
return MultiModalKwargs()
processor = cached_get_processor(ctx.model_config.model)
audio_feature_extractor = processor.feature_extractor
if audio_feature_extractor is None:
raise RuntimeError(
"No HuggingFace audio_feature_extractor is available "
"to process the audio object")
try:
resampled_audios = [
librosa.resample(
audio,
orig_sr=sampling_rate,
target_sr=processor.feature_extractor.sampling_rate)
for audio, sampling_rate in multi_modal_data
] ]
batch_data = audio_feature_extractor(resampled_audios,
sampling_rate=16000, def _get_dummy_mm_inputs(
return_attention_mask=True, self,
padding="max_length", mm_counts: Mapping[str, int],
return_tensors="pt").data ) -> ProcessorInputs:
batch_data["feature_attention_mask"] = batch_data.pop("attention_mask") feature_extractor = self._get_feature_extractor()
except Exception: sampling_rate = feature_extractor.sampling_rate
logger.error("Failed to process audio (%s)", multi_modal_data) audio_len = feature_extractor.chunk_length * sampling_rate
raise
audio_count = mm_counts["audio"]
return MultiModalKwargs(batch_data) audio = np.zeros(audio_len)
data = {"audio": [audio] * audio_count}
@INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_audio) return ProcessorInputs(
@INPUT_REGISTRY.register_input_processor(input_processor_for_qwen2_audio) prompt_text="<|AUDIO|>" * audio_count,
@MULTIMODAL_REGISTRY.register_input_mapper("audio", mm_data=data,
input_mapper_for_qwen2_audio) mm_processor_kwargs={},
)
@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( @MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
"audio", get_max_qwen2_audio_audio_tokens) "audio", get_max_qwen2_audio_audio_tokens)
@MULTIMODAL_REGISTRY.register_processor(Qwen2AudioMultiModalProcessor)
class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
SupportsPP): SupportsPP):
...@@ -289,9 +216,7 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -289,9 +216,7 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
return get_sampler() return get_sampler()
def _validate_and_reshape_mm_tensor(self, def _validate_and_reshape_mm_tensor(self, mm_input: object,
mm_input: Union[torch.Tensor,
List[torch.Tensor]],
name: str) -> torch.Tensor: name: str) -> torch.Tensor:
if not isinstance(mm_input, (torch.Tensor, list)): if not isinstance(mm_input, (torch.Tensor, list)):
raise ValueError(f"Incorrect type of {name}. " raise ValueError(f"Incorrect type of {name}. "
......
# Adapted from
# https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py
# Copyright 2024 Kakao Corp. (Kanana-X Team)
# Copyright 2024 The Qwen team.
# Copyright 2023 The vLLM team.
"""Inference-only Qwen2-Classification model compatible with HF weights."""
from typing import Iterable, List, Optional, Set, Tuple
import torch
from torch import nn
from vllm.attention import AttentionMetadata
from vllm.config import VllmConfig
from vllm.model_executor.layers.linear import RowParallelLinear
from vllm.model_executor.layers.pooler import Pooler, PoolingType
from vllm.model_executor.models.qwen2 import Qwen2Model
from vllm.model_executor.pooling_metadata import PoolingMetadata
from vllm.sequence import IntermediateTensors, PoolerOutput
from .interfaces import SupportsLoRA, SupportsPP
from .utils import AutoWeightsLoader, maybe_prefix
class Qwen2ForSequenceClassification(nn.Module, SupportsLoRA, SupportsPP):
    """Qwen2 backbone followed by a bias-free linear ``score`` head.

    ``forward`` returns the output of the score head; ``pooler`` hands the
    result to a :class:`Pooler` built with last-token pooling, no
    normalization and softmax enabled (unless ``pooler_config`` overrides
    those defaults -- presumably; confirm against
    ``Pooler.from_config_with_defaults``).
    """

    packed_modules_mapping = {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
    }

    # LoRA specific attributes
    supported_lora_modules = [
        "qkv_proj",
        "o_proj",
        "gate_up_proj",
        "down_proj",
    ]
    embedding_modules = {}
    embedding_padding_modules = []

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the backbone, the score head and the pooler.

        Args:
            vllm_config: Source of the HF config, quantization config,
                LoRA config and pooler config used below.
            prefix: Name prefix combined with ``"model"`` / ``"score"``
                via ``maybe_prefix`` for the two sub-modules.
        """
        super().__init__()

        model_config = vllm_config.model_config
        pooler_config = model_config.pooler_config

        self.config = model_config.hf_config
        self.lora_config = vllm_config.lora_config
        self.quant_config = vllm_config.quant_config

        self.model = Qwen2Model(
            vllm_config=vllm_config,
            prefix=maybe_prefix(prefix, "model"),
        )

        # hidden_states from Qwen2Model has been reduced,
        # the input of score layer is not parallelized.
        self.score = RowParallelLinear(
            self.config.hidden_size,
            self.config.num_labels,
            quant_config=self.quant_config,
            input_is_parallel=False,
            bias=False,
            prefix=maybe_prefix(prefix, "score"),
        )

        self._pooler = Pooler.from_config_with_defaults(
            pooler_config,
            pooling_type=PoolingType.LAST,
            normalize=False,
            softmax=True,
        )

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Delegate embedding lookup for ``input_ids`` to the backbone."""
        return self.model.get_input_embeddings(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Run the backbone, then the score head, and return the logits.

        All arguments are forwarded positionally, in order, to
        ``self.model``. The score layer returns a pair; only its first
        element is returned to the caller.
        """
        backbone_out = self.model(
            input_ids,
            positions,
            kv_caches,
            attn_metadata,
            intermediate_tensors,
            inputs_embeds,
        )
        scores, _unused = self.score(backbone_out)
        return scores

    def pooler(
        self,
        hidden_states: torch.Tensor,
        pooling_metadata: PoolingMetadata,
    ) -> Optional[PoolerOutput]:
        """Apply the pooler built in ``__init__`` to ``hidden_states``."""
        return self._pooler(hidden_states, pooling_metadata)

    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        """Load ``(name, tensor)`` pairs through ``AutoWeightsLoader``.

        Entries under the ``"lm_head."`` prefix are passed as
        ``ignore_unexpected_prefixes``; this class defines no such module.
        Returns whatever the loader's ``load_weights`` returns.
        """
        return AutoWeightsLoader(
            self,
            ignore_unexpected_prefixes=["lm_head."],
        ).load_weights(weights)
...@@ -22,28 +22,26 @@ ...@@ -22,28 +22,26 @@
# limitations under the License. # limitations under the License.
"""Inference-only Qwen2-VL model compatible with HuggingFace weights.""" """Inference-only Qwen2-VL model compatible with HuggingFace weights."""
from functools import cached_property, partial from functools import cached_property, partial
from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, from typing import (Any, Iterable, List, Literal, Mapping, Optional, Set,
Optional, Set, Tuple, Type, TypedDict, Union) Tuple, Type, TypedDict, Union)
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
from einops import rearrange, repeat from einops import rearrange, repeat
from PIL import Image from PIL import Image
from transformers.image_utils import (get_image_size, from transformers import BatchFeature
infer_channel_dimension_format, from transformers.models.qwen2_vl import (Qwen2VLImageProcessor,
to_numpy_array) Qwen2VLProcessor)
from transformers.models.qwen2_vl.configuration_qwen2_vl import ( from transformers.models.qwen2_vl.configuration_qwen2_vl import (
Qwen2VLConfig, Qwen2VLVisionConfig) Qwen2VLConfig, Qwen2VLVisionConfig)
from transformers.models.qwen2_vl.image_processing_qwen2_vl import ( from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
make_batched_images, make_batched_videos, smart_resize)
from vllm.attention import AttentionMetadata from vllm.attention import AttentionMetadata
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.distributed import parallel_state from vllm.distributed import parallel_state
from vllm.distributed import utils as dist_utils from vllm.distributed import utils as dist_utils
from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, from vllm.inputs import InputContext
InputContext, token_inputs)
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor import SamplingMetadata from vllm.model_executor import SamplingMetadata
from vllm.model_executor.layers.activation import QuickGELU from vllm.model_executor.layers.activation import QuickGELU
...@@ -56,14 +54,14 @@ from vllm.model_executor.layers.quantization.gptq_marlin import ( ...@@ -56,14 +54,14 @@ from vllm.model_executor.layers.quantization.gptq_marlin import (
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.inputs import MultiModalDataDict, NestedTensors
from vllm.multimodal.inputs import (MultiModalData, MultiModalDataDict, from vllm.multimodal.processing import (BaseMultiModalProcessor,
MultiModalKwargs, NestedTensors) MultiModalDataItems, ProcessorInputs,
from vllm.multimodal.utils import cached_get_tokenizer PromptReplacement)
from vllm.platforms import _Backend from vllm.platforms import _Backend
from vllm.sequence import IntermediateTensors, SequenceData from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import uses_mrope from vllm.transformers_utils.config import uses_mrope
from vllm.transformers_utils.processor import cached_get_processor from vllm.utils import is_list_of
from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
from .utils import (AutoWeightsLoader, WeightsMapper, get_vit_attn_backend, from .utils import (AutoWeightsLoader, WeightsMapper, get_vit_attn_backend,
...@@ -164,7 +162,7 @@ class Qwen2VisionMLP(nn.Module): ...@@ -164,7 +162,7 @@ class Qwen2VisionMLP(nn.Module):
def __init__( def __init__(
self, self,
in_features: int, in_features: int,
hidden_features: int = None, hidden_features: int,
act_layer: Type[nn.Module] = QuickGELU, act_layer: Type[nn.Module] = QuickGELU,
quant_config: Optional[QuantizationConfig] = None, quant_config: Optional[QuantizationConfig] = None,
prefix: str = "", prefix: str = "",
...@@ -693,78 +691,8 @@ class Qwen2VisionTransformer(nn.Module): ...@@ -693,78 +691,8 @@ class Qwen2VisionTransformer(nn.Module):
# === Vision input helpers === # # === Vision input helpers === #
def get_mm_processor_kwargs(
min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None) -> Dict[str, int]:
mm_processor_kwargs = {}
if min_pixels:
mm_processor_kwargs["min_pixels"] = min_pixels
if max_pixels:
mm_processor_kwargs["max_pixels"] = max_pixels
return mm_processor_kwargs
def mm_input_mapper_for_qwen2_vl(
ctx: InputContext,
data: MultiModalData[object],
data_type_key: str,
*,
min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None,
) -> MultiModalKwargs:
"""Input mapper for Qwen2-VL."""
if data_type_key == "image" and isinstance(data, dict):
return MultiModalKwargs({
"image_embeds": data.get("image_embeds"),
"image_grid_thw": data.get("image_grid_thw"),
})
if data_type_key == "video" and isinstance(data, dict):
return MultiModalKwargs({
"video_embeds": data.get("video_embeds"),
"video_grid_thw": data.get("video_grid_thw"),
})
model_config = ctx.model_config
# Handle mm processor kwargs; we pass these at creation time
# because preprocess() in transformers doesn't expose them
mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels,
max_pixels=max_pixels)
image_processor = cached_get_image_processor(
model_config.model,
trust_remote_code=model_config.trust_remote_code,
**mm_processor_kwargs,
)
if image_processor is None:
raise RuntimeError("No HuggingFace processor is available "
"to process the image object")
images = None
videos = None
if data_type_key == "image":
images = data
else:
assert data_type_key == "video"
videos = data
try:
batch_data = image_processor \
.preprocess(images=images, videos=videos, return_tensors="pt") \
.data
except Exception:
logger.error("Failed to process image (%s)", data)
raise
return MultiModalKwargs(batch_data)
image_input_mapper_for_qwen2_vl = partial(mm_input_mapper_for_qwen2_vl,
data_type_key="image")
video_input_mapper_for_qwen2_vl = partial(mm_input_mapper_for_qwen2_vl,
data_type_key="video")
def _get_vision_info( def _get_vision_info(
image_processor, vision_config: Qwen2VLVisionConfig,
height: int, height: int,
width: int, width: int,
min_pixels: int, min_pixels: int,
...@@ -775,12 +703,15 @@ def _get_vision_info( ...@@ -775,12 +703,15 @@ def _get_vision_info(
): ):
"""Get information (resized height / width and number of vision tokens) """Get information (resized height / width and number of vision tokens)
of input image / video frame.""" of input image / video frame."""
patch_size = vision_config.patch_size
merge_size = vision_config.spatial_merge_size
temporal_patch_size = vision_config.temporal_patch_size
if do_resize: if do_resize:
resized_height, resized_width = smart_resize( resized_height, resized_width = smart_resize(
height=height, height=height,
width=width, width=width,
factor=image_processor.patch_size * image_processor.merge_size, factor=patch_size * merge_size,
min_pixels=min_pixels, min_pixels=min_pixels,
max_pixels=max_pixels, max_pixels=max_pixels,
) )
...@@ -791,54 +722,41 @@ def _get_vision_info( ...@@ -791,54 +722,41 @@ def _get_vision_info(
grid_t = mm_count grid_t = mm_count
else: else:
assert data_type_key == "video" assert data_type_key == "video"
grid_t = max(mm_count // image_processor.temporal_patch_size, 1) grid_t = max(mm_count // temporal_patch_size, 1)
grid_h = resized_height // image_processor.patch_size grid_h = resized_height // patch_size
grid_w = resized_width // image_processor.patch_size grid_w = resized_width // patch_size
vision_tokens = grid_t * grid_h * grid_w vision_tokens = grid_t * grid_h * grid_w
llm_num_vision_tokens = (vision_tokens // image_processor.merge_size // llm_num_vision_tokens = vision_tokens // (merge_size**2)
image_processor.merge_size)
return resized_height, resized_width, llm_num_vision_tokens return resized_height, resized_width, llm_num_vision_tokens
def _get_max_image_info( def _get_image_processor(hf_processor: Qwen2VLProcessor):
image_processor, image_processor = hf_processor.image_processor # type: ignore
data_type_key: str = "image", assert isinstance(image_processor, Qwen2VLImageProcessor)
mm_count: int = 1, return image_processor
min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None,
):
# Limit min / max pixels unless they're explicitly provided
if min_pixels is None:
min_pixels = max(image_processor.min_pixels, 28 * 28)
if max_pixels is None:
max_pixels = min(image_processor.max_pixels, 1280 * 28 * 28)
return _get_vision_info(
image_processor,
height=9999999,
width=9999999,
min_pixels=min_pixels,
max_pixels=max_pixels,
data_type_key=data_type_key,
mm_count=mm_count,
)
def get_max_qwen2_vl_mm_tokens(ctx: InputContext, def get_max_qwen2_vl_mm_tokens(ctx: InputContext,
data_type_key: str, data_type_key: str,
*, *,
min_pixels=None, min_pixels: Optional[int] = None,
max_pixels=None) -> int: max_pixels: Optional[int] = None) -> int:
mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels, hf_config = ctx.get_hf_config(Qwen2VLConfig)
max_pixels=max_pixels) vision_config = hf_config.vision_config
image_processor = cached_get_image_processor(ctx.model_config.model,
**mm_processor_kwargs) hf_processor = ctx.get_hf_processor(Qwen2VLProcessor)
max_resized_height, max_resized_width, max_llm_image_tokens = \ image_processor = _get_image_processor(hf_processor)
_get_max_image_info(image_processor, data_type_key=data_type_key,
mm_count=1, min_pixels=min_pixels, _, _, max_llm_image_tokens = _get_vision_info(
max_pixels=max_pixels) vision_config,
height=9999999,
width=9999999,
min_pixels=min_pixels or image_processor.min_pixels,
max_pixels=max_pixels or image_processor.max_pixels,
data_type_key=data_type_key,
)
return max_llm_image_tokens return max_llm_image_tokens
...@@ -848,290 +766,166 @@ get_max_qwen2_vl_video_tokens = partial(get_max_qwen2_vl_mm_tokens, ...@@ -848,290 +766,166 @@ get_max_qwen2_vl_video_tokens = partial(get_max_qwen2_vl_mm_tokens,
data_type_key="video") data_type_key="video")
def dummy_data_for_qwen2_vl( class Qwen2VLMultiModalDataItems(MultiModalDataItems):
ctx: InputContext,
seq_len: int,
mm_counts: Mapping[str, int],
*,
min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None
) -> Tuple[SequenceData, Optional[MultiModalDataDict]]:
mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels,
max_pixels=max_pixels)
image_processor = cached_get_image_processor(ctx.model_config.model,
**mm_processor_kwargs)
num_images = mm_counts["image"]
max_resized_height, max_resized_width, max_llm_image_tokens = \
_get_max_image_info(image_processor, data_type_key="image",
mm_count=num_images, min_pixels=min_pixels,
max_pixels=max_pixels)
if seq_len - max_llm_image_tokens - 2 < 0:
raise RuntimeError(
f"Qwen2-VL cannot process {num_images} images in a prompt, "
"please increase max_model_len or reduce image limit by "
"--limit-mm-per-prompt.")
# Check video counts.
num_videos = mm_counts["video"]
max_resized_height, max_resized_width, max_llm_video_tokens = \
_get_max_image_info(image_processor, data_type_key="video",
mm_count=num_videos, min_pixels=min_pixels,
max_pixels=max_pixels)
if seq_len - max_llm_video_tokens - 2 < 0:
raise RuntimeError(
f"Qwen2-VL cannot process {num_videos} videos in a prompt, "
"please increase max_model_len or reduce video limit by "
"--limit-mm-per-prompt.")
hf_config = ctx.get_hf_config(Qwen2VLConfig)
dummy_seqdata = SequenceData.from_prompt_token_counts(
(hf_config.vision_start_token_id, 1),
(hf_config.image_token_id, max_llm_image_tokens),
(hf_config.vision_end_token_id, 1),
(0, seq_len - max_llm_image_tokens - 2),
)
dummy_image = Image.new("RGB", (max_resized_width, max_resized_height),
color=0)
return DummyData(dummy_seqdata, {
"image":
dummy_image if num_images == 1 else [dummy_image] * num_images
})
@staticmethod
def from_dict(data: MultiModalDataDict) -> "MultiModalDataItems":
"""
Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`.
"""
multi_data = Qwen2VLMultiModalDataItems()
for k, v in data.items():
# TODO: Make a separate modality for embedding inputs
# to avoid confusion
# yapf: disable
if k == "video":
# Special case since even a single item can be a list
multi_data[k] = ( # type: ignore[index]
v if (isinstance(v, (dict, torch.Tensor)) # type: ignore[assignment]
or is_list_of(v, list)) else [v]
)
elif k in ("image", "audio"):
multi_data[k] = ( # type: ignore[index]
v if isinstance(v, (dict, torch.Tensor, list)) else [v]
)
else:
multi_data[k] = v if isinstance(v, list) else [v] # type: ignore[index]
# yapf: enable
def _get_llm_num_vision_tokens( return multi_data
mm_inputs: list,
data_type_key: str,
image_processor,
min_pixels: int,
max_pixels: int,
):
"""Get number of vision tokens of multimodal inputs.
This method is derived from `transformers.models.qwen2_vl. def get_item_counts(self) -> Mapping[str, int]:
image_processing_qwen2_vl.Qwen2VLImageProcessor._preprocess`. return {
""" m: (
image = to_numpy_array(mm_inputs[0]) len(items[f"{m}_grid_thw"]) # type: ignore
input_data_format = infer_channel_dimension_format(image) if isinstance(items, dict) else len(items))
height, width = get_image_size(image, channel_dim=input_data_format) for m, items in self.items()
}
_, _, llm_num_vision_tokens = _get_vision_info(
image_processor,
height=height,
width=width,
min_pixels=min_pixels,
max_pixels=max_pixels,
do_resize=image_processor.do_resize,
data_type_key=data_type_key,
mm_count=len(mm_inputs),
)
return llm_num_vision_tokens
def _expand_pad_tokens(inputs: list, token_id: int, make_batched_fn: Callable, class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor):
data_type_key: str, image_processor: Any,
prompt_token_ids: List[int], min_pixels: Optional[int],
max_pixels: Optional[int]) -> List[int]:
"""
Expand pad tokens for multi-modal inputs (e.g., images or videos).
Args:
inputs (list): The multi-modal inputs (e.g., images or videos).
token_id (int): The token ID used to represent the multi-modal input.
make_batched_fn (Callable): A function to batch the inputs.
data_type_key (str): The type of the multi-modal input.
image_processor (Any): The image processor used to process the inputs.
prompt_token_ids (List[int]): The list of token IDs in the prompt.
min_pixels (int): min pixels to used for img processing
max_pixels (int): max pixels to be used for img processing
Returns:
List[int]: The list of token IDs for the multi-modal inputs.
"""
indices = [
idx for idx, token in enumerate(prompt_token_ids) if token == token_id
]
inputs = make_batched_fn(inputs)
assert len(indices) == len(inputs)
prompt_token_ids_with_data = []
for cnt, data in enumerate(inputs):
num_tokens = _get_llm_num_vision_tokens(
[data] if data_type_key == "image" else data,
data_type_key=data_type_key,
image_processor=image_processor,
min_pixels=min_pixels,
max_pixels=max_pixels,
)
if cnt == 0:
end_idx = indices[cnt]
non_data_tokens = prompt_token_ids[:end_idx]
else:
non_data_tokens = prompt_token_ids[indices[cnt - 1] +
1:indices[cnt]]
prompt_token_ids_with_data.extend(non_data_tokens)
prompt_token_ids_with_data.extend(token_id for _ in range(num_tokens))
prompt_token_ids_with_data.extend(prompt_token_ids[indices[-1] + 1:])
return prompt_token_ids_with_data
def input_processor_for_qwen2_vl(
ctx: InputContext,
inputs: DecoderOnlyInputs,
*,
min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None,
) -> DecoderOnlyInputs:
multi_modal_data = inputs.get("multi_modal_data")
if multi_modal_data is None:
return inputs
image_inputs = multi_modal_data.get("image", None)
video_inputs = multi_modal_data.get("video", None)
processor = cached_get_processor(ctx.model_config.model)
image_processor = processor.image_processor
# Apply processor kwarg overrides for image processor options
min_pixels = min_pixels if min_pixels else image_processor.min_pixels
max_pixels = max_pixels if max_pixels else image_processor.max_pixels
model_config = ctx.model_config
hf_config = ctx.get_hf_config(Qwen2VLConfig)
# To avoid redundant processing of vision objects (resize, rescale, etc.), def _get_mm_items(
# we extract code of calculating number of vision tokens from self,
# `transformers.models.qwen2_vl.processing_qwen2_vl.Qwen2VLProcessor`. mm_data: MultiModalDataDict,
# ) -> MultiModalDataItems:
# The following code is equivalent to: return Qwen2VLMultiModalDataItems.from_dict(mm_data)
# prompt = inputs["prompt"]
# inputs = processor(text=[prompt],
# images=image_inputs,
# videos=video_inputs,
# padding=True,
# return_tensors="pt")
# prompt_token_ids = inputs["input_ids"][0].tolist()
tokenizer = cached_get_tokenizer(
model_config.tokenizer,
trust_remote_code=model_config.trust_remote_code)
prompt_token_ids = inputs["prompt_token_ids"]
# Expand image pad tokens.
if image_inputs is not None:
if isinstance(image_inputs, dict):
prompt_token_ids_with_image = []
image_indices = [
idx for idx, token in enumerate(prompt_token_ids)
if token == hf_config.image_token_id
]
# ensure all image tokens have grid_thw def _get_hf_processor(
assert \ self,
len(image_indices) == image_inputs["image_grid_thw"].size(0), \ *,
"image token num does not match image_grid_thw.shape" min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None,
image_counter = 0 ) -> Qwen2VLProcessor:
pad_token_counter = 0 hf_processor = self.ctx.get_hf_processor(Qwen2VLProcessor)
for idx, token in enumerate(prompt_token_ids): image_processor = _get_image_processor(hf_processor)
if idx in image_indices:
grid_thw = image_inputs["image_grid_thw"][image_counter] if min_pixels:
grid_t, grid_h, grid_w = grid_thw image_processor.min_pixels = min_pixels
num_pad_tokens = (grid_t * grid_h * grid_w // if max_pixels:
image_processor.merge_size // image_processor.max_pixels = max_pixels
image_processor.merge_size) if max_pixels or min_pixels:
prompt_token_ids_with_image.extend([token] * image_processor.size = {
num_pad_tokens) "min_pixels": image_processor.min_pixels,
image_counter += 1 "max_pixels": image_processor.max_pixels,
pad_token_counter += num_pad_tokens }
return hf_processor
def _get_processor_data(
self,
mm_items: MultiModalDataItems,
) -> tuple[dict[str, Any], dict[str, Any]]:
processor_data = dict[str, Any]()
passthrough_data = dict[str, Any]()
for k, v in mm_items.items():
# TODO: Make a separate modality for embedding inputs
# to avoid confusion
if k in ("image", "video", "audio"):
if isinstance(v, dict):
# Pass through embedding inputs (dict)
passthrough_data.update(v)
elif isinstance(v, torch.Tensor) and v.ndim == 3:
# Pass through embedding inputs (single)
passthrough_data[f"{k}_embeds"] = [v]
elif (is_list_of(v, torch.Tensor) and len(v) > 0
and v[0].ndim == 2):
# Pass through embedding inputs (multi)
passthrough_data[f"{k}_embeds"] = v
else: else:
prompt_token_ids_with_image.append(token) # Map keys to plural form, e.g.: image -> images
processor_data[f"{k}s"] = v
else:
processor_data[k] = v
# ensure all embeddings are used return processor_data, passthrough_data
assert \
pad_token_counter == image_inputs["image_embeds"].size(0), \
"image_embeds.shape does not match image_grid_thw"
prompt_token_ids = prompt_token_ids_with_image def _get_prompt_replacements(
else: self,
prompt_token_ids = _expand_pad_tokens(image_inputs, mm_items: MultiModalDataItems,
hf_config.image_token_id, hf_inputs: BatchFeature,
make_batched_images, mm_processor_kwargs: Mapping[str, object],
"image", ) -> list[PromptReplacement]:
image_processor, hf_processor = self._get_hf_processor()
prompt_token_ids, image_processor = _get_image_processor(hf_processor)
min_pixels=min_pixels,
max_pixels=max_pixels) # NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has
# image_token and video_token registered
if video_inputs is not None: placeholder = {
if isinstance(video_inputs, dict): "image": hf_processor.image_token,
prompt_token_ids_with_video = [] "video": hf_processor.video_token,
video_indices = [ }
idx for idx, token in enumerate(prompt_token_ids) merge_length = image_processor.merge_size**2
if token == hf_config.video_token_id
] def get_replacement_qwen2vl(item_idx: int, modality: str):
grid_thw = hf_inputs[f"{modality}_grid_thw"][item_idx]
num_tokens = grid_thw.prod() // merge_length
return placeholder[modality] * num_tokens
return [
PromptReplacement(
modality=modality,
target=placeholder[modality],
replacement=partial(get_replacement_qwen2vl,
modality=modality),
) for modality in ("image", "video")
]
# ensure all video tokens have grid_thw def _get_dummy_mm_inputs(
assert \ self,
len(video_indices) == video_inputs["video_grid_thw"].size(0), \ mm_counts: Mapping[str, int],
"video token num does not match video_grid_thw.shape" ) -> ProcessorInputs:
num_images = mm_counts["image"]
video_counter = 0 hf_processor = self._get_hf_processor()
pad_token_counter = 0 image_token: str = hf_processor.image_token
for idx, token in enumerate(prompt_token_ids): image_processor = _get_image_processor(hf_processor)
if idx in video_indices:
grid_thw = video_inputs["video_grid_thw"][video_counter] data = {}
grid_t, grid_h, grid_w = grid_thw resized_height, resized_width = smart_resize(
num_pad_tokens = (grid_t * grid_h * grid_w // height=9999999,
image_processor.merge_size // width=9999999,
image_processor.merge_size) factor=image_processor.patch_size * image_processor.merge_size,
prompt_token_ids_with_video.extend([token] * min_pixels=image_processor.min_pixels,
num_pad_tokens) max_pixels=image_processor.max_pixels,
video_counter += 1 )
pad_token_counter += num_pad_tokens
else:
prompt_token_ids_with_video.append(token)
# ensure all embeddings are used dummy_image = Image.new("RGB", (resized_width, resized_height),
assert \ color=0)
pad_token_counter == video_inputs["video_embeds"].size(0), \ data["image"] = [dummy_image] * num_images
"video_embeds.shape does not match video_grid_thw"
prompt_token_ids = prompt_token_ids_with_video return ProcessorInputs(
else: prompt_text=image_token * num_images,
prompt_token_ids = _expand_pad_tokens(video_inputs, mm_data=data,
hf_config.video_token_id, mm_processor_kwargs={},
make_batched_videos, )
"video",
image_processor,
prompt_token_ids,
min_pixels=min_pixels,
max_pixels=max_pixels)
prompt = inputs.get("prompt")
if prompt is None:
prompt = tokenizer.decode(prompt_token_ids)
return token_inputs(
prompt_token_ids=prompt_token_ids,
prompt=prompt,
multi_modal_data=multi_modal_data,
)
@MULTIMODAL_REGISTRY.register_image_input_mapper(
image_input_mapper_for_qwen2_vl)
@MULTIMODAL_REGISTRY.register_input_mapper("video",
video_input_mapper_for_qwen2_vl)
@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_qwen2_vl_image_tokens) @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_qwen2_vl_image_tokens)
@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( @MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
"video", get_max_qwen2_vl_video_tokens) "video", get_max_qwen2_vl_video_tokens)
@INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_vl) @MULTIMODAL_REGISTRY.register_processor(Qwen2VLMultiModalProcessor)
@INPUT_REGISTRY.register_input_processor(input_processor_for_qwen2_vl)
class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
SupportsLoRA, SupportsPP): SupportsLoRA, SupportsPP):
packed_modules_mapping = { packed_modules_mapping = {
...@@ -1156,10 +950,15 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -1156,10 +950,15 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
] ]
embedding_modules = {} embedding_modules = {}
embedding_padding_modules = [] embedding_padding_modules = []
# To ensure correct weight loading and mapping.
hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
"lm_head.": "language_model.lm_head.",
"model.": "language_model.model.",
})
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__() super().__init__()
config = vllm_config.model_config.hf_config config: Qwen2VLConfig = vllm_config.model_config.hf_config
cache_config = vllm_config.cache_config cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
multimodal_config = vllm_config.model_config.multimodal_config multimodal_config = vllm_config.model_config.multimodal_config
...@@ -1456,11 +1255,6 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -1456,11 +1255,6 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
def load_weights(self, weights: Iterable[Tuple[str, def load_weights(self, weights: Iterable[Tuple[str,
torch.Tensor]]) -> Set[str]: torch.Tensor]]) -> Set[str]:
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={
"lm_head.": "language_model.lm_head.",
"model.": "language_model.model.",
})
loader = AutoWeightsLoader(self) loader = AutoWeightsLoader(self)
return loader.load_weights(weights, mapper=hf_to_vllm_mapper) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
...@@ -20,11 +20,10 @@ import torch.nn as nn ...@@ -20,11 +20,10 @@ import torch.nn as nn
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.platforms import current_platform from vllm.platforms import current_platform
from .adapters import as_embedding_model
from .interfaces import (has_inner_state, is_attention_free, is_hybrid, from .interfaces import (has_inner_state, is_attention_free, is_hybrid,
supports_cross_encoding, supports_multimodal, supports_cross_encoding, supports_multimodal,
supports_pp) supports_pp)
from .interfaces_base import is_pooling_model, is_text_generation_model from .interfaces_base import is_text_generation_model
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -46,6 +45,7 @@ _TEXT_GENERATION_MODELS = { ...@@ -46,6 +45,7 @@ _TEXT_GENERATION_MODELS = {
"DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"),
"DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"), "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"),
"DeepseekV2ForCausalLM": ("deepseek_v2", "DeepseekV2ForCausalLM"), "DeepseekV2ForCausalLM": ("deepseek_v2", "DeepseekV2ForCausalLM"),
"DeepseekV3ForCausalLM": ("deepseek_v3", "DeepseekV3ForCausalLM"),
"ExaoneForCausalLM": ("exaone", "ExaoneForCausalLM"), "ExaoneForCausalLM": ("exaone", "ExaoneForCausalLM"),
"FalconForCausalLM": ("falcon", "FalconForCausalLM"), "FalconForCausalLM": ("falcon", "FalconForCausalLM"),
"GemmaForCausalLM": ("gemma", "GemmaForCausalLM"), "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"),
...@@ -113,6 +113,7 @@ _EMBEDDING_MODELS = { ...@@ -113,6 +113,7 @@ _EMBEDDING_MODELS = {
"Gemma2Model": ("gemma2", "Gemma2ForCausalLM"), "Gemma2Model": ("gemma2", "Gemma2ForCausalLM"),
"GlmForCausalLM": ("glm", "GlmForCausalLM"), "GlmForCausalLM": ("glm", "GlmForCausalLM"),
"GritLM": ("gritlm", "GritLM"), "GritLM": ("gritlm", "GritLM"),
"JambaForSequenceClassification": ("jamba", "JambaForSequenceClassification"), # noqa: E501
"LlamaModel": ("llama", "LlamaForCausalLM"), "LlamaModel": ("llama", "LlamaForCausalLM"),
**{ **{
# Multiple models share the same architecture, so we include them all # Multiple models share the same architecture, so we include them all
...@@ -124,12 +125,13 @@ _EMBEDDING_MODELS = { ...@@ -124,12 +125,13 @@ _EMBEDDING_MODELS = {
"Qwen2Model": ("qwen2", "Qwen2EmbeddingModel"), "Qwen2Model": ("qwen2", "Qwen2EmbeddingModel"),
"Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
"Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"),
"Qwen2ForSequenceClassification": ("qwen2_cls", "Qwen2ForSequenceClassification"), # noqa: E501
"TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"),
# [Multimodal] # [Multimodal]
"LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
"Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501 "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501
# [Auto-converted (see adapters.py)]
"Qwen2ForSequenceClassification": ("qwen2", "Qwen2ForCausalLM"),
} }
_CROSS_ENCODER_MODELS = { _CROSS_ENCODER_MODELS = {
...@@ -225,19 +227,10 @@ class _ModelInfo: ...@@ -225,19 +227,10 @@ class _ModelInfo:
@staticmethod @staticmethod
def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo": def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo":
is_pooling_model_ = is_pooling_model(model)
if not is_pooling_model_:
try:
as_embedding_model(model)
except Exception:
pass
else:
is_pooling_model_ = True
return _ModelInfo( return _ModelInfo(
architecture=model.__name__, architecture=model.__name__,
is_text_generation_model=is_text_generation_model(model), is_text_generation_model=is_text_generation_model(model),
is_pooling_model=is_pooling_model_, is_pooling_model=True, # Can convert any model into a pooling model
supports_cross_encoding=supports_cross_encoding(model), supports_cross_encoding=supports_cross_encoding(model),
supports_multimodal=supports_multimodal(model), supports_multimodal=supports_multimodal(model),
supports_pp=supports_pp(model), supports_pp=supports_pp(model),
......
...@@ -31,6 +31,19 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, ...@@ -31,6 +31,19 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
class TeleChat2Model(LlamaModel): class TeleChat2Model(LlamaModel):
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={
"transformer.": "model.",
},
orig_to_new_substr={
".h.": ".layers.",
".self_attention.": ".self_attn.",
".word_embeddings.": ".embed_tokens.",
".dense.": ".o_proj.",
".ln_f.": ".norm.",
},
)
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
# 1. Initialize the LlamaModel with bias # 1. Initialize the LlamaModel with bias
vllm_config.model_config.hf_config.bias = True vllm_config.model_config.hf_config.bias = True
...@@ -111,21 +124,9 @@ class TeleChat2ForCausalLM(LlamaForCausalLM): ...@@ -111,21 +124,9 @@ class TeleChat2ForCausalLM(LlamaForCausalLM):
def load_weights(self, weights: Iterable[Tuple[str, def load_weights(self, weights: Iterable[Tuple[str,
torch.Tensor]]) -> Set[str]: torch.Tensor]]) -> Set[str]:
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={
"transformer.": "model.",
},
orig_to_new_substr={
".h.": ".layers.",
".self_attention.": ".self_attn.",
".word_embeddings.": ".embed_tokens.",
".dense.": ".o_proj.",
".ln_f.": ".norm.",
},
)
loader = AutoWeightsLoader( loader = AutoWeightsLoader(
self, self,
skip_prefixes=(["lm_head."] skip_prefixes=(["lm_head."]
if self.config.tie_word_embeddings else None), if self.config.tie_word_embeddings else None),
) )
return loader.load_weights(weights, mapper=hf_to_vllm_mapper) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
import math import math
from functools import cached_property, lru_cache from functools import cached_property, lru_cache
from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Set, from typing import (Any, Iterable, List, Literal, Mapping, Optional, Set,
Tuple, TypedDict, Union) Tuple, TypedDict, Union)
import numpy as np import numpy as np
...@@ -11,7 +11,7 @@ import torch ...@@ -11,7 +11,7 @@ import torch
import torch.utils.checkpoint import torch.utils.checkpoint
from torch import nn from torch import nn
from torch.nn import functional as F from torch.nn import functional as F
from transformers import BatchFeature from transformers import BatchFeature, ProcessorMixin
from transformers.models.whisper import WhisperFeatureExtractor from transformers.models.whisper import WhisperFeatureExtractor
from transformers.models.whisper.modeling_whisper import WhisperEncoder from transformers.models.whisper.modeling_whisper import WhisperEncoder
...@@ -25,11 +25,11 @@ from vllm.model_executor.model_loader.loader import DefaultModelLoader ...@@ -25,11 +25,11 @@ from vllm.model_executor.model_loader.loader import DefaultModelLoader
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
MultiModalDataDict,
MultiModalDataItems, ProcessorInputs, MultiModalDataItems, ProcessorInputs,
PromptReplacement) PromptReplacement)
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs.ultravox import UltravoxConfig from vllm.transformers_utils.configs.ultravox import UltravoxConfig
from vllm.utils import is_list_of
from .interfaces import SupportsMultiModal, SupportsPP from .interfaces import SupportsMultiModal, SupportsPP
from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
...@@ -61,8 +61,8 @@ def cached_feature_extractor(model_id: str) -> WhisperFeatureExtractor: ...@@ -61,8 +61,8 @@ def cached_feature_extractor(model_id: str) -> WhisperFeatureExtractor:
def whisper_feature_extractor(ctx: InputContext) -> WhisperFeatureExtractor: def whisper_feature_extractor(ctx: InputContext) -> WhisperFeatureExtractor:
return cached_feature_extractor( hf_config = ctx.get_hf_config(UltravoxConfig)
ctx.get_hf_config(UltravoxConfig).audio_model_id) return cached_feature_extractor(hf_config.audio_model_id)
def get_ultravox_max_audio_tokens(ctx: InputContext): def get_ultravox_max_audio_tokens(ctx: InputContext):
...@@ -73,72 +73,71 @@ def get_ultravox_max_audio_tokens(ctx: InputContext): ...@@ -73,72 +73,71 @@ def get_ultravox_max_audio_tokens(ctx: InputContext):
class UltravoxMultiModalProcessor(BaseMultiModalProcessor): class UltravoxMultiModalProcessor(BaseMultiModalProcessor):
def _get_feature_extractor(self) -> WhisperFeatureExtractor: def _get_feature_extractor(self) -> WhisperFeatureExtractor:
return self._get_hf_processor().audio_processor.feature_extractor hf_processor = self._get_hf_processor()
return hf_processor.audio_processor.feature_extractor # type: ignore
def _resample_audio( def _get_processor_data(
self, self,
audio: np.ndarray, mm_items: MultiModalDataItems,
sr: int, ) -> tuple[dict[str, Any], dict[str, Any]]:
) -> Dict[str, Union[np.ndarray, int]]:
# resample audio to the model's sampling rate # resample audio to the model's sampling rate
feature_extractor = self._get_feature_extractor() feature_extractor = self._get_feature_extractor()
if sr != feature_extractor.sampling_rate: mm_items.resample_audios(feature_extractor.sampling_rate)
try:
import librosa return super()._get_processor_data(mm_items)
except ImportError as exc:
raise ImportError( def _call_hf_processor(
"Please install vllm[audio] for audio support.") from exc
audio = librosa.resample(audio,
orig_sr=sr,
target_sr=feature_extractor.sampling_rate)
sr = feature_extractor.sampling_rate
return {"audio": audio, "sampling_rate": sr}
def _apply_hf_processor(
self, self,
hf_processor: ProcessorMixin,
prompt: str, prompt: str,
mm_data: MultiModalDataDict, processor_data: Mapping[str, object],
mm_processor_kwargs: Mapping[str, object], mm_processor_kwargs: Mapping[str, object],
) -> BatchFeature: ) -> BatchFeature:
if not mm_data or not mm_data.get("audio", None): processor_data = dict(processor_data)
return super()._apply_hf_processor(prompt, mm_data, audios = processor_data.pop("audios", [])
mm_processor_kwargs)
if not audios:
return super()._call_hf_processor(
hf_processor,
prompt=prompt,
processor_data=processor_data,
mm_processor_kwargs=mm_processor_kwargs,
)
feature_extractor = self._get_feature_extractor()
mm_processor_kwargs = dict(
**mm_processor_kwargs,
sampling_rate=feature_extractor.sampling_rate,
)
audio_data = mm_data["audio"] # Already resampled by _get_processor_data
if not isinstance(audio_data, list): assert is_list_of(audios, np.ndarray)
audio_data = [audio_data]
# Ultravox processor doesn't support multiple inputs, # Ultravox processor doesn't support multiple inputs,
# therefore we need to input text and audio one by one # therefore we need to input text and audio one by one
tokenizer = self._get_tokenizer()
audio_features, audio_token_len = [], [] audio_features, audio_token_len = [], []
processed_inputs = {} shared_outputs = {}
for audio, sr in audio_data: for audio in audios:
data = self._resample_audio(audio, sr) # NOTE: Ultravox processor accepts "audio" instead of "audios"
processed_inputs = super()._apply_hf_processor( item_processor_data = dict(**processor_data, audio=audio)
prompt, data, mm_processor_kwargs)
prompt = tokenizer.decode(processed_inputs["input_ids"][0], item_outputs = super()._call_hf_processor(
skip_special_tokens=False) hf_processor,
audio_features.append( prompt=prompt,
processed_inputs.pop("audio_values").squeeze(0)) processor_data=item_processor_data,
audio_token_len.append( mm_processor_kwargs=mm_processor_kwargs,
processed_inputs.pop("audio_token_len").item()) )
return dict( audio_features.append(item_outputs.pop("audio_values")[0])
**processed_inputs, audio_token_len.append(item_outputs.pop("audio_token_len").item())
shared_outputs = item_outputs
combined_outputs = dict(
**shared_outputs,
audio_features=audio_features, audio_features=audio_features,
audio_token_len=audio_token_len, audio_token_len=audio_token_len,
) )
return BatchFeature(combined_outputs)
def _get_processor_data(
self,
mm_data: MultiModalDataDict,
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
# Ultravox uses "audio" instead of "audios" as calling keyword
processor_data, passthrough_data = super()._get_processor_data(mm_data)
if "audios" in processor_data:
processor_data["audio"] = processor_data.pop("audios")
return processor_data, passthrough_data
def _get_prompt_replacements( def _get_prompt_replacements(
self, self,
...@@ -147,7 +146,7 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor): ...@@ -147,7 +146,7 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor):
mm_processor_kwargs: Mapping[str, object], mm_processor_kwargs: Mapping[str, object],
) -> list[PromptReplacement]: ) -> list[PromptReplacement]:
hf_processor = self._get_hf_processor() hf_processor = self._get_hf_processor()
placeholder = hf_processor.audio_token_replacement placeholder = hf_processor.audio_token_replacement # type: ignore
def get_replacement_ultravox(item_idx: int): def get_replacement_ultravox(item_idx: int):
audio_token_len = hf_inputs["audio_token_len"][item_idx] audio_token_len = hf_inputs["audio_token_len"][item_idx]
...@@ -171,7 +170,7 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor): ...@@ -171,7 +170,7 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor):
audio_count = mm_counts["audio"] audio_count = mm_counts["audio"]
audio = np.zeros(audio_len) audio = np.zeros(audio_len)
data = {"audio": [(audio, sampling_rate)] * audio_count} data = {"audio": [audio] * audio_count}
return ProcessorInputs( return ProcessorInputs(
prompt_text="<|audio|>" * audio_count, prompt_text="<|audio|>" * audio_count,
...@@ -303,6 +302,9 @@ class ModifiedWhisperEncoder(WhisperEncoder): ...@@ -303,6 +302,9 @@ class ModifiedWhisperEncoder(WhisperEncoder):
@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor) @MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor)
class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP): class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={"audio_tower.model.encoder.": "audio_tower."})
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__() super().__init__()
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
...@@ -495,9 +497,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP): ...@@ -495,9 +497,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
def load_weights(self, weights: Iterable[Tuple[str, def load_weights(self, weights: Iterable[Tuple[str,
torch.Tensor]]) -> Set[str]: torch.Tensor]]) -> Set[str]:
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={"audio_tower.model.encoder.": "audio_tower."})
loader = AutoWeightsLoader(self, loader = AutoWeightsLoader(self,
ignore_unexpected_prefixes=["audio_tower."]) ignore_unexpected_prefixes=["audio_tower."])
return loader.load_weights(weights, mapper=hf_to_vllm_mapper) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
\ No newline at end of file
...@@ -328,6 +328,15 @@ class PackedvLLMParameter(ModelWeightParameter): ...@@ -328,6 +328,15 @@ class PackedvLLMParameter(ModelWeightParameter):
marlin_tile_size=self.marlin_tile_size) marlin_tile_size=self.marlin_tile_size)
class BlockQuantScaleParameter(_ColumnvLLMParameter, RowvLLMParameter):
"""
Parameter class for weight scales loaded for weights with
block-wise quantization. Uses both column and row parallelism.
"""
pass
def permute_param_layout_(param: BasevLLMParameter, input_dim: int, def permute_param_layout_(param: BasevLLMParameter, input_dim: int,
output_dim: int, **kwargs) -> BasevLLMParameter: output_dim: int, **kwargs) -> BasevLLMParameter:
""" """
......
...@@ -11,7 +11,7 @@ The global :class:`~MultiModalRegistry` is used by model runners to ...@@ -11,7 +11,7 @@ The global :class:`~MultiModalRegistry` is used by model runners to
dispatch data processing according to its modality and the target model. dispatch data processing according to its modality and the target model.
See also: See also:
:ref:`input_processing_pipeline` :ref:`input-processing-pipeline`
""" """
__all__ = [ __all__ = [
......
import numpy as np
import numpy.typing as npt
from vllm.inputs.registry import InputContext from vllm.inputs.registry import InputContext
from vllm.utils import PlaceholderModule
from .base import MultiModalPlugin from .base import MultiModalPlugin
from .inputs import AudioItem, MultiModalData, MultiModalKwargs from .inputs import AudioItem, MultiModalData, MultiModalKwargs
try:
import librosa
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
class AudioPlugin(MultiModalPlugin): class AudioPlugin(MultiModalPlugin):
"""Plugin for audio data.""" """Plugin for audio data."""
...@@ -21,3 +30,12 @@ class AudioPlugin(MultiModalPlugin): ...@@ -21,3 +30,12 @@ class AudioPlugin(MultiModalPlugin):
def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
raise NotImplementedError( raise NotImplementedError(
"There is no default maximum multimodal tokens") "There is no default maximum multimodal tokens")
def resample_audio(
audio: npt.NDArray[np.floating],
*,
orig_sr: float,
target_sr: float,
) -> npt.NDArray[np.floating]:
return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
...@@ -50,7 +50,7 @@ class MultiModalPlugin(ABC): ...@@ -50,7 +50,7 @@ class MultiModalPlugin(ABC):
(i.e., the modality of the data). (i.e., the modality of the data).
See also: See also:
:ref:`adding_multimodal_plugin` :ref:`adding-multimodal-plugin`
""" """
def __init__(self) -> None: def __init__(self) -> None:
...@@ -94,8 +94,8 @@ class MultiModalPlugin(ABC): ...@@ -94,8 +94,8 @@ class MultiModalPlugin(ABC):
If `None` is provided, then the default input mapper is used instead. If `None` is provided, then the default input mapper is used instead.
See also: See also:
- :ref:`input_processing_pipeline` - :ref:`input-processing-pipeline`
- :ref:`enabling_multimodal_inputs` - :ref:`enabling-multimodal-inputs`
""" """
def wrapper(model_cls: N) -> N: def wrapper(model_cls: N) -> N:
...@@ -130,8 +130,8 @@ class MultiModalPlugin(ABC): ...@@ -130,8 +130,8 @@ class MultiModalPlugin(ABC):
TypeError: If the data type is not supported. TypeError: If the data type is not supported.
See also: See also:
- :ref:`input_processing_pipeline` - :ref:`input-processing-pipeline`
- :ref:`enabling_multimodal_inputs` - :ref:`enabling-multimodal-inputs`
""" """
# Avoid circular import # Avoid circular import
...@@ -190,7 +190,7 @@ class MultiModalPlugin(ABC): ...@@ -190,7 +190,7 @@ class MultiModalPlugin(ABC):
If `None` is provided, then the default calculation is used instead. If `None` is provided, then the default calculation is used instead.
See also: See also:
:ref:`enabling_multimodal_inputs` :ref:`enabling-multimodal-inputs`
""" """
def wrapper(model_cls: N) -> N: def wrapper(model_cls: N) -> N:
...@@ -222,7 +222,7 @@ class MultiModalPlugin(ABC): ...@@ -222,7 +222,7 @@ class MultiModalPlugin(ABC):
The model is identified by ``model_config``. The model is identified by ``model_config``.
See also: See also:
:ref:`enabling_multimodal_inputs` :ref:`enabling-multimodal-inputs`
""" """
# Avoid circular import # Avoid circular import
from vllm.model_executor.model_loader import get_model_architecture from vllm.model_executor.model_loader import get_model_architecture
......
...@@ -84,3 +84,15 @@ class ImagePlugin(MultiModalPlugin): ...@@ -84,3 +84,15 @@ class ImagePlugin(MultiModalPlugin):
def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
return 3000 return 3000
def rescale_image_size(image: Image.Image,
                       size_factor: float,
                       transpose: int = -1) -> Image.Image:
    """
    Scale both dimensions of ``image`` by ``size_factor``.

    Each target dimension is truncated to an integer before resizing.
    When ``transpose`` is non-negative, it is converted to an
    :class:`Image.Transpose` value and that transposition is applied
    after the resize; a negative value skips the transposition.
    """
    target_size = (
        int(image.width * size_factor),
        int(image.height * size_factor),
    )
    rescaled = image.resize(target_size)

    if transpose < 0:
        return rescaled

    return rescaled.transpose(Image.Transpose(transpose))
...@@ -15,31 +15,32 @@ _T = TypeVar("_T") ...@@ -15,31 +15,32 @@ _T = TypeVar("_T")
# yapf: disable # yapf: disable
ImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor] ImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor]
""" """
A :class:`transformers.image_utils.ImageInput` representing a single image, A :class:`transformers.image_utils.ImageInput` representing a single image
which can be passed to a HuggingFace :code:`ImageProcessor`. item, which can be passed to a HuggingFace :code:`ImageProcessor`.
""" """
VideoItem: TypeAlias = Union[ VideoItem: TypeAlias = Union[
List[Image], list[Image],
np.ndarray, np.ndarray,
torch.Tensor, torch.Tensor,
List[np.ndarray], list[np.ndarray],
List[torch.Tensor], list[torch.Tensor],
] ]
""" """
A :class:`transformers.image_utils.VideoInput` representing a single video
A :class:`transformers.image_utils.VideoInput` representing a single video, item, which can be passed to a HuggingFace :code:`VideoProcessor`.
which can be passed to a HuggingFace :code:`VideoProcessor`.
""" """
AudioItem: TypeAlias = Union[ AudioItem: TypeAlias = Union[
np.ndarray, np.ndarray,
List[float], list[float],
Tuple[np.ndarray, float], # DEPRECATED: Use mm_processor_kwargs instead # `(audio, sampling_rate)`: If the audio's sampling rate is different
# from that expected by the model, we need to resample it.
tuple[np.ndarray, float],
] ]
""" """
Represents a single audio that can be inputted to a HuggingFace Represents a single audio
:code:`AudioProcessor`. item, which can be passed to a HuggingFace :code:`AudioProcessor`.
""" """
# yapf: enable # yapf: enable
...@@ -74,7 +75,7 @@ Note: ...@@ -74,7 +75,7 @@ Note:
This dictionary also accepts modality keys defined outside This dictionary also accepts modality keys defined outside
:class:`MultiModalDataBuiltins` as long as a customized plugin :class:`MultiModalDataBuiltins` as long as a customized plugin
is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
Read more on that :ref:`here <adding_multimodal_plugin>`. Read more on that :ref:`here <adding-multimodal-plugin>`.
""" """
...@@ -215,6 +216,9 @@ class MultiModalInputsV2(TypedDict): ...@@ -215,6 +216,9 @@ class MultiModalInputsV2(TypedDict):
mm_kwargs: MultiModalKwargs mm_kwargs: MultiModalKwargs
"""Keyword arguments to be directly passed to the model after batching.""" """Keyword arguments to be directly passed to the model after batching."""
mm_hashes: NotRequired[List[str]]
"""The hashes of the multi-modal data."""
mm_placeholders: MultiModalPlaceholderDict mm_placeholders: MultiModalPlaceholderDict
""" """
For each modality, information about the placeholder tokens in For each modality, information about the placeholder tokens in
......
...@@ -17,6 +17,7 @@ from vllm.logger import init_logger ...@@ -17,6 +17,7 @@ from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
from vllm.utils import flatten_2d_lists, full_groupby, is_list_of from vllm.utils import flatten_2d_lists, full_groupby, is_list_of
from .audio import resample_audio
from .inputs import (AudioItem, ImageItem, MultiModalDataDict, from .inputs import (AudioItem, ImageItem, MultiModalDataDict,
MultiModalInputsV2, MultiModalKwargs, PlaceholderRange, MultiModalInputsV2, MultiModalKwargs, PlaceholderRange,
VideoItem) VideoItem)
...@@ -30,7 +31,7 @@ _PromptSeq = Union[str, list[int]] ...@@ -30,7 +31,7 @@ _PromptSeq = Union[str, list[int]]
@dataclass @dataclass
class PromptReplacement: class PromptReplacement:
modality: str modality: str
"""The modality for which the replacement is made""" """The modality for which the replacement is made."""
target: _PromptSeq target: _PromptSeq
"""The text or token sequence to find and replace.""" """The text or token sequence to find and replace."""
...@@ -211,20 +212,54 @@ class MultiModalDataItems(UserDict[str, list[Any]]): ...@@ -211,20 +212,54 @@ class MultiModalDataItems(UserDict[str, list[Any]]):
corresponds to a list. corresponds to a list.
""" """
@staticmethod
def from_dict(data: MultiModalDataDict) -> "MultiModalDataItems":
    """
    Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`.

    Values that already hold multiple items are stored as-is; any other
    value is wrapped in a single-element list. What counts as "multiple
    items" depends on the modality key:

    - ``"video"``: a tensor, or a list whose elements are lists.
    - ``"image"`` / ``"audio"``: a tensor or any list.
    - any other key: a list.
    """
    # TODO: Make a separate modality for embedding inputs
    # to avoid confusion
    items = MultiModalDataItems()

    for modality, value in data.items():
        if modality == "video":
            # Special case since even a single item can be a list
            is_multi = (isinstance(value, torch.Tensor)
                        or is_list_of(value, list))
        elif modality in ("image", "audio"):
            is_multi = isinstance(value, (torch.Tensor, list))
        else:
            is_multi = isinstance(value, list)

        items[modality] = (  # type: ignore[index]
            value if is_multi else [value])

    return items
# NOTE: When a field (e.g. `images`) doesn't exist, directly appending to
# `self.images` doesn't update this dictionary, which may be confusing.
# We annotate the getter methods as `Sequence` to prevent others from
# trying to update the list in this way.
@property @property
def image(self) -> list[ImageItem]: def images(self) -> Sequence[ImageItem]:
return self["image"] return self.get("image", [])
@property @property
def video(self) -> list[VideoItem]: def videos(self) -> Sequence[VideoItem]:
return self["video"] return self.get("video", [])
@property @property
def audio(self) -> list[AudioItem]: def audios(self) -> Sequence[AudioItem]:
return self["audio"] return self.get("audio", [])
def get_item_counts(self) -> Mapping[str, int]:
    """Return a mapping from each modality key to its number of items."""
    counts = dict[str, int]()
    for modality, items in self.items():
        counts[modality] = len(items)
    return counts
def get_image_size(self, item_idx: int) -> ImageSize: def get_image_size(self, item_idx: int) -> ImageSize:
image = self.image[item_idx] image = self.images[item_idx]
if isinstance(image, Image): if isinstance(image, Image):
return ImageSize(*image.size) return ImageSize(*image.size)
...@@ -234,25 +269,41 @@ class MultiModalDataItems(UserDict[str, list[Any]]): ...@@ -234,25 +269,41 @@ class MultiModalDataItems(UserDict[str, list[Any]]):
assert_never(image) assert_never(image)
def get_audio_with_sr(
    self,
    item_idx: int,
    *,
    default_sr: float,
) -> tuple[np.ndarray, float]:
    """
    Return the audio item at ``item_idx`` as an ``(audio, sampling_rate)``
    pair.

    A tuple item is returned unchanged. A list is converted to a NumPy
    array, and both lists and arrays are paired with ``default_sr`` because
    they carry no sampling rate of their own.
    """
    item = self.audios[item_idx]

    # Already an `(audio, sampling_rate)` pair
    if isinstance(item, tuple):
        return item

    if isinstance(item, np.ndarray):
        return item, default_sr
    if isinstance(item, list):
        return np.array(item), default_sr

    assert_never(item)
def resample_audios(self, new_sr: float, *, drop_sr: bool = True) -> None:
"""
If :code:`drop_sr=True`, the audio items in this dictionary are updated
to be NumPy arrays which implicitly means that their sampling rate is
the same as the model's expected sampling rate; otherwise, they remain
as :code:`(audio, new_sr)` tuples.
"""
if not self.audios:
return
def to_multi_format(data: MultiModalDataDict) -> MultiModalDataItems: new_audios = []
""" for item_idx in range(len(self.audios)):
Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`. audio, sr = self.get_audio_with_sr(item_idx, default_sr=new_sr)
""" audio = resample_audio(audio, orig_sr=sr, target_sr=new_sr)
multi_data = MultiModalDataItems()
new_audios.append(audio if drop_sr else (audio, new_sr))
for k, v in data.items():
# yapf: disable
if k == "video":
# Special case since even a single item can be a list
multi_data[k] = v if is_list_of(v, list) else [v] # type: ignore[index]
elif k in ("image", "audio"):
multi_data[k] = v if isinstance(v, list) else [v] # type: ignore[index]
else:
multi_data[k] = v if isinstance(v, list) else [v] # type: ignore[index]
# yapf: enable
return multi_data self["audio"] = new_audios
class _TokenMatch(NamedTuple): class _TokenMatch(NamedTuple):
...@@ -567,6 +618,12 @@ class BaseMultiModalProcessor(ABC): ...@@ -567,6 +618,12 @@ class BaseMultiModalProcessor(ABC):
def _get_tokenizer(self) -> AnyTokenizer: def _get_tokenizer(self) -> AnyTokenizer:
return self.ctx.tokenizer return self.ctx.tokenizer
def _get_mm_items(
    self,
    mm_data: MultiModalDataDict,
) -> MultiModalDataItems:
    """
    Convert the raw multi-modal data dictionary into
    :class:`MultiModalDataItems` via
    :meth:`MultiModalDataItems.from_dict`.
    """
    mm_items = MultiModalDataItems.from_dict(mm_data)
    return mm_items
@abstractmethod @abstractmethod
def _get_prompt_replacements( def _get_prompt_replacements(
self, self,
...@@ -596,18 +653,20 @@ class BaseMultiModalProcessor(ABC): ...@@ -596,18 +653,20 @@ class BaseMultiModalProcessor(ABC):
def _get_processor_data( def _get_processor_data(
self, self,
mm_data: MultiModalDataDict, mm_items: MultiModalDataItems,
) -> BatchFeature: ) -> tuple[dict[str, Any], dict[str, Any]]:
processor_data = dict[str, Any]() processor_data = dict[str, Any]()
passthrough_data = dict[str, Any]() passthrough_data = dict[str, Any]()
for k, v in mm_data.items():
for k, v in mm_items.items():
# TODO: Make a separate modality for embedding inputs # TODO: Make a separate modality for embedding inputs
# to avoid confusion # to avoid confusion
if k in ("image", "video", "audio"): if k in ("image", "video", "audio"):
if isinstance(v, torch.Tensor) and v.ndim == 3: if isinstance(v, torch.Tensor) and v.ndim == 3:
# Pass through embedding inputs (single) # Pass through embedding inputs (single)
passthrough_data[f"{k}_embeds"] = [v] passthrough_data[f"{k}_embeds"] = [v]
elif is_list_of(v, torch.Tensor) and v[0].ndim == 2: elif (is_list_of(v, torch.Tensor) and len(v) > 0
and v[0].ndim == 2):
# Pass through embedding inputs (multi) # Pass through embedding inputs (multi)
passthrough_data[f"{k}_embeds"] = v passthrough_data[f"{k}_embeds"] = v
else: else:
...@@ -615,40 +674,41 @@ class BaseMultiModalProcessor(ABC): ...@@ -615,40 +674,41 @@ class BaseMultiModalProcessor(ABC):
processor_data[f"{k}s"] = v processor_data[f"{k}s"] = v
else: else:
processor_data[k] = v processor_data[k] = v
return processor_data, passthrough_data return processor_data, passthrough_data
def _call_hf_processor(
    self,
    hf_processor: ProcessorMixin,
    prompt: str,
    processor_data: Mapping[str, object],
    mm_processor_kwargs: Mapping[str, object],
) -> BatchFeature:
    """
    Run ``hf_processor`` on the prompt and processor data.

    The call is delegated to ``self.ctx.call_hf_processor``, forwarding all
    four arguments positionally in the order they are received here.
    """
    ctx = self.ctx
    hf_inputs = ctx.call_hf_processor(
        hf_processor,
        prompt,
        processor_data,
        mm_processor_kwargs,
    )
    return hf_inputs
def _apply_hf_processor( def _apply_hf_processor(
self, self,
prompt: str, prompt: str,
mm_data: MultiModalDataDict, mm_items: MultiModalDataItems,
mm_processor_kwargs: Mapping[str, object], mm_processor_kwargs: Mapping[str, object],
) -> BatchFeature: ) -> BatchFeature:
# some mm_processor_kwargs may be used in processor initialization # some mm_processor_kwargs may be used in processor initialization
# instead of processor call # instead of processor call
hf_processor = self._get_hf_processor(**mm_processor_kwargs) hf_processor = self._get_hf_processor(**mm_processor_kwargs)
processor_data, passthrough_data = self._get_processor_data(mm_data) processor_data, passthrough_data = self._get_processor_data(mm_items)
assert callable(hf_processor) hf_inputs = self._call_hf_processor(
mm_processor_kwargs = self.ctx.resolve_hf_processor_call_kwargs(
hf_processor, hf_processor,
mm_processor_kwargs, prompt=prompt,
processor_data=processor_data,
mm_processor_kwargs=mm_processor_kwargs,
) )
try:
hf_inputs = hf_processor(
text=prompt, # type: ignore
**processor_data,
**mm_processor_kwargs,
return_tensors="pt",
)
except Exception as exc:
data = dict(text=prompt, **processor_data)
raise RuntimeError(
f"Failed to apply {type(hf_processor).__name__} "
f"on data={data} with kwargs={mm_processor_kwargs}") from exc
hf_inputs.update(passthrough_data) hf_inputs.update(passthrough_data)
return hf_inputs return hf_inputs
...@@ -730,25 +790,25 @@ class BaseMultiModalProcessor(ABC): ...@@ -730,25 +790,25 @@ class BaseMultiModalProcessor(ABC):
3. Extract information about the placeholder tokens from the 3. Extract information about the placeholder tokens from the
processed token IDs. processed token IDs.
""" """
tokenizer = self._get_tokenizer() mm_items = self._get_mm_items(mm_data)
hf_inputs = self._apply_hf_processor(prompt_text, mm_data, hf_inputs = self._apply_hf_processor(prompt_text, mm_items,
mm_processor_kwargs) mm_processor_kwargs)
prompt_ids, = hf_inputs.pop("input_ids").tolist() prompt_ids, = hf_inputs.pop("input_ids").tolist()
mm_kwargs = MultiModalKwargs(hf_inputs) mm_kwargs = MultiModalKwargs(hf_inputs)
mm_items = to_multi_format(mm_data)
prompt_repls = self._get_prompt_replacements(mm_items, hf_inputs, prompt_repls = self._get_prompt_replacements(mm_items, hf_inputs,
mm_processor_kwargs) mm_processor_kwargs)
all_prompt_repls = self._bind_prompt_replacements(prompt_repls) all_prompt_repls = self._bind_prompt_replacements(prompt_repls)
# If HF processor already inserts placeholder tokens, # If HF processor already inserts placeholder tokens,
# there is no need for us to insert them # there is no need for us to insert them
mm_item_counts = {m: len(items) for m, items in mm_items.items()} mm_item_counts = mm_items.get_item_counts()
all_placeholders = self._find_placeholders(all_prompt_repls, all_placeholders = self._find_placeholders(all_prompt_repls,
prompt_ids, mm_item_counts) prompt_ids, mm_item_counts)
if all_placeholders: if all_placeholders:
tokenizer = self._get_tokenizer()
prompt_text = _decode(tokenizer, prompt_ids) prompt_text = _decode(tokenizer, prompt_ids)
else: else:
( (
......
...@@ -76,7 +76,7 @@ class MultiModalRegistry: ...@@ -76,7 +76,7 @@ class MultiModalRegistry:
Register a multi-modal plugin so it can be recognized by vLLM. Register a multi-modal plugin so it can be recognized by vLLM.
See also: See also:
:ref:`adding_multimodal_plugin` :ref:`adding-multimodal-plugin`
""" """
data_type_key = plugin.get_data_key() data_type_key = plugin.get_data_key()
...@@ -311,8 +311,8 @@ class MultiModalRegistry: ...@@ -311,8 +311,8 @@ class MultiModalRegistry:
invoked to transform the data into a dictionary of model inputs. invoked to transform the data into a dictionary of model inputs.
See also: See also:
- :ref:`input_processing_pipeline` - :ref:`input-processing-pipeline`
- :ref:`enabling_multimodal_inputs` - :ref:`enabling-multimodal-inputs`
""" """
def wrapper(model_cls: N) -> N: def wrapper(model_cls: N) -> N:
......
...@@ -2,7 +2,7 @@ import base64 ...@@ -2,7 +2,7 @@ import base64
import os import os
from functools import lru_cache from functools import lru_cache
from io import BytesIO from io import BytesIO
from typing import Any, List, Optional, Tuple, TypeVar, Union from typing import List, Optional, Tuple, TypeVar, Union
import numpy as np import numpy as np
import numpy.typing as npt import numpy.typing as npt
...@@ -14,9 +14,25 @@ import vllm.envs as envs ...@@ -14,9 +14,25 @@ import vllm.envs as envs
from vllm.connections import global_http_connection from vllm.connections import global_http_connection
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
from vllm.utils import PlaceholderModule
from .inputs import MultiModalDataDict, PlaceholderRange from .inputs import MultiModalDataDict, PlaceholderRange
try:
import decord
except ImportError:
decord = PlaceholderModule("decord") # type: ignore[assignment]
try:
import librosa
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
try:
import soundfile
except ImportError:
soundfile = PlaceholderModule("soundfile") # type: ignore[assignment]
logger = init_logger(__name__) logger = init_logger(__name__)
cached_get_tokenizer = lru_cache(get_tokenizer) cached_get_tokenizer = lru_cache(get_tokenizer)
...@@ -138,19 +154,7 @@ async def async_fetch_image(image_url: str, ...@@ -138,19 +154,7 @@ async def async_fetch_image(image_url: str,
return image.convert(image_mode) return image.convert(image_mode)
def _load_video_frames_from_bytes(b: bytes): def _load_video_from_bytes(b: bytes, num_frames: int = 32) -> npt.NDArray:
frame = Image.open(BytesIO(b))
return np.array(frame)
def load_video_frames_from_base64(frame: Union[bytes, str]):
"""Load frame from base64 format."""
return _load_video_frames_from_bytes(base64.b64decode(frame))
def _load_video_from_bytes(b: bytes, num_frames: int = 32):
_, decord = try_import_video_packages()
video_path = BytesIO(b) video_path = BytesIO(b)
vr = decord.VideoReader(video_path, num_threads=1) vr = decord.VideoReader(video_path, num_threads=1)
total_frame_num = len(vr) total_frame_num = len(vr)
...@@ -168,13 +172,17 @@ def _load_video_from_bytes(b: bytes, num_frames: int = 32): ...@@ -168,13 +172,17 @@ def _load_video_from_bytes(b: bytes, num_frames: int = 32):
return frames return frames
def _load_video_from_data_url(video_url: str): def _load_video_from_data_url(video_url: str) -> npt.NDArray:
# Only split once and assume the second part is the base64 encoded image # Only split once and assume the second part is the base64 encoded video
frames_base64 = video_url.split(",")[1:] _, video_base64 = video_url.split(",", 1)
return np.stack([
load_video_frames_from_base64(frame_base64) if video_url.startswith("data:video/jpeg;"):
for frame_base64 in frames_base64 return np.stack([
]) np.array(load_image_from_base64(frame_base64))
for frame_base64 in video_base64.split(",")
])
return load_video_from_base64(video_base64)
def fetch_video(video_url: str, *, num_frames: int = 32) -> npt.NDArray: def fetch_video(video_url: str, *, num_frames: int = 32) -> npt.NDArray:
...@@ -217,22 +225,10 @@ async def async_fetch_video(video_url: str, ...@@ -217,22 +225,10 @@ async def async_fetch_video(video_url: str,
return video return video
def try_import_audio_packages() -> Tuple[Any, Any]:
    """
    Import the optional audio dependencies on demand.

    Returns:
        The ``(librosa, soundfile)`` modules, in that order.

    Raises:
        ImportError: If either package cannot be imported; the original
            import failure is chained as the cause.
    """
    try:
        import librosa
        import soundfile
    except ImportError as exc:
        message = "Please install vllm[audio] for audio support."
        raise ImportError(message) from exc
    else:
        return librosa, soundfile
def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]: def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]:
""" """
Load audio from a URL. Load audio from a URL.
""" """
librosa, _ = try_import_audio_packages()
if audio_url.startswith("http"): if audio_url.startswith("http"):
audio_bytes = global_http_connection.get_bytes( audio_bytes = global_http_connection.get_bytes(
audio_url, audio_url,
...@@ -253,8 +249,6 @@ async def async_fetch_audio( ...@@ -253,8 +249,6 @@ async def async_fetch_audio(
""" """
Asynchronously fetch audio from a URL. Asynchronously fetch audio from a URL.
""" """
librosa, _ = try_import_audio_packages()
if audio_url.startswith("http"): if audio_url.startswith("http"):
audio_bytes = await global_http_connection.async_get_bytes( audio_bytes = await global_http_connection.async_get_bytes(
audio_url, audio_url,
...@@ -313,8 +307,6 @@ def encode_audio_base64( ...@@ -313,8 +307,6 @@ def encode_audio_base64(
sampling_rate: int, sampling_rate: int,
) -> str: ) -> str:
"""Encode audio as base64.""" """Encode audio as base64."""
_, soundfile = try_import_audio_packages()
buffered = BytesIO() buffered = BytesIO()
soundfile.write(buffered, audio, sampling_rate, format="WAV") soundfile.write(buffered, audio, sampling_rate, format="WAV")
...@@ -343,61 +335,7 @@ def load_image_from_base64(image: Union[bytes, str]) -> Image.Image: ...@@ -343,61 +335,7 @@ def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
return _load_image_from_bytes(base64.b64decode(image)) return _load_image_from_bytes(base64.b64decode(image))
def rescale_image_size(image: Image.Image, def encode_video_base64(frames: npt.NDArray) -> str:
size_factor: float,
transpose: int = -1) -> Image.Image:
"""Rescale the dimensions of an image by a constant factor."""
new_width = int(image.width * size_factor)
new_height = int(image.height * size_factor)
image = image.resize((new_width, new_height))
if transpose >= 0:
image = image.transpose(Image.Transpose(transpose))
return image
def try_import_video_packages() -> Tuple[Any, Any]:
    """
    Import the optional video dependencies on demand.

    Returns:
        The ``(cv2, decord)`` modules, in that order.

    Raises:
        ImportError: If either package cannot be imported; the original
            import failure is chained as the cause.
    """
    try:
        import cv2
        import decord
    except ImportError as exc:
        raise ImportError(
            "Please install vllm[video] for video support.") from exc

    # Callers unpack the result as a pair, so the annotation is a 2-tuple
    # (matching `try_import_audio_packages`) rather than a bare `Any`.
    return cv2, decord
def resize_video(frames: npt.NDArray, size: Tuple[int, int]) -> npt.NDArray:
    """
    Resize every frame of a video to ``size``.

    ``frames`` is unpacked as a 4-D array whose first axis indexes frames
    and whose last axis is the channel count. ``size`` is
    ``(height, width)``; note that ``cv2.resize`` receives it as
    ``(width, height)``. The output keeps the dtype of ``frames``.
    """
    cv2, _ = try_import_video_packages()

    frame_count, _, _, channel_count = frames.shape
    target_height, target_width = size

    output = np.empty(
        (frame_count, target_height, target_width, channel_count),
        dtype=frames.dtype,
    )
    for index in range(frame_count):
        output[index] = cv2.resize(frames[index],
                                   (target_width, target_height))

    return output
def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
    """
    Scale the height and width of every frame by ``size_factor``.

    The second and third axes of the 4-D ``frames`` array are taken as
    height and width; each scaled dimension is truncated to an integer
    before delegating to :func:`resize_video`.
    """
    _, height, width, _ = frames.shape
    target_size = (int(height * size_factor), int(width * size_factor))
    return resize_video(frames, target_size)
def sample_frames_from_video(frames: npt.NDArray,
                             num_frames: int) -> npt.NDArray:
    """
    Pick ``num_frames`` frames spread evenly across the video.

    Indices are spaced linearly from the first to the last frame and
    truncated to integers. Passing ``num_frames=-1`` returns ``frames``
    unchanged.
    """
    total_frames = frames.shape[0]

    # -1 is the "keep everything" sentinel
    if num_frames == -1:
        return frames

    picks = np.linspace(0, total_frames - 1, num_frames, dtype=int)
    return frames[picks, ...]
def encode_video_base64(frames: npt.NDArray):
base64_frames = [] base64_frames = []
frames_list = [frames[i] for i in range(frames.shape[0])] frames_list = [frames[i] for i in range(frames.shape[0])]
for frame in frames_list: for frame in frames_list:
...@@ -406,6 +344,11 @@ def encode_video_base64(frames: npt.NDArray): ...@@ -406,6 +344,11 @@ def encode_video_base64(frames: npt.NDArray):
return ",".join(base64_frames) return ",".join(base64_frames)
def load_video_from_base64(video: Union[bytes, str]) -> npt.NDArray:
    """Decode a base64-encoded video and load it from the raw bytes."""
    video_bytes = base64.b64decode(video)
    return _load_video_from_bytes(video_bytes)
def resolve_visual_encoder_outputs( def resolve_visual_encoder_outputs(
encoder_outputs: Union[torch.Tensor, list[torch.Tensor]], encoder_outputs: Union[torch.Tensor, list[torch.Tensor]],
feature_sample_layers: Optional[list[int]], feature_sample_layers: Optional[list[int]],
......
from functools import lru_cache from functools import lru_cache
from typing import TYPE_CHECKING, Any, Dict, Optional from typing import TYPE_CHECKING, Any, Dict, Optional
import cv2
import numpy as np import numpy as np
import numpy.typing as npt
from vllm.inputs.registry import InputContext from vllm.inputs.registry import InputContext
from vllm.logger import init_logger from vllm.logger import init_logger
...@@ -75,3 +77,33 @@ class VideoPlugin(ImagePlugin): ...@@ -75,3 +77,33 @@ class VideoPlugin(ImagePlugin):
def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
return 4096 return 4096
def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray:
    """
    Resize every frame of a video to ``size``.

    ``frames`` is unpacked as a 4-D array whose first axis indexes frames
    and whose last axis is the channel count. ``size`` is
    ``(height, width)``; note that ``cv2.resize`` receives it as
    ``(width, height)``. The output keeps the dtype of ``frames``.
    """
    frame_count, _, _, channel_count = frames.shape
    target_height, target_width = size

    output = np.empty(
        (frame_count, target_height, target_width, channel_count),
        dtype=frames.dtype,
    )
    for index in range(frame_count):
        output[index] = cv2.resize(frames[index],
                                   (target_width, target_height))

    return output
def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
    """
    Scale the height and width of every frame by ``size_factor``.

    The second and third axes of the 4-D ``frames`` array are taken as
    height and width; each scaled dimension is truncated to an integer
    before delegating to :func:`resize_video`.
    """
    _, height, width, _ = frames.shape
    target_size = (int(height * size_factor), int(width * size_factor))
    return resize_video(frames, target_size)
def sample_frames_from_video(frames: npt.NDArray,
                             num_frames: int) -> npt.NDArray:
    """
    Pick ``num_frames`` frames spread evenly across the video.

    Indices are spaced linearly from the first to the last frame and
    truncated to integers. Passing ``num_frames=-1`` returns ``frames``
    unchanged.
    """
    last_index = frames.shape[0] - 1

    # Anything other than the -1 sentinel triggers sub-sampling
    if num_frames != -1:
        indices = np.linspace(0, last_index, num_frames, dtype=int)
        frames = frames[indices, ...]

    return frames
...@@ -355,7 +355,8 @@ class PoolingRequestOutput(Generic[_O]): ...@@ -355,7 +355,8 @@ class PoolingRequestOutput(Generic[_O]):
pooled_data = seq_group.pooled_data pooled_data = seq_group.pooled_data
assert pooled_data is not None assert pooled_data is not None
output = PoolingOutput(pooled_data) data = pooled_data.to(dtype=torch.float32, device="cpu")
output = PoolingOutput(data)
prompt_token_ids = seq_group.prompt_token_ids prompt_token_ids = seq_group.prompt_token_ids
finished = seq_group.is_finished() finished = seq_group.is_finished()
......
...@@ -54,7 +54,7 @@ class CpuPlatform(Platform): ...@@ -54,7 +54,7 @@ class CpuPlatform(Platform):
import vllm.envs as envs import vllm.envs as envs
from vllm.utils import GiB_bytes from vllm.utils import GiB_bytes
model_config = vllm_config.model_config model_config = vllm_config.model_config
# Reminder: Please update docs/source/usage/compatibility_matrix.rst # Reminder: Please update docs/source/usage/compatibility_matrix.md
# If the feature combo becomes valid # If the feature combo becomes valid
if not model_config.enforce_eager: if not model_config.enforce_eager:
logger.warning( logger.warning(
......
...@@ -165,7 +165,7 @@ def main(): ...@@ -165,7 +165,7 @@ def main():
required=False, required=False,
help="Read CLI options from a config file." help="Read CLI options from a config file."
"Must be a YAML with the following options:" "Must be a YAML with the following options:"
"https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#command-line-arguments-for-the-server" "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#cli-reference"
) )
serve_parser = make_arg_parser(serve_parser) serve_parser = make_arg_parser(serve_parser)
serve_parser.set_defaults(dispatch_function=serve) serve_parser.set_defaults(dispatch_function=serve)
......
...@@ -113,7 +113,7 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": ...@@ -113,7 +113,7 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
return spec_decode_worker return spec_decode_worker
# Reminder: Please update docs/source/usage/compatibility_matrix.rst # Reminder: Please update docs/source/usage/compatibility_matrix.md
# If the feature combo becomes valid # If the feature combo becomes valid
class SpecDecodeWorker(LoraNotSupportedWorkerBase): class SpecDecodeWorker(LoraNotSupportedWorkerBase):
"""Worker which implements speculative decoding. """Worker which implements speculative decoding.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment