Unverified commit 79aa2446, authored by Wenlong Wang, committed by GitHub
Browse files

[Multi Modal] Configurable MM Profiling (#25631)


Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 2ed3f20d
...@@ -33,6 +33,7 @@ from torch import nn ...@@ -33,6 +33,7 @@ from torch import nn
from transformers import BatchFeature, PretrainedConfig from transformers import BatchFeature, PretrainedConfig
from vllm.config import CacheConfig, VllmConfig from vllm.config import CacheConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.linear import (ColumnParallelLinear,
RowParallelLinear) RowParallelLinear)
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
...@@ -181,13 +182,17 @@ class GraniteSpeechDummyInputsBuilder( ...@@ -181,13 +182,17 @@ class GraniteSpeechDummyInputsBuilder(
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_audios = mm_counts.get("audio", 0) num_audios = mm_counts.get("audio", 0)
audio_overrides = mm_options.get("audio") if mm_options else None
return { return {
"audio": "audio":
self._get_dummy_audios( self._get_dummy_audios(
length=self.info.get_max_audio_len(), length=self.info.get_max_audio_len(),
num_audios=num_audios, num_audios=num_audios,
overrides=audio_overrides,
) )
} }
......
...@@ -29,6 +29,7 @@ from transformers import BatchFeature, CLIPVisionConfig, SiglipVisionConfig ...@@ -29,6 +29,7 @@ from transformers import BatchFeature, CLIPVisionConfig, SiglipVisionConfig
from transformers.modeling_utils import no_init_weights from transformers.modeling_utils import no_init_weights
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.cache import BaseMultiModalProcessorCache
...@@ -149,6 +150,7 @@ class HCXVisionDummyInputsBuilder( ...@@ -149,6 +150,7 @@ class HCXVisionDummyInputsBuilder(
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
num_videos = mm_counts.get("video", 0) num_videos = mm_counts.get("video", 0)
...@@ -156,12 +158,17 @@ class HCXVisionDummyInputsBuilder( ...@@ -156,12 +158,17 @@ class HCXVisionDummyInputsBuilder(
target_width, target_height = \ target_width, target_height = \
self.info.get_image_size_with_most_features() self.info.get_image_size_with_most_features()
target_num_frames = 32 target_num_frames = 32
image_overrides = mm_options.get("image") if mm_options else None
video_overrides = mm_options.get("video") if mm_options else None
return { return {
"image": "image":
self._get_dummy_images( self._get_dummy_images(
width=target_width, width=target_width,
height=target_height, height=target_height,
num_images=num_images, num_images=num_images,
overrides=image_overrides,
), ),
"video": "video":
self._get_dummy_videos( self._get_dummy_videos(
...@@ -169,6 +176,7 @@ class HCXVisionDummyInputsBuilder( ...@@ -169,6 +176,7 @@ class HCXVisionDummyInputsBuilder(
height=target_height - 1, height=target_height - 1,
num_frames=target_num_frames, num_frames=target_num_frames,
num_videos=num_videos, num_videos=num_videos,
overrides=video_overrides,
) )
} }
......
...@@ -26,6 +26,7 @@ from transformers import (BatchFeature, Idefics3Config, Idefics3ImageProcessor, ...@@ -26,6 +26,7 @@ from transformers import (BatchFeature, Idefics3Config, Idefics3ImageProcessor,
Idefics3Processor) Idefics3Processor)
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.linear import ReplicatedLinear
from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
...@@ -292,17 +293,21 @@ class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo] ...@@ -292,17 +293,21 @@ class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo]
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
hf_processor = self.info.get_hf_processor() hf_processor = self.info.get_hf_processor()
image_processor: Idefics3ImageProcessor = hf_processor.image_processor image_processor: Idefics3ImageProcessor = hf_processor.image_processor
longest_edge = image_processor.max_image_size['longest_edge'] longest_edge = image_processor.max_image_size['longest_edge']
image_overrides = mm_options.get("image") if mm_options else None
return { return {
"image": "image":
self._get_dummy_images(width=longest_edge, self._get_dummy_images(width=longest_edge,
height=longest_edge, height=longest_edge,
num_images=num_images) num_images=num_images,
overrides=image_overrides)
} }
......
...@@ -20,6 +20,7 @@ from transformers.models.internvl.video_processing_internvl import ( ...@@ -20,6 +20,7 @@ from transformers.models.internvl.video_processing_internvl import (
InternVLVideoProcessor) InternVLVideoProcessor)
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.models.interns1_vit import InternS1VisionModel from vllm.model_executor.models.interns1_vit import InternS1VisionModel
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys
...@@ -270,6 +271,7 @@ class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo] ...@@ -270,6 +271,7 @@ class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo]
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
target_width, target_height = \ target_width, target_height = \
self.info.get_image_size_with_most_features() self.info.get_image_size_with_most_features()
...@@ -281,16 +283,21 @@ class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo] ...@@ -281,16 +283,21 @@ class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo]
config = self.info.get_hf_config() config = self.info.get_hf_config()
image_size_h, image_size_w = config.vision_config.image_size image_size_h, image_size_w = config.vision_config.image_size
image_overrides = mm_options.get("image") if mm_options else None
video_overrides = mm_options.get("video") if mm_options else None
return { return {
"image": "image":
self._get_dummy_images(width=target_width, self._get_dummy_images(width=target_width,
height=target_height, height=target_height,
num_images=num_images), num_images=num_images,
overrides=image_overrides),
"video": "video":
self._get_dummy_videos(width=image_size_w, self._get_dummy_videos(width=image_size_w,
height=image_size_h, height=image_size_h,
num_frames=target_num_frames, num_frames=target_num_frames,
num_videos=num_videos), num_videos=num_videos,
overrides=video_overrides),
} }
......
...@@ -20,6 +20,7 @@ from PIL import Image ...@@ -20,6 +20,7 @@ from PIL import Image
from transformers import BatchFeature, PretrainedConfig, TensorType from transformers import BatchFeature, PretrainedConfig, TensorType
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.quantization.awq import AWQConfig
from vllm.model_executor.models.intern_vit import (InternVisionModel, from vllm.model_executor.models.intern_vit import (InternVisionModel,
...@@ -747,16 +748,20 @@ class BaseInternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]): ...@@ -747,16 +748,20 @@ class BaseInternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
target_width, target_height = \ target_width, target_height = \
self.info.get_image_size_with_most_features() self.info.get_image_size_with_most_features()
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
image_overrides = mm_options.get("image") if mm_options else None
return { return {
"image": "image":
self._get_dummy_images(width=target_width, self._get_dummy_images(width=target_width,
height=target_height, height=target_height,
num_images=num_images) num_images=num_images,
overrides=image_overrides)
} }
...@@ -913,21 +918,25 @@ class InternVLDummyInputsBuilder( ...@@ -913,21 +918,25 @@ class InternVLDummyInputsBuilder(
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
dummy_image = super().get_dummy_mm_data(seq_len=seq_len, dummy_image = super().get_dummy_mm_data(seq_len=seq_len,
mm_counts=mm_counts) mm_counts=mm_counts,
mm_options=mm_options)
if self.info.supports_video: if self.info.supports_video:
config = self.info.get_hf_config() config = self.info.get_hf_config()
image_size: int = config.vision_config.image_size image_size: int = config.vision_config.image_size
target_num_frames = \ target_num_frames = \
self.info.get_num_frames_with_most_features(seq_len, mm_counts) self.info.get_num_frames_with_most_features(seq_len, mm_counts)
num_videos = mm_counts.get("video", 0) num_videos = mm_counts.get("video", 0)
video_overrides = mm_options.get("video") if mm_options else None
dummy_video = { dummy_video = {
"video": "video":
self._get_dummy_videos(width=image_size, self._get_dummy_videos(width=image_size,
height=image_size, height=image_size,
num_frames=target_num_frames, num_frames=target_num_frames,
num_videos=num_videos) num_videos=num_videos,
overrides=video_overrides)
} }
else: else:
dummy_video = {} dummy_video = {}
......
...@@ -20,6 +20,7 @@ from transformers.utils import torch_int ...@@ -20,6 +20,7 @@ from transformers.utils import torch_int
from vllm.attention.backends.registry import _Backend from vllm.attention.backends.registry import _Backend
from vllm.attention.layer import check_upstream_fa_availability from vllm.attention.layer import check_upstream_fa_availability
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.linear import (ColumnParallelLinear,
...@@ -1170,6 +1171,7 @@ class KeyeBaseDummyInputsBuilder(BaseDummyInputsBuilder[_I]): ...@@ -1170,6 +1171,7 @@ class KeyeBaseDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
num_videos = mm_counts.get("video", 0) num_videos = mm_counts.get("video", 0)
...@@ -1179,12 +1181,16 @@ class KeyeBaseDummyInputsBuilder(BaseDummyInputsBuilder[_I]): ...@@ -1179,12 +1181,16 @@ class KeyeBaseDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
target_num_frames = self.info.get_num_frames_with_most_features( target_num_frames = self.info.get_num_frames_with_most_features(
seq_len) seq_len)
image_overrides = mm_options.get("image") if mm_options else None
video_overrides = mm_options.get("video") if mm_options else None
mm_data = { mm_data = {
"image": "image":
self._get_dummy_images( self._get_dummy_images(
width=target_width, width=target_width,
height=target_height, height=target_height,
num_images=num_images, num_images=num_images,
overrides=image_overrides,
), ),
"video": "video":
self._get_dummy_videos( self._get_dummy_videos(
...@@ -1192,6 +1198,7 @@ class KeyeBaseDummyInputsBuilder(BaseDummyInputsBuilder[_I]): ...@@ -1192,6 +1198,7 @@ class KeyeBaseDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
height=target_height, height=target_height,
num_frames=target_num_frames, num_frames=target_num_frames,
num_videos=num_videos, num_videos=num_videos,
overrides=video_overrides,
), ),
} }
......
...@@ -54,6 +54,7 @@ from transformers import BatchFeature ...@@ -54,6 +54,7 @@ from transformers import BatchFeature
from transformers.activations import GELUActivation from transformers.activations import GELUActivation
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import get_pp_group from vllm.distributed import get_pp_group
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.linear import ReplicatedLinear
...@@ -212,14 +213,18 @@ class KimiVLDummyInputsBuilder(BaseDummyInputsBuilder[KimiVLProcessingInfo]): ...@@ -212,14 +213,18 @@ class KimiVLDummyInputsBuilder(BaseDummyInputsBuilder[KimiVLProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
image_overrides = mm_options.get("image") if mm_options else None
return { return {
"image": "image":
self._get_dummy_images(width=MaxImageTokenMeta.width, self._get_dummy_images(width=MaxImageTokenMeta.width,
height=MaxImageTokenMeta.height, height=MaxImageTokenMeta.height,
num_images=num_images) num_images=num_images,
overrides=image_overrides)
} }
......
...@@ -15,6 +15,7 @@ from transformers.models.llava import LlavaProcessor ...@@ -15,6 +15,7 @@ from transformers.models.llava import LlavaProcessor
from transformers.models.pixtral import PixtralProcessor from transformers.models.pixtral import PixtralProcessor
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.linear import (ColumnParallelLinear,
RowParallelLinear) RowParallelLinear)
...@@ -195,17 +196,21 @@ class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]): ...@@ -195,17 +196,21 @@ class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
target_width, target_height = \ target_width, target_height = \
self.info.get_image_size_with_most_features() self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None
return { return {
"image": "image":
self._get_dummy_images(width=target_width, self._get_dummy_images(width=target_width,
height=target_height, height=target_height,
num_images=num_images) num_images=num_images,
overrides=image_overrides)
} }
......
...@@ -11,6 +11,7 @@ from transformers import (BatchFeature, LlavaNextVideoConfig, ...@@ -11,6 +11,7 @@ from transformers import (BatchFeature, LlavaNextVideoConfig,
LlavaNextVideoProcessor) LlavaNextVideoProcessor)
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.models.clip import CLIPVisionModel
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
...@@ -150,6 +151,7 @@ class LlavaNextVideoDummyInputsBuilder( ...@@ -150,6 +151,7 @@ class LlavaNextVideoDummyInputsBuilder(
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_videos = mm_counts.get("video", 0) num_videos = mm_counts.get("video", 0)
...@@ -158,6 +160,8 @@ class LlavaNextVideoDummyInputsBuilder( ...@@ -158,6 +160,8 @@ class LlavaNextVideoDummyInputsBuilder(
target_num_frames = \ target_num_frames = \
self.info.get_num_frames_with_most_features(seq_len, mm_counts) self.info.get_num_frames_with_most_features(seq_len, mm_counts)
video_overrides = mm_options.get("video") if mm_options else None
return { return {
"video": "video":
self._get_dummy_videos( self._get_dummy_videos(
...@@ -165,6 +169,7 @@ class LlavaNextVideoDummyInputsBuilder( ...@@ -165,6 +169,7 @@ class LlavaNextVideoDummyInputsBuilder(
height=target_height, height=target_height,
num_frames=target_num_frames, num_frames=target_num_frames,
num_videos=num_videos, num_videos=num_videos,
overrides=video_overrides,
) )
} }
......
...@@ -13,6 +13,7 @@ from transformers.models.llava_onevision.modeling_llava_onevision import ( ...@@ -13,6 +13,7 @@ from transformers.models.llava_onevision.modeling_llava_onevision import (
get_anyres_image_grid_shape, unpad_image) get_anyres_image_grid_shape, unpad_image)
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.activation import get_act_fn
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
...@@ -254,6 +255,7 @@ class LlavaOnevisionDummyInputsBuilder( ...@@ -254,6 +255,7 @@ class LlavaOnevisionDummyInputsBuilder(
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
num_videos = mm_counts.get("video", 0) num_videos = mm_counts.get("video", 0)
...@@ -264,17 +266,22 @@ class LlavaOnevisionDummyInputsBuilder( ...@@ -264,17 +266,22 @@ class LlavaOnevisionDummyInputsBuilder(
self.info.get_num_frames_with_most_features(seq_len, self.info.get_num_frames_with_most_features(seq_len,
mm_counts) mm_counts)
image_overrides = mm_options.get("image") if mm_options else None
video_overrides = mm_options.get("video") if mm_options else None
return { return {
"image": "image":
self._get_dummy_images(width=target_width, self._get_dummy_images(width=target_width,
height=target_height, height=target_height,
num_images=num_images), num_images=num_images,
overrides=image_overrides),
"video": "video":
self._get_dummy_videos( self._get_dummy_videos(
width=target_width, width=target_width,
height=target_height, height=target_height,
num_frames=target_num_frames, num_frames=target_num_frames,
num_videos=num_videos, num_videos=num_videos,
overrides=video_overrides,
) )
} }
......
...@@ -36,6 +36,7 @@ from torch.nn.functional import scaled_dot_product_attention ...@@ -36,6 +36,7 @@ from torch.nn.functional import scaled_dot_product_attention
from transformers import BatchFeature from transformers import BatchFeature
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.linear import (ColumnParallelLinear,
...@@ -539,13 +540,17 @@ class MiDashengLMDummyInputsBuilder( ...@@ -539,13 +540,17 @@ class MiDashengLMDummyInputsBuilder(
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_audios = mm_counts.get("audio", 0) num_audios = mm_counts.get("audio", 0)
audio_overrides = mm_options.get("audio") if mm_options else None
return { return {
"audio": "audio":
self._get_dummy_audios(length=self.info.get_max_audio_len(), self._get_dummy_audios(length=self.info.get_max_audio_len(),
num_audios=num_audios) num_audios=num_audios,
overrides=audio_overrides)
} }
......
...@@ -36,6 +36,7 @@ from transformers.models.whisper.modeling_whisper import (ACT2FN, ...@@ -36,6 +36,7 @@ from transformers.models.whisper.modeling_whisper import (ACT2FN,
WhisperEncoder) WhisperEncoder)
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
NestedTensors) NestedTensors)
...@@ -237,18 +238,23 @@ class MiniCPMODummyInputsBuilder( ...@@ -237,18 +238,23 @@ class MiniCPMODummyInputsBuilder(
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_audios = mm_counts.get("audio", 0) num_audios = mm_counts.get("audio", 0)
audio_len = self.info.get_max_audio_chunks_with_most_features() * \ audio_len = self.info.get_max_audio_chunks_with_most_features() * \
self.info.get_default_audio_sampling_rate() self.info.get_default_audio_sampling_rate()
audio_overrides = mm_options.get("audio") if mm_options else None
audio_mm_data = { audio_mm_data = {
"audio": "audio":
self._get_dummy_audios(length=audio_len, num_audios=num_audios) self._get_dummy_audios(length=audio_len,
num_audios=num_audios,
overrides=audio_overrides)
} }
return { return {
**super().get_dummy_mm_data(seq_len, mm_counts), **super().get_dummy_mm_data(seq_len, mm_counts, mm_options),
**audio_mm_data, **audio_mm_data,
} }
......
...@@ -39,6 +39,7 @@ from transformers import BatchFeature, PretrainedConfig ...@@ -39,6 +39,7 @@ from transformers import BatchFeature, PretrainedConfig
from typing_extensions import TypeVar from typing_extensions import TypeVar
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.quantization.awq import AWQConfig
from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig
...@@ -679,6 +680,7 @@ class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]): ...@@ -679,6 +680,7 @@ class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
num_videos = mm_counts.get("video", 0) num_videos = mm_counts.get("video", 0)
...@@ -690,15 +692,20 @@ class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]): ...@@ -690,15 +692,20 @@ class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
num_video_frames = \ num_video_frames = \
self.info.get_num_frames_with_most_features(seq_len, mm_counts) self.info.get_num_frames_with_most_features(seq_len, mm_counts)
image_overrides = mm_options.get("image") if mm_options else None
video_overrides = mm_options.get("video") if mm_options else None
return { return {
"image": "image":
self._get_dummy_images(width=image_width, self._get_dummy_images(width=image_width,
height=image_height, height=image_height,
num_images=num_images), num_images=num_images,
overrides=image_overrides),
"video": [ "video": [
self._get_dummy_images(width=video_width, self._get_dummy_images(width=video_width,
height=video_height, height=video_height,
num_images=num_video_frames) num_images=num_video_frames,
overrides=video_overrides)
] * num_videos, ] * num_videos,
} }
......
...@@ -13,6 +13,7 @@ from transformers import (BatchFeature, Mistral3Config, PixtralVisionConfig, ...@@ -13,6 +13,7 @@ from transformers import (BatchFeature, Mistral3Config, PixtralVisionConfig,
from transformers.models.pixtral import PixtralProcessor from transformers.models.pixtral import PixtralProcessor
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.linear import (ColumnParallelLinear,
...@@ -208,17 +209,21 @@ class Mistral3DummyInputsBuilder(BaseDummyInputsBuilder[_I]): ...@@ -208,17 +209,21 @@ class Mistral3DummyInputsBuilder(BaseDummyInputsBuilder[_I]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
target_width, target_height = \ target_width, target_height = \
self.info.get_image_size_with_most_features() self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None
return { return {
"image": "image":
self._get_dummy_images(width=target_width, self._get_dummy_images(width=target_width,
height=target_height, height=target_height,
num_images=num_images) num_images=num_images,
overrides=image_overrides)
} }
......
...@@ -31,6 +31,7 @@ from transformers.models.llama4.image_processing_llama4_fast import ( ...@@ -31,6 +31,7 @@ from transformers.models.llama4.image_processing_llama4_fast import (
from vllm.attention.layer import MultiHeadAttention from vllm.attention.layer import MultiHeadAttention
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.linear import (ColumnParallelLinear,
QKVParallelLinear, QKVParallelLinear,
...@@ -689,17 +690,21 @@ class Mllama4DummyInputsBuilder(BaseDummyInputsBuilder[Mllama4ProcessingInfo]): ...@@ -689,17 +690,21 @@ class Mllama4DummyInputsBuilder(BaseDummyInputsBuilder[Mllama4ProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
(target_width, (target_width,
target_height) = self.info.get_image_size_with_most_features() target_height) = self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None
return { return {
"image": "image":
self._get_dummy_images(width=target_width, self._get_dummy_images(width=target_width,
height=target_height, height=target_height,
num_images=num_images) num_images=num_images,
overrides=image_overrides)
} }
......
...@@ -22,6 +22,7 @@ from vllm.attention import Attention ...@@ -22,6 +22,7 @@ from vllm.attention import Attention
from vllm.attention.layer import MultiHeadAttention from vllm.attention.layer import MultiHeadAttention
from vllm.compilation.decorators import support_torch_compile from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig from vllm.config import CacheConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size, get_tensor_model_parallel_world_size,
split_tensor_along_last_dim, split_tensor_along_last_dim,
...@@ -1226,16 +1227,20 @@ class MolmoDummyInputsBuilder(BaseDummyInputsBuilder[MolmoProcessingInfo]): ...@@ -1226,16 +1227,20 @@ class MolmoDummyInputsBuilder(BaseDummyInputsBuilder[MolmoProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
target_width, target_height = \ target_width, target_height = \
self.info.get_image_size_with_most_features() self.info.get_image_size_with_most_features()
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
image_overrides = mm_options.get("image") if mm_options else None
return { return {
"image": "image":
self._get_dummy_images(width=target_width, self._get_dummy_images(width=target_width,
height=target_height, height=target_height,
num_images=num_images) num_images=num_images,
overrides=image_overrides)
} }
......
...@@ -21,6 +21,7 @@ from PIL import Image ...@@ -21,6 +21,7 @@ from PIL import Image
from transformers import BatchFeature, PretrainedConfig, TensorType from transformers import BatchFeature, PretrainedConfig, TensorType
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.model_executor.layers.activation import ReLUSquaredActivation from vllm.model_executor.layers.activation import ReLUSquaredActivation
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
...@@ -809,6 +810,7 @@ class NanoNemotronVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]): ...@@ -809,6 +810,7 @@ class NanoNemotronVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
# Use default max_num_tiles for dummy data generation # Use default max_num_tiles for dummy data generation
max_num_tiles = 12 max_num_tiles = 12
...@@ -816,11 +818,14 @@ class NanoNemotronVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]): ...@@ -816,11 +818,14 @@ class NanoNemotronVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
self.info.get_image_size_with_most_features(max_num_tiles)) self.info.get_image_size_with_most_features(max_num_tiles))
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
image_overrides = mm_options.get("image") if mm_options else None
return { return {
"image": "image":
self._get_dummy_images(width=target_width, self._get_dummy_images(width=target_width,
height=target_height, height=target_height,
num_images=num_images) num_images=num_images,
overrides=image_overrides)
} }
...@@ -837,21 +842,25 @@ class NanoNemotronVLDummyInputsBuilder( ...@@ -837,21 +842,25 @@ class NanoNemotronVLDummyInputsBuilder(
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
dummy_image = super().get_dummy_mm_data(seq_len=seq_len, dummy_image = super().get_dummy_mm_data(seq_len=seq_len,
mm_counts=mm_counts) mm_counts=mm_counts,
mm_options=mm_options)
if self.info.supports_video: if self.info.supports_video:
config = self.info.get_hf_config() config = self.info.get_hf_config()
image_size: int = config.force_image_size image_size: int = config.force_image_size
target_num_frames = \ target_num_frames = \
self.info.get_num_frames_with_most_features(seq_len, mm_counts) self.info.get_num_frames_with_most_features(seq_len, mm_counts)
num_videos = mm_counts.get("video", 0) num_videos = mm_counts.get("video", 0)
video_overrides = mm_options.get("video") if mm_options else None
dummy_video = { dummy_video = {
"video": "video":
self._get_dummy_videos(width=image_size, self._get_dummy_videos(width=image_size,
height=image_size, height=image_size,
num_frames=target_num_frames, num_frames=target_num_frames,
num_videos=num_videos) num_videos=num_videos,
overrides=video_overrides)
} }
else: else:
dummy_video = {} dummy_video = {}
......
...@@ -14,6 +14,7 @@ import torch ...@@ -14,6 +14,7 @@ import torch
import torch.nn as nn import torch.nn as nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargsItems
...@@ -86,16 +87,20 @@ class NVLMDummyInputsBuilder(BaseInternVLDummyInputsBuilder[NVLMProcessingInfo] ...@@ -86,16 +87,20 @@ class NVLMDummyInputsBuilder(BaseInternVLDummyInputsBuilder[NVLMProcessingInfo]
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
target_width, target_height = \ target_width, target_height = \
self.info.get_image_size_with_most_features() self.info.get_image_size_with_most_features()
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
image_overrides = mm_options.get("image") if mm_options else None
return { return {
"image": "image":
self._get_dummy_images(width=target_width, self._get_dummy_images(width=target_width,
height=target_height, height=target_height,
num_images=num_images) num_images=num_images,
overrides=image_overrides)
} }
......
...@@ -28,6 +28,7 @@ from torch.nn.functional import gumbel_softmax, pad, softmax ...@@ -28,6 +28,7 @@ from torch.nn.functional import gumbel_softmax, pad, softmax
from transformers import BatchFeature, PretrainedConfig from transformers import BatchFeature, PretrainedConfig
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.linear import ReplicatedLinear
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.models.aimv2 import AIMv2Model from vllm.model_executor.models.aimv2 import AIMv2Model
...@@ -283,17 +284,21 @@ class OvisDummyInputsBuilder(BaseDummyInputsBuilder[OvisProcessingInfo]): ...@@ -283,17 +284,21 @@ class OvisDummyInputsBuilder(BaseDummyInputsBuilder[OvisProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
target_width, target_height = \ target_width, target_height = \
self.info.get_image_size_with_most_features() self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None
mm_data = { mm_data = {
"image": "image":
self._get_dummy_images(width=target_width, self._get_dummy_images(width=target_width,
height=target_height, height=target_height,
num_images=num_images), num_images=num_images,
overrides=image_overrides),
} }
return mm_data return mm_data
......
...@@ -10,6 +10,7 @@ import torch.nn as nn ...@@ -10,6 +10,7 @@ import torch.nn as nn
from transformers import BaseImageProcessor, BatchFeature, PretrainedConfig from transformers import BaseImageProcessor, BatchFeature, PretrainedConfig
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.linear import ReplicatedLinear
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.models.ovis import (OvisImagePatchInputs, from vllm.model_executor.models.ovis import (OvisImagePatchInputs,
...@@ -290,6 +291,7 @@ class Ovis2_5DummyInputsBuilder(BaseDummyInputsBuilder[Ovis2_5ProcessingInfo]): ...@@ -290,6 +291,7 @@ class Ovis2_5DummyInputsBuilder(BaseDummyInputsBuilder[Ovis2_5ProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
num_videos = mm_counts.get("video", 0) num_videos = mm_counts.get("video", 0)
...@@ -298,17 +300,23 @@ class Ovis2_5DummyInputsBuilder(BaseDummyInputsBuilder[Ovis2_5ProcessingInfo]): ...@@ -298,17 +300,23 @@ class Ovis2_5DummyInputsBuilder(BaseDummyInputsBuilder[Ovis2_5ProcessingInfo]):
self.info.get_image_size_with_most_features() self.info.get_image_size_with_most_features()
target_num_frames = \ target_num_frames = \
self.info.get_num_frames_with_most_features(seq_len, mm_counts) self.info.get_num_frames_with_most_features(seq_len, mm_counts)
image_overrides = mm_options.get("image") if mm_options else None
video_overrides = mm_options.get("video") if mm_options else None
mm_data = { mm_data = {
"image": "image":
self._get_dummy_images(width=target_width, self._get_dummy_images(width=target_width,
height=target_height, height=target_height,
num_images=num_images), num_images=num_images,
overrides=image_overrides),
"video": "video":
self._get_dummy_videos( self._get_dummy_videos(
width=target_width, width=target_width,
height=target_height, height=target_height,
num_frames=target_num_frames, num_frames=target_num_frames,
num_videos=num_videos, num_videos=num_videos,
overrides=video_overrides,
) )
} }
return mm_data return mm_data
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment