"vllm/vscode:/vscode.git/clone" did not exist on "35d44b45570396b28ce94186b2438dbc608fd6c0"
Unverified Commit ba2f0acc authored by Cyrus Leung's avatar Cyrus Leung Committed by GitHub
Browse files

[Misc] Reorganize inputs (#35182)


Signed-off-by: default avatarDarkLight1337 <tlleungac@connect.ust.hk>
parent 678b3c99
...@@ -19,6 +19,7 @@ from transformers.utils import torch_int ...@@ -19,6 +19,7 @@ from transformers.utils import torch_int
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.inputs import ModalityData, MultiModalDataDict
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.attention import ( from vllm.model_executor.layers.attention import (
MMEncoderAttention, MMEncoderAttention,
...@@ -41,8 +42,6 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys ...@@ -41,8 +42,6 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
ImageItem, ImageItem,
ModalityData,
MultiModalDataDict,
MultiModalFeatureSpec, MultiModalFeatureSpec,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
......
...@@ -14,13 +14,13 @@ from transformers.activations import GELUActivation ...@@ -14,13 +14,13 @@ from transformers.activations import GELUActivation
from transformers.feature_extraction_utils import BatchFeature from transformers.feature_extraction_utils import BatchFeature
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.inputs import ModalityData
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
ImageItem, ImageItem,
ModalityData,
MultiModalFeatureSpec, MultiModalFeatureSpec,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
......
...@@ -14,7 +14,7 @@ from transformers import WhisperConfig as HFWhisperConfig ...@@ -14,7 +14,7 @@ from transformers import WhisperConfig as HFWhisperConfig
from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs.data import PromptType, TokensPrompt from vllm.inputs import PromptType, TokensPrompt
from vllm.model_executor.model_loader import DefaultModelLoader from vllm.model_executor.model_loader import DefaultModelLoader
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.interfaces import ( from vllm.model_executor.models.interfaces import (
......
...@@ -16,6 +16,7 @@ from transformers import BatchFeature ...@@ -16,6 +16,7 @@ from transformers import BatchFeature
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.quantization.compressed_tensors import ( from vllm.model_executor.layers.quantization.compressed_tensors import (
...@@ -35,7 +36,6 @@ from vllm.model_executor.models.kimi_k25_vit import ( ...@@ -35,7 +36,6 @@ from vllm.model_executor.models.kimi_k25_vit import (
) )
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
NestedTensors, NestedTensors,
......
...@@ -54,12 +54,12 @@ from transformers.activations import GELUActivation ...@@ -54,12 +54,12 @@ from transformers.activations import GELUActivation
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.linear import ReplicatedLinear
from vllm.model_executor.models.interfaces import SupportsMultiModal, SupportsPP from vllm.model_executor.models.interfaces import SupportsMultiModal, SupportsPP
from vllm.model_executor.models.moonvit import MoonVitPretrainedModel from vllm.model_executor.models.moonvit import MoonVitPretrainedModel
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
NestedTensors, NestedTensors,
......
...@@ -21,6 +21,7 @@ from transformers.models.lfm2_vl.image_processing_lfm2_vl_fast import ( ...@@ -21,6 +21,7 @@ from transformers.models.lfm2_vl.image_processing_lfm2_vl_fast import (
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.forward_context import set_forward_context from vllm.forward_context import set_forward_context
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.layers.mamba.mamba_utils import ( from vllm.model_executor.layers.mamba.mamba_utils import (
MambaStateCopyFunc, MambaStateCopyFunc,
MambaStateCopyFuncCalculator, MambaStateCopyFuncCalculator,
...@@ -30,7 +31,6 @@ from vllm.model_executor.layers.mamba.mamba_utils import ( ...@@ -30,7 +31,6 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
) )
......
...@@ -20,17 +20,15 @@ from transformers.models.pixtral import PixtralProcessor ...@@ -20,17 +20,15 @@ from transformers.models.pixtral import PixtralProcessor
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict, MultiModalInput, mm_input
from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.cache import BaseMultiModalProcessorCache
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalInputs,
MultiModalKwargsItems, MultiModalKwargsItems,
mm_inputs,
) )
from vllm.multimodal.parse import ( from vllm.multimodal.parse import (
ImageEmbeddingItems, ImageEmbeddingItems,
...@@ -777,7 +775,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): ...@@ -777,7 +775,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
self, self,
inputs: ProcessorInputs, inputs: ProcessorInputs,
timing_ctx: TimingContext, timing_ctx: TimingContext,
) -> MultiModalInputs: ) -> MultiModalInput:
hf_config = self.info.get_hf_config() hf_config = self.info.get_hf_config()
image_token_id = hf_config.image_token_index image_token_id = hf_config.image_token_index
...@@ -833,7 +831,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): ...@@ -833,7 +831,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
for modality, placeholders in mm_placeholders.items() for modality, placeholders in mm_placeholders.items()
} }
return mm_inputs( return mm_input(
prompt_token_ids=prompt_ids, prompt_token_ids=prompt_ids,
mm_kwargs=mm_kwargs, mm_kwargs=mm_kwargs,
mm_hashes=mm_hashes, mm_hashes=mm_hashes,
......
...@@ -11,11 +11,11 @@ from transformers import BatchFeature, LlavaNextVideoConfig, LlavaNextVideoProce ...@@ -11,11 +11,11 @@ from transformers import BatchFeature, LlavaNextVideoConfig, LlavaNextVideoProce
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.models.clip import CLIPVisionModel
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
) )
......
...@@ -15,10 +15,10 @@ from transformers.models.llava_onevision.modeling_llava_onevision import ( ...@@ -15,10 +15,10 @@ from transformers.models.llava_onevision.modeling_llava_onevision import (
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.activation import get_act_fn
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
) )
......
...@@ -38,6 +38,7 @@ from transformers import BatchFeature ...@@ -38,6 +38,7 @@ from transformers import BatchFeature
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.conv import Conv2dLayer
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
...@@ -48,7 +49,6 @@ from vllm.model_executor.layers.linear import ( ...@@ -48,7 +49,6 @@ from vllm.model_executor.layers.linear import (
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
) )
......
...@@ -41,9 +41,9 @@ from transformers.models.whisper.modeling_whisper import ( ...@@ -41,9 +41,9 @@ from transformers.models.whisper.modeling_whisper import (
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import ModalityData, MultiModalDataDict
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
NestedTensors, NestedTensors,
) )
...@@ -51,7 +51,6 @@ from vllm.multimodal.parse import ( ...@@ -51,7 +51,6 @@ from vllm.multimodal.parse import (
AudioItem, AudioItem,
AudioProcessorItems, AudioProcessorItems,
DictEmbeddingItems, DictEmbeddingItems,
ModalityData,
ModalityDataItems, ModalityDataItems,
MultiModalDataItems, MultiModalDataItems,
) )
......
...@@ -41,6 +41,7 @@ from typing_extensions import TypeVar ...@@ -41,6 +41,7 @@ from typing_extensions import TypeVar
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import ModalityData, MultiModalDataDict
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.resampler import ( from vllm.model_executor.layers.resampler import (
BaseResampler, BaseResampler,
...@@ -54,7 +55,6 @@ from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM ...@@ -54,7 +55,6 @@ from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM
from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
NestedTensors, NestedTensors,
...@@ -64,7 +64,6 @@ from vllm.multimodal.parse import ( ...@@ -64,7 +64,6 @@ from vllm.multimodal.parse import (
ImageItem, ImageItem,
ImageProcessorItems, ImageProcessorItems,
ImageSize, ImageSize,
ModalityData,
ModalityDataItems, ModalityDataItems,
MultiModalDataItems, MultiModalDataItems,
MultiModalDataParser, MultiModalDataParser,
......
...@@ -11,6 +11,7 @@ from transformers.models.pixtral import PixtralProcessor ...@@ -11,6 +11,7 @@ from transformers.models.pixtral import PixtralProcessor
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
...@@ -18,7 +19,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig ...@@ -18,7 +19,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
) )
......
...@@ -38,6 +38,7 @@ from vllm.compilation.decorators import ( ...@@ -38,6 +38,7 @@ from vllm.compilation.decorators import (
from vllm.config import VllmConfig, set_current_vllm_config from vllm.config import VllmConfig, set_current_vllm_config
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.attention import MMEncoderAttention
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
...@@ -53,7 +54,6 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader ...@@ -53,7 +54,6 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
) )
......
...@@ -29,6 +29,7 @@ from vllm.distributed import ( ...@@ -29,6 +29,7 @@ from vllm.distributed import (
split_tensor_along_last_dim, split_tensor_along_last_dim,
tensor_model_parallel_all_gather, tensor_model_parallel_all_gather,
) )
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.layers.activation import MulAndSilu, QuickGELU, SiluAndMul from vllm.model_executor.layers.activation import MulAndSilu, QuickGELU, SiluAndMul
from vllm.model_executor.layers.attention import Attention, MMEncoderAttention from vllm.model_executor.layers.attention import Attention, MMEncoderAttention
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
...@@ -49,7 +50,6 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader ...@@ -49,7 +50,6 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
) )
......
...@@ -33,6 +33,7 @@ from vllm.distributed import ( ...@@ -33,6 +33,7 @@ from vllm.distributed import (
split_tensor_along_last_dim, split_tensor_along_last_dim,
tensor_model_parallel_all_gather, tensor_model_parallel_all_gather,
) )
from vllm.inputs import MultiModalDataDict
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.activation import MulAndSilu, SiluAndMul, get_act_fn from vllm.model_executor.layers.activation import MulAndSilu, SiluAndMul, get_act_fn
from vllm.model_executor.layers.attention import Attention, MMEncoderAttention from vllm.model_executor.layers.attention import Attention, MMEncoderAttention
...@@ -54,7 +55,6 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader ...@@ -54,7 +55,6 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
VideoItem, VideoItem,
......
...@@ -21,6 +21,7 @@ from transformers import BatchFeature ...@@ -21,6 +21,7 @@ from transformers import BatchFeature
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
from vllm.inputs import MultiModalDataDict, MultiModalInput
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.activation import ReLUSquaredActivation from vllm.model_executor.layers.activation import ReLUSquaredActivation
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
...@@ -48,9 +49,7 @@ from vllm.multimodal.evs import ( ...@@ -48,9 +49,7 @@ from vllm.multimodal.evs import (
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
AudioItem, AudioItem,
BatchedTensorInputs, BatchedTensorInputs,
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalInputs,
MultiModalKwargsItems, MultiModalKwargsItems,
VideoItem, VideoItem,
) )
...@@ -576,7 +575,7 @@ class NanoNemotronVLMultiModalProcessor( ...@@ -576,7 +575,7 @@ class NanoNemotronVLMultiModalProcessor(
self, self,
inputs: ProcessorInputs, inputs: ProcessorInputs,
timing_ctx: TimingContext, timing_ctx: TimingContext,
) -> MultiModalInputs: ) -> MultiModalInput:
use_audio_in_video = bool( use_audio_in_video = bool(
inputs.hf_processor_mm_kwargs.get("use_audio_in_video", False) inputs.hf_processor_mm_kwargs.get("use_audio_in_video", False)
) )
...@@ -632,7 +631,7 @@ class NanoNemotronVLMultiModalProcessor( ...@@ -632,7 +631,7 @@ class NanoNemotronVLMultiModalProcessor(
for modality, placeholders in mm_placeholders.items() for modality, placeholders in mm_placeholders.items()
} }
return MultiModalInputs( return MultiModalInput(
type="multimodal", type="multimodal",
prompt_token_ids=prompt_ids, prompt_token_ids=prompt_ids,
mm_kwargs=mm_info.kwargs, mm_kwargs=mm_info.kwargs,
......
...@@ -23,6 +23,7 @@ from transformers import ( ...@@ -23,6 +23,7 @@ from transformers import (
from vllm.config import CacheConfig, VllmConfig from vllm.config import CacheConfig, VllmConfig
from vllm.config.lora import LoRAConfig from vllm.config.lora import LoRAConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
...@@ -41,7 +42,6 @@ from vllm.model_executor.models.radio import RadioModel ...@@ -41,7 +42,6 @@ from vllm.model_executor.models.radio import RadioModel
from vllm.model_executor.models.whisper import WhisperAttention, WhisperCrossAttention from vllm.model_executor.models.whisper import WhisperAttention, WhisperCrossAttention
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
) )
......
...@@ -14,12 +14,10 @@ import torch.nn as nn ...@@ -14,12 +14,10 @@ import torch.nn as nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import BatchedTensorInputs
BatchedTensorInputs,
MultiModalDataDict,
)
from vllm.multimodal.parse import ( from vllm.multimodal.parse import (
ImageEmbeddingItems, ImageEmbeddingItems,
ImageProcessorItems, ImageProcessorItems,
......
...@@ -30,6 +30,7 @@ from transformers import BatchFeature, PretrainedConfig ...@@ -30,6 +30,7 @@ from transformers import BatchFeature, PretrainedConfig
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.linear import ReplicatedLinear
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.models.aimv2 import AIMv2Model from vllm.model_executor.models.aimv2 import AIMv2Model
...@@ -42,7 +43,6 @@ from vllm.model_executor.models.utils import ( ...@@ -42,7 +43,6 @@ from vllm.model_executor.models.utils import (
) )
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment