"vscode:/vscode.git/clone" did not exist on "fed5849d3fd7a5e7454cf87f101a18c2bad0436f"
Unverified commit 79f05e44, authored by Roger Wang, committed by GitHub
Browse files

[Multimodal] Always enable hashing mm data (#23308)


Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent f8daddcc
...@@ -1685,15 +1685,6 @@ class ModelConfig: ...@@ -1685,15 +1685,6 @@ class ModelConfig:
def is_multimodal_model(self) -> bool: def is_multimodal_model(self) -> bool:
return self.multimodal_config is not None return self.multimodal_config is not None
@property
def processor_return_mm_hashes(self) -> bool:
    """Whether the multi-modal processor should output hashes.

    Returns:
        ``False`` when ``self.multimodal_config`` is ``None``; otherwise
        ``True`` exactly when the config's ``mm_processor_cache_gb`` is
        greater than zero.
    """
    mm_config = self.multimodal_config
    if mm_config is None:
        # No multi-modal config is attached, so there is nothing to hash.
        return False

    # Hashes are requested only when the processor cache has a non-zero size.
    return mm_config.mm_processor_cache_gb > 0
@property @property
def enable_mm_processor_cache(self) -> bool: def enable_mm_processor_cache(self) -> bool:
"""Whether the multi-modal processor cache should be enabled.""" """Whether the multi-modal processor cache should be enabled."""
......
...@@ -254,7 +254,6 @@ class InputPreprocessor: ...@@ -254,7 +254,6 @@ class InputPreprocessor:
mm_processor_kwargs: Optional[Mapping[str, object]], mm_processor_kwargs: Optional[Mapping[str, object]],
tokenization_kwargs: Optional[dict[str, Any]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> MultiModalInputs: ) -> MultiModalInputs:
""" """
Apply the model's multi-modal processor to a multi-modal prompt, Apply the model's multi-modal processor to a multi-modal prompt,
...@@ -271,8 +270,7 @@ class InputPreprocessor: ...@@ -271,8 +270,7 @@ class InputPreprocessor:
return mm_processor.apply(prompt, return mm_processor.apply(prompt,
mm_data, mm_data,
hf_processor_mm_kwargs=mm_processor_kwargs, hf_processor_mm_kwargs=mm_processor_kwargs,
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs)
return_mm_hashes=return_mm_hashes)
async def _process_multimodal_async( async def _process_multimodal_async(
self, self,
...@@ -281,7 +279,6 @@ class InputPreprocessor: ...@@ -281,7 +279,6 @@ class InputPreprocessor:
mm_processor_kwargs: Optional[Mapping[str, object]], mm_processor_kwargs: Optional[Mapping[str, object]],
tokenization_kwargs: Optional[dict[str, Any]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> MultiModalInputs: ) -> MultiModalInputs:
""" """
Async version of Async version of
...@@ -297,8 +294,7 @@ class InputPreprocessor: ...@@ -297,8 +294,7 @@ class InputPreprocessor:
return mm_processor.apply(prompt, return mm_processor.apply(prompt,
mm_data, mm_data,
hf_processor_mm_kwargs=mm_processor_kwargs, hf_processor_mm_kwargs=mm_processor_kwargs,
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs)
return_mm_hashes=return_mm_hashes)
def _process_embeds( def _process_embeds(
self, self,
...@@ -335,7 +331,6 @@ class InputPreprocessor: ...@@ -335,7 +331,6 @@ class InputPreprocessor:
parsed_content: TokensPrompt, parsed_content: TokensPrompt,
tokenization_kwargs: Optional[dict[str, Any]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> Union[TokenInputs, MultiModalInputs]: ) -> Union[TokenInputs, MultiModalInputs]:
prompt_token_ids = parsed_content["prompt_token_ids"] prompt_token_ids = parsed_content["prompt_token_ids"]
token_type_ids = parsed_content.get("token_type_ids") token_type_ids = parsed_content.get("token_type_ids")
...@@ -348,7 +343,6 @@ class InputPreprocessor: ...@@ -348,7 +343,6 @@ class InputPreprocessor:
parsed_content.get("mm_processor_kwargs"), parsed_content.get("mm_processor_kwargs"),
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request, lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
) )
else: else:
inputs = token_inputs( inputs = token_inputs(
...@@ -366,7 +360,6 @@ class InputPreprocessor: ...@@ -366,7 +360,6 @@ class InputPreprocessor:
parsed_content: TokensPrompt, parsed_content: TokensPrompt,
tokenization_kwargs: Optional[dict[str, Any]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> Union[TokenInputs, MultiModalInputs]: ) -> Union[TokenInputs, MultiModalInputs]:
prompt_token_ids = parsed_content["prompt_token_ids"] prompt_token_ids = parsed_content["prompt_token_ids"]
token_type_ids = parsed_content.get("token_type_ids") token_type_ids = parsed_content.get("token_type_ids")
...@@ -379,7 +372,6 @@ class InputPreprocessor: ...@@ -379,7 +372,6 @@ class InputPreprocessor:
parsed_content.get("mm_processor_kwargs"), parsed_content.get("mm_processor_kwargs"),
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request, lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
) )
else: else:
inputs = token_inputs( inputs = token_inputs(
...@@ -397,7 +389,6 @@ class InputPreprocessor: ...@@ -397,7 +389,6 @@ class InputPreprocessor:
parsed_content: TextPrompt, parsed_content: TextPrompt,
tokenization_kwargs: Optional[dict[str, Any]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> Union[TokenInputs, MultiModalInputs]: ) -> Union[TokenInputs, MultiModalInputs]:
prompt_text = parsed_content["prompt"] prompt_text = parsed_content["prompt"]
...@@ -409,7 +400,6 @@ class InputPreprocessor: ...@@ -409,7 +400,6 @@ class InputPreprocessor:
parsed_content.get("mm_processor_kwargs"), parsed_content.get("mm_processor_kwargs"),
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request, lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
) )
else: else:
prompt_token_ids = self._tokenize_prompt( prompt_token_ids = self._tokenize_prompt(
...@@ -432,7 +422,6 @@ class InputPreprocessor: ...@@ -432,7 +422,6 @@ class InputPreprocessor:
parsed_content: TextPrompt, parsed_content: TextPrompt,
tokenization_kwargs: Optional[dict[str, Any]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> Union[TokenInputs, MultiModalInputs]: ) -> Union[TokenInputs, MultiModalInputs]:
prompt_text = parsed_content["prompt"] prompt_text = parsed_content["prompt"]
...@@ -444,7 +433,6 @@ class InputPreprocessor: ...@@ -444,7 +433,6 @@ class InputPreprocessor:
parsed_content.get("mm_processor_kwargs"), parsed_content.get("mm_processor_kwargs"),
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request, lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
) )
else: else:
prompt_token_ids = await self._tokenize_prompt_async( prompt_token_ids = await self._tokenize_prompt_async(
...@@ -467,7 +455,6 @@ class InputPreprocessor: ...@@ -467,7 +455,6 @@ class InputPreprocessor:
prompt: SingletonPrompt, prompt: SingletonPrompt,
tokenization_kwargs: Optional[dict[str, Any]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> SingletonInputs: ) -> SingletonInputs:
""" """
Extract the singleton inputs from a prompt. Extract the singleton inputs from a prompt.
...@@ -476,7 +463,6 @@ class InputPreprocessor: ...@@ -476,7 +463,6 @@ class InputPreprocessor:
* prompt: single encoder or decoder input prompt * prompt: single encoder or decoder input prompt
* lora_request: this is only valid for decoder prompts * lora_request: this is only valid for decoder prompts
* return_mm_hashes: whether to return multimodal hashes
Returns: Returns:
...@@ -490,21 +476,18 @@ class InputPreprocessor: ...@@ -490,21 +476,18 @@ class InputPreprocessor:
return self._process_tokens( return self._process_tokens(
parsed["content"], parsed["content"],
lora_request=lora_request, lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
) )
if parsed["type"] == "text": if parsed["type"] == "text":
return self._process_text( return self._process_text(
parsed["content"], parsed["content"],
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request, lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
) )
if parsed["type"] == "str": if parsed["type"] == "str":
return self._process_text( return self._process_text(
TextPrompt(prompt=parsed["content"]), TextPrompt(prompt=parsed["content"]),
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request, lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
) )
assert_never(parsed) assert_never(parsed)
...@@ -514,7 +497,6 @@ class InputPreprocessor: ...@@ -514,7 +497,6 @@ class InputPreprocessor:
prompt: SingletonPrompt, prompt: SingletonPrompt,
tokenization_kwargs: Optional[dict[str, Any]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> SingletonInputs: ) -> SingletonInputs:
""" """
Async version of Async version of
...@@ -528,21 +510,18 @@ class InputPreprocessor: ...@@ -528,21 +510,18 @@ class InputPreprocessor:
return await self._process_tokens_async( return await self._process_tokens_async(
parsed["content"], parsed["content"],
lora_request=lora_request, lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
) )
if parsed["type"] == "text": if parsed["type"] == "text":
return await self._process_text_async( return await self._process_text_async(
parsed["content"], parsed["content"],
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request, lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
) )
if parsed["type"] == "str": if parsed["type"] == "str":
return await self._process_text_async( return await self._process_text_async(
TextPrompt(prompt=parsed["content"]), TextPrompt(prompt=parsed["content"]),
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request, lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
) )
assert_never(parsed) assert_never(parsed)
...@@ -785,7 +764,6 @@ class InputPreprocessor: ...@@ -785,7 +764,6 @@ class InputPreprocessor:
prompt: SingletonPrompt, prompt: SingletonPrompt,
tokenization_kwargs: Optional[dict[str, Any]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> DecoderOnlyInputs: ) -> DecoderOnlyInputs:
""" """
For decoder-only models: For decoder-only models:
...@@ -796,7 +774,6 @@ class InputPreprocessor: ...@@ -796,7 +774,6 @@ class InputPreprocessor:
* prompt: input prompt * prompt: input prompt
* lora_request * lora_request
* return_mm_hashes
Returns: Returns:
...@@ -807,7 +784,6 @@ class InputPreprocessor: ...@@ -807,7 +784,6 @@ class InputPreprocessor:
prompt, prompt,
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request, lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
) )
return self._build_decoder_only_llm_inputs(prompt_comps) return self._build_decoder_only_llm_inputs(prompt_comps)
...@@ -817,7 +793,6 @@ class InputPreprocessor: ...@@ -817,7 +793,6 @@ class InputPreprocessor:
prompt: SingletonPrompt, prompt: SingletonPrompt,
tokenization_kwargs: Optional[dict[str, Any]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> DecoderOnlyInputs: ) -> DecoderOnlyInputs:
""" """
Async version of Async version of
...@@ -827,7 +802,6 @@ class InputPreprocessor: ...@@ -827,7 +802,6 @@ class InputPreprocessor:
prompt, prompt,
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request, lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
) )
return self._build_decoder_only_llm_inputs(prompt_comps) return self._build_decoder_only_llm_inputs(prompt_comps)
...@@ -837,17 +811,15 @@ class InputPreprocessor: ...@@ -837,17 +811,15 @@ class InputPreprocessor:
prompt: PromptType, prompt: PromptType,
tokenization_kwargs: Optional[dict[str, Any]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> ProcessorInputs: ) -> ProcessorInputs:
"""Preprocess the input prompt.""" """Preprocess the input prompt."""
if self.model_config.is_encoder_decoder: if self.model_config.is_encoder_decoder:
assert not return_mm_hashes, (
"Multimodal hashes for encoder-decoder models should not be ",
"returned until they are supported on vLLM V1.")
# Encoder-decoder model requires special mapping of # Encoder-decoder model requires special mapping of
# input prompts to encoder & decoder # input prompts to encoder & decoder.
return self._process_encoder_decoder_prompt( return self._process_encoder_decoder_prompt(
prompt, tokenization_kwargs) prompt,
tokenization_kwargs,
)
if is_explicit_encoder_decoder_prompt(prompt): if is_explicit_encoder_decoder_prompt(prompt):
raise ValueError("Cannot pass encoder-decoder prompt " raise ValueError("Cannot pass encoder-decoder prompt "
...@@ -858,7 +830,6 @@ class InputPreprocessor: ...@@ -858,7 +830,6 @@ class InputPreprocessor:
prompt, prompt,
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request, lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
) )
async def preprocess_async( async def preprocess_async(
...@@ -866,19 +837,18 @@ class InputPreprocessor: ...@@ -866,19 +837,18 @@ class InputPreprocessor:
prompt: PromptType, prompt: PromptType,
tokenization_kwargs: Optional[dict[str, Any]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> ProcessorInputs: ) -> ProcessorInputs:
""" """
Async version of Async version of
[`preprocess`][vllm.inputs.preprocess.InputPreprocessor.preprocess]. [`preprocess`][vllm.inputs.preprocess.InputPreprocessor.preprocess].
""" """
if self.model_config.is_encoder_decoder: if self.model_config.is_encoder_decoder:
assert not return_mm_hashes, (
"Multimodal hashes for encoder-decoder models should not be ",
"returned until they are supported on vLLM V1.")
# Encoder-decoder model requires special mapping of # Encoder-decoder model requires special mapping of
# input prompts to encoder & decoder # input prompts to encoder & decoder.
return await self._process_encoder_decoder_prompt_async(prompt) return await self._process_encoder_decoder_prompt_async(
prompt,
tokenization_kwargs,
)
if is_explicit_encoder_decoder_prompt(prompt): if is_explicit_encoder_decoder_prompt(prompt):
raise ValueError("Cannot pass encoder-decoder prompt " raise ValueError("Cannot pass encoder-decoder prompt "
...@@ -889,5 +859,4 @@ class InputPreprocessor: ...@@ -889,5 +859,4 @@ class InputPreprocessor:
prompt, prompt,
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request, lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
) )
...@@ -290,8 +290,6 @@ class DeepseekVL2MultiModalProcessor( ...@@ -290,8 +290,6 @@ class DeepseekVL2MultiModalProcessor(
mm_data_items: MultiModalDataItems, mm_data_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object],
*,
return_mm_hashes: bool,
) -> tuple[list[int], MultiModalProcessingInfo, bool]: ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
# The processor logic is different for len(images) <= 2 vs > 2 # The processor logic is different for len(images) <= 2 vs > 2
# Since the processing cache assumes that the processor output is # Since the processing cache assumes that the processor output is
...@@ -303,7 +301,6 @@ class DeepseekVL2MultiModalProcessor( ...@@ -303,7 +301,6 @@ class DeepseekVL2MultiModalProcessor(
mm_data_items=mm_data_items, mm_data_items=mm_data_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs, hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
return_mm_hashes=return_mm_hashes,
) )
return super()._cached_apply_hf_processor( return super()._cached_apply_hf_processor(
...@@ -311,7 +308,6 @@ class DeepseekVL2MultiModalProcessor( ...@@ -311,7 +308,6 @@ class DeepseekVL2MultiModalProcessor(
mm_data_items=mm_data_items, mm_data_items=mm_data_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs, hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
return_mm_hashes=return_mm_hashes,
) )
......
...@@ -479,8 +479,6 @@ class H2OVLMultiModalProcessor( ...@@ -479,8 +479,6 @@ class H2OVLMultiModalProcessor(
mm_data_items: MultiModalDataItems, mm_data_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object],
*,
return_mm_hashes: bool,
) -> tuple[list[int], MultiModalProcessingInfo, bool]: ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
# The processor logic is different for len(images) <= 1 vs > 1 # The processor logic is different for len(images) <= 1 vs > 1
# Since the processing cache assumes that the processor output is # Since the processing cache assumes that the processor output is
...@@ -492,7 +490,6 @@ class H2OVLMultiModalProcessor( ...@@ -492,7 +490,6 @@ class H2OVLMultiModalProcessor(
mm_data_items=mm_data_items, mm_data_items=mm_data_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs, hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
return_mm_hashes=return_mm_hashes,
) )
return super()._cached_apply_hf_processor( return super()._cached_apply_hf_processor(
...@@ -500,7 +497,6 @@ class H2OVLMultiModalProcessor( ...@@ -500,7 +497,6 @@ class H2OVLMultiModalProcessor(
mm_data_items=mm_data_items, mm_data_items=mm_data_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs, hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
return_mm_hashes=return_mm_hashes,
) )
......
...@@ -795,7 +795,6 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): ...@@ -795,7 +795,6 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
mm_data: MultiModalDataDict, mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Optional[Mapping[str, object]] = None, tokenization_kwargs: Optional[Mapping[str, object]] = None,
return_mm_hashes: bool = False,
) -> MultiModalInputs: ) -> MultiModalInputs:
hf_config = self.info.get_hf_config() hf_config = self.info.get_hf_config()
image_token_id = hf_config.image_token_index image_token_id = hf_config.image_token_index
...@@ -807,7 +806,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor): ...@@ -807,7 +806,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
) )
result = super().apply(prompt, mm_data, hf_processor_mm_kwargs, result = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
tokenization_kwargs, return_mm_hashes) tokenization_kwargs)
mm_items = self._to_mm_items(mm_data) mm_items = self._to_mm_items(mm_data)
mm_item_counts = mm_items.get_all_counts() mm_item_counts = mm_items.get_all_counts()
......
...@@ -168,10 +168,9 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo] ...@@ -168,10 +168,9 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
mm_data: MultiModalDataDict, mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Optional[Mapping[str, object]] = None, tokenization_kwargs: Optional[Mapping[str, object]] = None,
return_mm_hashes: bool = False,
) -> MultiModalEncDecInputs: ) -> MultiModalEncDecInputs:
mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs, mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
tokenization_kwargs, return_mm_hashes) tokenization_kwargs)
image_token_id = self.info.get_hf_config().image_token_index image_token_id = self.info.get_hf_config().image_token_index
# Check that the number of image tokens in the decoder prompt matches # Check that the number of image tokens in the decoder prompt matches
......
...@@ -194,10 +194,9 @@ class PaliGemmaMultiModalProcessor( ...@@ -194,10 +194,9 @@ class PaliGemmaMultiModalProcessor(
mm_data: MultiModalDataDict, mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Optional[Mapping[str, object]] = None, tokenization_kwargs: Optional[Mapping[str, object]] = None,
return_mm_hashes: bool = False,
) -> MultiModalInputs: ) -> MultiModalInputs:
mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs, mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
tokenization_kwargs, return_mm_hashes) tokenization_kwargs)
prompt_token_ids = mm_inputs["prompt_token_ids"] prompt_token_ids = mm_inputs["prompt_token_ids"]
tokenizer = self.info.get_tokenizer() tokenizer = self.info.get_tokenizer()
......
...@@ -308,15 +308,12 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo] ...@@ -308,15 +308,12 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]
mm_data_items: MultiModalDataItems, mm_data_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object],
*,
return_mm_hashes: bool,
) -> tuple[list[int], MultiModalProcessingInfo, bool]: ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
prompt_ids, mm_info, _ = super()._cached_apply_hf_processor( prompt_ids, mm_info, _ = super()._cached_apply_hf_processor(
prompt=prompt, prompt=prompt,
mm_data_items=mm_data_items, mm_data_items=mm_data_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs, hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
return_mm_hashes=return_mm_hashes,
) )
# NOTE: The tokens are already inserted by the chat template # NOTE: The tokens are already inserted by the chat template
......
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
"""Inference-only IBM/NASA Prithvi Geospatial model.""" """Inference-only IBM/NASA Prithvi Geospatial model."""
from collections.abc import Iterable, Mapping, Sequence from collections.abc import Iterable, Mapping, Sequence
from typing import Optional, Union from typing import Any, Optional, Union
import torch import torch
import torch.nn as nn import torch.nn as nn
...@@ -32,18 +32,56 @@ from vllm.model_executor.models.interfaces import ( ...@@ -32,18 +32,56 @@ from vllm.model_executor.models.interfaces import (
default_pooling_type) default_pooling_type)
from vllm.model_executor.models.utils import AutoWeightsLoader from vllm.model_executor.models.utils import AutoWeightsLoader
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (ImageItem, ModalityData,
MultiModalFieldElem, MultiModalInputs, MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargsItem, MultiModalInputs, MultiModalKwargsItems,
MultiModalKwargsItems, PlaceholderRange)
MultiModalSharedField, PlaceholderRange) from vllm.multimodal.parse import (DictEmbeddingItems, ModalityDataItems,
from vllm.multimodal.parse import MultiModalDataItems MultiModalDataItems, MultiModalDataParser)
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptUpdate) BaseProcessingInfo, PromptUpdate)
from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
def _prithvi_field_config(hf_inputs: Mapping[str, torch.Tensor]):
    """Return the multi-modal field configuration for Prithvi image inputs.

    This model receives in input a multi-dimensional tensor representing
    a single image patch and therefore it is not to be split into multiple
    elements, but rather to be considered a single one. Hence, the decision
    of using a shared field. The expected shape is
    (num_channels, width, height).

    The model also allows the user to submit multiple image patches as a
    batch, adding a further dimension to the above shape. At this stage only
    one patch per request is supported and batching is achieved via vLLM
    batching.

    Args:
        hf_inputs: The processed inputs. They are not inspected here; the
            returned configuration is the same for every call.

    Returns:
        A dict mapping ``"pixel_values"`` and ``"location_coords"`` to a
        shared field config with ``batch_size=1`` and modality ``"image"``.
    """
    # TODO (christian-pinto): enable support for multi patch requests
    # in tandem with vLLM batching.
    return dict(
        pixel_values=MultiModalFieldConfig.shared(batch_size=1,
                                                  modality="image"),
        location_coords=MultiModalFieldConfig.shared(batch_size=1,
                                                     modality="image"),
    )
class PrithviGeoSpatialMAEMultiModalDataParser(MultiModalDataParser):
    """Data parser that accepts Prithvi image inputs given as a dict."""

    def _parse_image_data(
        self,
        data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
    ) -> Optional[ModalityDataItems[Any, Any]]:
        """Parse the ``image`` modality data.

        Args:
            data: Either a dict of tensors or regular image modality data.

        Returns:
            For dict input, a ``DictEmbeddingItems`` that requires the
            ``pixel_values`` and ``location_coords`` fields and uses
            ``_prithvi_field_config`` as its fields factory. Any other
            input is delegated to the parent parser.
        """
        if isinstance(data, dict):
            return DictEmbeddingItems(
                data,
                modality="image",
                required_fields={"pixel_values", "location_coords"},
                fields_factory=_prithvi_field_config,
            )

        # Non-dict inputs follow the default image parsing path.
        return super()._parse_image_data(data)
class PrithviGeoSpatialMAEProcessingInfo(BaseProcessingInfo): class PrithviGeoSpatialMAEProcessingInfo(BaseProcessingInfo):
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
...@@ -64,26 +102,26 @@ class PrithviGeoSpatialMAEInputBuilder( ...@@ -64,26 +102,26 @@ class PrithviGeoSpatialMAEInputBuilder(
# This model input is fixed and is in the form of a torch Tensor. # This model input is fixed and is in the form of a torch Tensor.
# The size of pixel_values might change in the cases where we resize # The size of pixel_values might change in the cases where we resize
# the input but never exceeds the dimensions below. # the input but never exceeds the dimensions below.
return { image_data = {
"pixel_values": torch.full((6, 512, 512), 1.0, "pixel_values": torch.full((6, 512, 512), 1.0,
dtype=torch.float16), dtype=torch.float16),
"location_coords": torch.full((1, 2), 1.0, dtype=torch.float16), "location_coords": torch.full((1, 2), 1.0, dtype=torch.float16),
} }
return {"image": image_data}
class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor): class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor):
def _get_data_parser(self) -> MultiModalDataParser:
    """Return a new ``PrithviGeoSpatialMAEMultiModalDataParser``."""
    return PrithviGeoSpatialMAEMultiModalDataParser()
def _get_mm_fields_config( def _get_mm_fields_config(
self, self,
hf_inputs: BatchFeature, hf_inputs: BatchFeature,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]: ) -> Mapping[str, MultiModalFieldConfig]:
return dict( return _prithvi_field_config(hf_inputs)
pixel_values=MultiModalFieldConfig.shared(batch_size=1,
modality="image"),
location_coords=MultiModalFieldConfig.shared(batch_size=1,
modality="image"),
)
def _get_prompt_updates( def _get_prompt_updates(
self, self,
...@@ -99,46 +137,32 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor): ...@@ -99,46 +137,32 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor):
mm_data: MultiModalDataDict, mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Optional[Mapping[str, object]] = None, tokenization_kwargs: Optional[Mapping[str, object]] = None,
return_mm_hashes: bool = False,
) -> MultiModalInputs: ) -> MultiModalInputs:
mm_kwargs = {} if "image" in mm_data:
image_data = mm_data["image"]
for k, v in mm_data.items():
if isinstance(v, dict) and k == "image":
mm_kwargs.update(v)
else: else:
mm_kwargs[k] = v image_data = mm_data
mm_data = {"image": mm_data}
mm_items = self._to_mm_items(mm_data)
mm_hashes = self._hash_mm_items(mm_items, hf_processor_mm_kwargs,
tokenization_kwargs or {})
mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]} mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]}
# This model receives in input a multi-dimensional tensor representing mm_processed_data = BatchFeature(image_data)
# a single image patch and therefore it is not to be split
# into multiple elements, but rather to be considered a single one.
# Hence, the decision of using a MultiModalSharedField.
# The expected shape is (num_channels, width, height).
# This model however allows the user to also submit multiple image mm_kwargs = MultiModalKwargsItems.from_hf_inputs(
# patches as a batch, adding a further dimension to the above shape. mm_processed_data,
# At this stage we only support submitting one patch per request and self._get_mm_fields_config(mm_processed_data,
# batching is achieved via vLLM batching. hf_processor_mm_kwargs),
# TODO (christian-pinto): enable support for multi patch requests )
# in tandem with vLLM batching.
multimodal_kwargs_items = [
MultiModalKwargsItem.from_elems([
MultiModalFieldElem(
modality="image",
key=key,
data=data,
field=MultiModalSharedField(1),
) for key, data in mm_kwargs.items()
])
]
return MultiModalInputs( return MultiModalInputs(
type="multimodal", type="multimodal",
prompt=prompt, prompt=prompt,
prompt_token_ids=[1], prompt_token_ids=[1],
mm_kwargs=MultiModalKwargsItems.from_seq(multimodal_kwargs_items), mm_kwargs=mm_kwargs,
mm_hashes=None, mm_hashes=mm_hashes,
mm_placeholders=mm_placeholders, mm_placeholders=mm_placeholders,
) )
......
...@@ -310,7 +310,6 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]): ...@@ -310,7 +310,6 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
mm_data: MultiModalDataDict, mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Optional[Mapping[str, object]] = None, tokenization_kwargs: Optional[Mapping[str, object]] = None,
return_mm_hashes: bool = False,
) -> MultiModalInputs: ) -> MultiModalInputs:
""" """
Process multi-modal inputs to be used in vLLM. Process multi-modal inputs to be used in vLLM.
......
...@@ -288,15 +288,12 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo] ...@@ -288,15 +288,12 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo]
mm_data_items: MultiModalDataItems, mm_data_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object],
*,
return_mm_hashes: bool,
) -> tuple[list[int], MultiModalProcessingInfo, bool]: ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
prompt_ids, mm_info, _ = super()._cached_apply_hf_processor( prompt_ids, mm_info, _ = super()._cached_apply_hf_processor(
prompt=prompt, prompt=prompt,
mm_data_items=mm_data_items, mm_data_items=mm_data_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs, hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
return_mm_hashes=return_mm_hashes,
) )
# NOTE: The tokens are already inserted by the chat template # NOTE: The tokens are already inserted by the chat template
......
...@@ -43,7 +43,7 @@ class MultiModalHasher: ...@@ -43,7 +43,7 @@ class MultiModalHasher:
return cls.item_to_bytes( return cls.item_to_bytes(
"image", np.asarray(convert_image_mode(obj, "RGBA"))) "image", np.asarray(convert_image_mode(obj, "RGBA")))
if isinstance(obj, torch.Tensor): if isinstance(obj, torch.Tensor):
return cls.item_to_bytes("tensor", obj.numpy()) return cls.item_to_bytes("tensor", obj.cpu().numpy())
if isinstance(obj, np.ndarray): if isinstance(obj, np.ndarray):
# If the array is non-contiguous, we need to copy it first # If the array is non-contiguous, we need to copy it first
arr_data = obj.data if obj.flags.c_contiguous else obj.tobytes() arr_data = obj.data if obj.flags.c_contiguous else obj.tobytes()
......
...@@ -901,7 +901,7 @@ class MultiModalInputs(TypedDict): ...@@ -901,7 +901,7 @@ class MultiModalInputs(TypedDict):
mm_kwargs: MultiModalKwargsItems mm_kwargs: MultiModalKwargsItems
"""Keyword arguments to be directly passed to the model after batching.""" """Keyword arguments to be directly passed to the model after batching."""
mm_hashes: Optional["MultiModalHashDict"] mm_hashes: "MultiModalHashDict"
"""The hashes of the multi-modal data.""" """The hashes of the multi-modal data."""
mm_placeholders: "MultiModalPlaceholderDict" mm_placeholders: "MultiModalPlaceholderDict"
......
...@@ -998,7 +998,7 @@ A collection of prompt updates with a similar structure as ...@@ -998,7 +998,7 @@ A collection of prompt updates with a similar structure as
class MultiModalProcessingInfo(NamedTuple): class MultiModalProcessingInfo(NamedTuple):
kwargs: MultiModalKwargsItems kwargs: MultiModalKwargsItems
hashes: Optional[MultiModalHashes] hashes: MultiModalHashes
prompt_updates: MultiModalPromptUpdates prompt_updates: MultiModalPromptUpdates
...@@ -1399,8 +1399,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): ...@@ -1399,8 +1399,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
mm_data_items: MultiModalDataItems, mm_data_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object],
*,
return_mm_hashes: bool,
) -> tuple[list[int], MultiModalProcessingInfo, bool]: ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
( (
prompt_ids, prompt_ids,
...@@ -1420,9 +1418,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): ...@@ -1420,9 +1418,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
hf_processor_mm_kwargs), hf_processor_mm_kwargs),
) )
mm_hashes = (self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs, mm_hashes = self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs,
tokenization_kwargs) tokenization_kwargs)
if return_mm_hashes else None)
unbound_prompt_updates = self._get_prompt_updates( unbound_prompt_updates = self._get_prompt_updates(
mm_data_items, mm_data_items,
...@@ -1446,8 +1443,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): ...@@ -1446,8 +1443,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
mm_data_items: MultiModalDataItems, mm_data_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object],
*,
return_mm_hashes: bool,
) -> tuple[list[int], MultiModalProcessingInfo, bool]: ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
""" """
Apply the HF processor on the full prompt text, Apply the HF processor on the full prompt text,
...@@ -1462,7 +1457,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): ...@@ -1462,7 +1457,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
mm_data_items=mm_data_items, mm_data_items=mm_data_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs, hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
return_mm_hashes=return_mm_hashes,
) )
mm_hashes = self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs, mm_hashes = self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs,
...@@ -1476,8 +1470,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): ...@@ -1476,8 +1470,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
mm_hashes=mm_hashes, mm_hashes=mm_hashes,
) )
mm_hashes_to_return = mm_hashes if return_mm_hashes else None
# NOTE: `prompt` does not correspond to `mm_missing_data_items`, # NOTE: `prompt` does not correspond to `mm_missing_data_items`,
# so we can't apply prompt updates until the new multimodal # so we can't apply prompt updates until the new multimodal
# items are combined with the cached multimodal items # items are combined with the cached multimodal items
...@@ -1515,7 +1507,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): ...@@ -1515,7 +1507,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
mm_info = MultiModalProcessingInfo( mm_info = MultiModalProcessingInfo(
kwargs=mm_kwargs, kwargs=mm_kwargs,
hashes=mm_hashes_to_return, hashes=mm_hashes,
prompt_updates=mm_prompt_updates, prompt_updates=mm_prompt_updates,
) )
...@@ -1697,7 +1689,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): ...@@ -1697,7 +1689,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
mm_data: MultiModalDataDict, mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Optional[Mapping[str, object]] = None, tokenization_kwargs: Optional[Mapping[str, object]] = None,
return_mm_hashes: bool = False,
) -> MultiModalInputs: ) -> MultiModalInputs:
""" """
Process multi-modal inputs to be used in vLLM. Process multi-modal inputs to be used in vLLM.
...@@ -1726,7 +1717,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): ...@@ -1726,7 +1717,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
mm_items, mm_items,
hf_processor_mm_kwargs, hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
return_mm_hashes=return_mm_hashes,
) )
# NOTE: tokenization_kwargs are not required to init processor # NOTE: tokenization_kwargs are not required to init processor
...@@ -1811,7 +1801,6 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]): ...@@ -1811,7 +1801,6 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
mm_data: MultiModalDataDict, mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Optional[Mapping[str, object]] = None, tokenization_kwargs: Optional[Mapping[str, object]] = None,
return_mm_hashes: bool = False,
) -> MultiModalEncDecInputs: ) -> MultiModalEncDecInputs:
""" """
Process multi-modal inputs to be used in vLLM. Process multi-modal inputs to be used in vLLM.
...@@ -1826,7 +1815,6 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]): ...@@ -1826,7 +1815,6 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
mm_data, mm_data,
hf_processor_mm_kwargs, hf_processor_mm_kwargs,
tokenization_kwargs, tokenization_kwargs,
return_mm_hashes,
) )
return self._get_enc_dec_inputs( return self._get_enc_dec_inputs(
......
...@@ -17,7 +17,6 @@ from vllm.multimodal.utils import argsort_mm_positions ...@@ -17,7 +17,6 @@ from vllm.multimodal.utils import argsort_mm_positions
from vllm.pooling_params import PoolingParams from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer_group import TokenizerGroup from vllm.transformers_utils.tokenizer_group import TokenizerGroup
from vllm.utils import is_list_of
from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.mm_input_cache import MultiModalInputCacheClient from vllm.v1.engine.mm_input_cache import MultiModalInputCacheClient
from vllm.v1.structured_output.backend_guidance import ( from vllm.v1.structured_output.backend_guidance import (
...@@ -253,13 +252,10 @@ class Processor: ...@@ -253,13 +252,10 @@ class Processor:
# 1. Tokenize text prompt, with LoRA request if one exists. # 1. Tokenize text prompt, with LoRA request if one exists.
# 2. For multimodal models with a merged preprocessor, preprocess # 2. For multimodal models with a merged preprocessor, preprocess
# multimodal data and expand prompt token ids accordingly. # multimodal data and expand prompt token ids accordingly.
return_mm_hashes = (self.model_config.processor_return_mm_hashes
or bool(self.cache_config.enable_prefix_caching))
processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess( processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
prompt, prompt,
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request, lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
current_platform.validate_request( current_platform.validate_request(
...@@ -302,7 +298,7 @@ class Processor: ...@@ -302,7 +298,7 @@ class Processor:
if decoder_inputs["type"] == "multimodal": if decoder_inputs["type"] == "multimodal":
decoder_mm_inputs = decoder_inputs["mm_kwargs"] decoder_mm_inputs = decoder_inputs["mm_kwargs"]
decoder_mm_positions = decoder_inputs["mm_placeholders"] decoder_mm_positions = decoder_inputs["mm_placeholders"]
decoder_mm_hashes = decoder_inputs.get("mm_hashes") decoder_mm_hashes = decoder_inputs["mm_hashes"]
# Merge and flatten multimodal placeholders, hashes and inputs # Merge and flatten multimodal placeholders, hashes and inputs
# from dictionaries to lists, and sort them by each item's position # from dictionaries to lists, and sort them by each item's position
...@@ -317,19 +313,15 @@ class Processor: ...@@ -317,19 +313,15 @@ class Processor:
decoder_mm_positions[modality][idx] decoder_mm_positions[modality][idx]
for modality, idx in sorted_mm_idxs for modality, idx in sorted_mm_idxs
] ]
sorted_mm_hashes = None if decoder_mm_hashes is None else [ sorted_mm_hashes = [
decoder_mm_hashes[modality][idx] decoder_mm_hashes[modality][idx]
for modality, idx in sorted_mm_idxs for modality, idx in sorted_mm_idxs
] ]
if sorted_mm_hashes is not None:
sorted_mm_inputs = self.mm_input_cache_client.get_and_update( sorted_mm_inputs = self.mm_input_cache_client.get_and_update(
orig_sorted_mm_inputs, orig_sorted_mm_inputs,
sorted_mm_hashes, sorted_mm_hashes,
) )
else:
assert is_list_of(orig_sorted_mm_inputs, MultiModalKwargsItem)
sorted_mm_inputs = orig_sorted_mm_inputs
return decoder_inputs.get("prompt"), EngineCoreRequest( return decoder_inputs.get("prompt"), EngineCoreRequest(
request_id=request_id, request_id=request_id,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment