Unverified commit 657855ab, authored by Cyrus Leung, committed by GitHub
Browse files

[Misc] Cleanup more configs and processors (#37560)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent e27b8ba3
...@@ -520,8 +520,10 @@ class SpeculativeConfig: ...@@ -520,8 +520,10 @@ class SpeculativeConfig:
# Replace hf_config for EAGLE draft_model # Replace hf_config for EAGLE draft_model
if self.method in ("eagle", "eagle3"): if self.method in ("eagle", "eagle3"):
from vllm.transformers_utils.configs import SpeculatorsConfig
from vllm.transformers_utils.configs.eagle import EAGLEConfig from vllm.transformers_utils.configs.eagle import EAGLEConfig
from vllm.transformers_utils.configs.speculators import (
SpeculatorsConfig,
)
if isinstance( if isinstance(
self.draft_model_config.hf_config, self.draft_model_config.hf_config,
......
...@@ -32,7 +32,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ...@@ -32,7 +32,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
) )
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs import ChatGLMConfig from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
from .utils import ( from .utils import (
......
...@@ -54,7 +54,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ...@@ -54,7 +54,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
) )
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs import RWConfig from vllm.transformers_utils.configs.falcon import RWConfig
from .interfaces import SupportsPP from .interfaces import SupportsPP
from .utils import ( from .utils import (
......
...@@ -24,7 +24,7 @@ from vllm.model_executor.layers.fused_moe import FusedMoE ...@@ -24,7 +24,7 @@ from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.linear import ReplicatedLinear
from vllm.model_executor.models.olmoe import OlmoeAttention, OlmoeForCausalLM from vllm.model_executor.models.olmoe import OlmoeAttention, OlmoeForCausalLM
from vllm.transformers_utils.configs import FlexOlmoConfig from vllm.transformers_utils.configs.flex_olmo import FlexOlmoConfig
logger = init_logger(__name__) logger = init_logger(__name__)
......
...@@ -20,7 +20,6 @@ from vllm.config import VllmConfig ...@@ -20,7 +20,6 @@ from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.cache import BaseMultiModalProcessorCache
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict, MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
...@@ -31,7 +30,6 @@ from vllm.multimodal.processing import ( ...@@ -31,7 +30,6 @@ from vllm.multimodal.processing import (
BaseDummyInputsBuilder, BaseDummyInputsBuilder,
BaseMultiModalProcessor, BaseMultiModalProcessor,
BaseProcessingInfo, BaseProcessingInfo,
InputProcessingContext,
PromptReplacement, PromptReplacement,
PromptUpdate, PromptUpdate,
) )
...@@ -336,28 +334,6 @@ class HCXVisionMultiModalProcessor(BaseMultiModalProcessor[HCXVisionProcessingIn ...@@ -336,28 +334,6 @@ class HCXVisionMultiModalProcessor(BaseMultiModalProcessor[HCXVisionProcessingIn
return fields return fields
def _build_hcxvision_hf_info(
    ctx: InputProcessingContext,
) -> HCXVisionProcessingInfo:
    """Create the HCXVision processing info bound to ``ctx``.

    Args:
        ctx: Input processing context handed to the info object.

    Returns:
        A new ``HCXVisionProcessingInfo`` constructed from ``ctx``.
    """
    info = HCXVisionProcessingInfo(ctx)
    return info
def _build_hcxvision_hf_processor(
    info: HCXVisionProcessingInfo,
    dummy_inputs: BaseDummyInputsBuilder[HCXVisionProcessingInfo],
    *,
    cache: BaseMultiModalProcessorCache | None = None,
) -> BaseMultiModalProcessor:
    """Build the multimodal processor for an HCXVision processing info.

    Args:
        info: Processing info; must be an ``HCXVisionProcessingInfo``.
        dummy_inputs: Dummy-inputs builder passed through to the processor.
        cache: Optional processor cache passed through to the processor.

    Returns:
        An ``HCXVisionMultiModalProcessor`` wrapping the given arguments.

    Raises:
        NotImplementedError: If ``info`` is not an ``HCXVisionProcessingInfo``;
            the offending type is carried as the exception argument.
    """
    # Reject unsupported info types up front so the happy path stays flat.
    if not isinstance(info, HCXVisionProcessingInfo):
        raise NotImplementedError(type(info))

    return HCXVisionMultiModalProcessor(
        info,
        dummy_inputs,  # type: ignore
        cache=cache,
    )
def init_vision_tower_for_hcxvision( def init_vision_tower_for_hcxvision(
vision_config, vision_config,
quant_config: QuantizationConfig | None, quant_config: QuantizationConfig | None,
...@@ -587,8 +563,8 @@ class HCXVisionCAbstractor(nn.Module): ...@@ -587,8 +563,8 @@ class HCXVisionCAbstractor(nn.Module):
@MULTIMODAL_REGISTRY.register_processor( @MULTIMODAL_REGISTRY.register_processor(
_build_hcxvision_hf_processor, HCXVisionMultiModalProcessor,
info=_build_hcxvision_hf_info, info=HCXVisionProcessingInfo,
dummy_inputs=HCXVisionDummyInputsBuilder, dummy_inputs=HCXVisionDummyInputsBuilder,
) )
class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
......
This diff is collapsed.
...@@ -49,7 +49,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ...@@ -49,7 +49,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
) )
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs import JAISConfig from vllm.transformers_utils.configs.jais import JAISConfig
from .interfaces import SupportsPP from .interfaces import SupportsPP
from .utils import ( from .utils import (
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
""" """
Kimi-K2.5 Model Implementation for vLLM. Kimi-K2.5 Model Implementation for vLLM.
Kimi-K2.5 extends Kimi-K2 with vision support Kimi-K2.5 extends Kimi-K2 with vision support.
This module defines:
- KimiK25ProcessingInfo/KimiK25MultiModalProcessor: Processing logic
- KimiK25ForConditionalGeneration: Main model class
""" """
from collections.abc import Iterable, Mapping, Sequence from collections.abc import Iterable, Mapping, Sequence
...@@ -18,14 +13,13 @@ from typing import Annotated, Any, Literal ...@@ -18,14 +13,13 @@ from typing import Annotated, Any, Literal
import torch import torch
from torch import nn from torch import nn
from transformers import BatchFeature from transformers import BatchFeature
from transformers.processing_utils import ProcessorMixin
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( from vllm.model_executor.layers.quantization.compressed_tensors import (
CompressedTensorsConfig, compressed_tensors,
) )
from vllm.model_executor.models.interfaces import ( from vllm.model_executor.models.interfaces import (
SupportsEagle, SupportsEagle,
...@@ -45,7 +39,6 @@ from vllm.multimodal.inputs import ( ...@@ -45,7 +39,6 @@ from vllm.multimodal.inputs import (
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
NestedTensors, NestedTensors,
VisionChunk,
VisionChunkImage, VisionChunkImage,
VisionChunkVideo, VisionChunkVideo,
) )
...@@ -60,8 +53,9 @@ from vllm.multimodal.processing import ( ...@@ -60,8 +53,9 @@ from vllm.multimodal.processing import (
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs import KimiK25Config from vllm.transformers_utils.configs.kimi_k25 import KimiK25Config
from vllm.transformers_utils.processor import cached_get_image_processor from vllm.transformers_utils.processor import cached_get_image_processor
from vllm.transformers_utils.processors.kimi_k25 import KimiK25Processor
from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .utils import ( from .utils import (
...@@ -101,69 +95,6 @@ class KimiK25MediaPixelInputs(TensorSchema): ...@@ -101,69 +95,6 @@ class KimiK25MediaPixelInputs(TensorSchema):
grid_thws: Annotated[torch.Tensor, TensorShape("nm", 3)] grid_thws: Annotated[torch.Tensor, TensorShape("nm", 3)]
class MoonshotKimiVAutoProcessor(ProcessorMixin):
    """Processor pairing a tokenizer with a media processor.

    When vision chunks are supplied, every occurrence of ``media_token_id`` in
    the prompt is repeated as many times as the media processor reports for
    the corresponding chunk (chunks are consumed in order).
    """

    attributes = ["tokenizer"]
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self, media_processor=None, tokenizer=None, media_token_id: int | None = None
    ):
        """Store the media processor and the placeholder token id.

        ``media_token_id`` defaults to ``None`` in the signature but is
        required in practice: the assertion below rejects ``None``.
        """
        super().__init__(tokenizer)
        self.media_processor = media_processor
        self.media_token_id = media_token_id
        assert self.media_token_id is not None

    def __call__(
        self,
        vision_chunks: list[VisionChunk] | None = None,
        *,
        text: list[int] | str,
        **kwargs,
    ) -> BatchFeature:
        """Tokenize ``text`` if needed and expand media placeholders.

        Args:
            vision_chunks: List of VisionChunk items to be processed.
                For image: VisionChunkImage with type='image', image=PIL.Image
                For video_chunk: VisionChunkVideo with type='video_chunk',
                video_chunk=list[PIL.Image]
            text: Token ids to be fed to a model, or a string that is encoded
                with the tokenizer first (required).

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- token ids to be fed to a model, wrapped in a
              tensor with a leading dimension of one.
            - Whatever the media processor's ``preprocess`` returns (merged in
              as-is). Present only when `vision_chunks` is not `None`.
        """
        # A string prompt is encoded here; a list is taken as token ids as-is.
        if isinstance(text, str):
            token_ids = self.tokenizer.encode(text)
        else:
            token_ids = text

        media_features = {}
        if vision_chunks is not None:
            assert isinstance(vision_chunks, list)
            media_features = self.media_processor.preprocess(vision_chunks)

            # One expansion count per chunk, consumed front-to-back as
            # placeholders are encountered in the prompt.
            pending_counts = [
                self.media_processor.media_tokens_calculator(chunk)
                for chunk in vision_chunks
            ]

            expanded_ids = []
            for token in token_ids:
                if token != self.media_token_id:
                    expanded_ids.append(token)
                    continue
                expanded_ids.extend([self.media_token_id] * pending_counts.pop(0))
            token_ids = expanded_ids

        # XXX: _apply_hf_processor_text_mm will call tolist() on input_ids
        features = {
            "input_ids": torch.tensor([token_ids]),
            **media_features,
        }
        return BatchFeature(data=features)
class KimiK25ProcessingInfo(BaseProcessingInfo): class KimiK25ProcessingInfo(BaseProcessingInfo):
"""Processing information for Kimi-K2.5 model. """Processing information for Kimi-K2.5 model.
...@@ -180,7 +111,7 @@ class KimiK25ProcessingInfo(BaseProcessingInfo): ...@@ -180,7 +111,7 @@ class KimiK25ProcessingInfo(BaseProcessingInfo):
trust_remote_code=self.ctx.model_config.trust_remote_code, trust_remote_code=self.ctx.model_config.trust_remote_code,
) )
self.media_processor = media_processor self.media_processor = media_processor
self.hf_processor = MoonshotKimiVAutoProcessor( self.hf_processor = KimiK25Processor(
media_processor=self.media_processor, media_processor=self.media_processor,
tokenizer=self.get_tokenizer(), tokenizer=self.get_tokenizer(),
media_token_id=self.media_token_id, media_token_id=self.media_token_id,
...@@ -263,12 +194,14 @@ class KimiK25MultiModalProcessor(BaseMultiModalProcessor[KimiK25ProcessingInfo]) ...@@ -263,12 +194,14 @@ class KimiK25MultiModalProcessor(BaseMultiModalProcessor[KimiK25ProcessingInfo])
) -> Mapping[str, MultiModalFieldConfig]: ) -> Mapping[str, MultiModalFieldConfig]:
"""Indicates how to slice media input into multiple items. """Indicates how to slice media input into multiple items.
pixel_values: [N, 3, patch_size, patch_size], all patches collected from B medias pixel_values: [N, 3, patch_size, patch_size],
grid_thws: [B,3], each item: [N_t, N_h ,N_w], indicates the grid size in time/height/width direction all patches collected from B medias
for current item. grid_thws: [B,3], each item: [N_t, N_h ,N_w],
indicates the grid size in time/height/width direction for current item.
by multiplying [N_t, N_h ,N_w], we get the number of patches for each media item, thus we can slice by multiplying [N_t, N_h ,N_w], we get the number of patches
pixel_values by pixel_values[start:start + N_t*N_h*N_w] to get patches of one item. for each media item, thus we can slice pixel_values by
pixel_values[start:start + N_t*N_h*N_w] to get patches of one item.
""" """
grid_thws = hf_inputs.get("grid_thws", torch.empty((0, 3))) grid_thws = hf_inputs.get("grid_thws", torch.empty((0, 3)))
...@@ -403,7 +336,7 @@ class KimiK25ForConditionalGeneration( ...@@ -403,7 +336,7 @@ class KimiK25ForConditionalGeneration(
self.media_placeholder: int = self.config.media_placeholder_token_id self.media_placeholder: int = self.config.media_placeholder_token_id
def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig):
if isinstance(quant_config, CompressedTensorsConfig): if isinstance(quant_config, compressed_tensors.CompressedTensorsConfig):
return None return None
return quant_config return quant_config
......
...@@ -77,7 +77,7 @@ from vllm.multimodal.processing import ( ...@@ -77,7 +77,7 @@ from vllm.multimodal.processing import (
PromptUpdate, PromptUpdate,
) )
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig, MoonViTConfig
from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
......
...@@ -39,7 +39,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ...@@ -39,7 +39,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
) )
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs import Lfm2MoeConfig from vllm.transformers_utils.configs.lfm2_moe import Lfm2MoeConfig
from .interfaces import ( from .interfaces import (
HasInnerState, HasInnerState,
......
...@@ -16,8 +16,7 @@ from vllm.model_executor.models.mistral3 import ( ...@@ -16,8 +16,7 @@ from vllm.model_executor.models.mistral3 import (
Mistral3ForConditionalGeneration, Mistral3ForConditionalGeneration,
Mistral3MultiModalProjector, Mistral3MultiModalProjector,
Mistral3ProcessingInfo, Mistral3ProcessingInfo,
_build_mistral3_info, init_vision_tower_for_mistral3,
init_vision_tower_for_llava,
) )
from vllm.model_executor.models.pixtral import PixtralHFEncoderInfo from vllm.model_executor.models.pixtral import PixtralHFEncoderInfo
from vllm.model_executor.models.utils import ( from vllm.model_executor.models.utils import (
...@@ -27,11 +26,9 @@ from vllm.model_executor.models.utils import ( ...@@ -27,11 +26,9 @@ from vllm.model_executor.models.utils import (
maybe_prefix, maybe_prefix,
) )
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.cache import BaseMultiModalProcessorCache
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems
from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems
from vllm.multimodal.processing import ( from vllm.multimodal.processing import (
BaseDummyInputsBuilder,
BaseMultiModalProcessor, BaseMultiModalProcessor,
PromptReplacement, PromptReplacement,
PromptUpdate, PromptUpdate,
...@@ -128,19 +125,9 @@ class LightOnOCRMultiModalProcessor(BaseMultiModalProcessor[Mistral3ProcessingIn ...@@ -128,19 +125,9 @@ class LightOnOCRMultiModalProcessor(BaseMultiModalProcessor[Mistral3ProcessingIn
] ]
def _build_LightOnOCR_processor(
    info: _I,
    dummy_inputs: BaseDummyInputsBuilder[_I],
    *,
    cache: BaseMultiModalProcessorCache | None = None,
):
    """Build the LightOnOCR multimodal processor.

    Args:
        info: Processing info; asserted to be a ``Mistral3ProcessingInfo``.
        dummy_inputs: Dummy-inputs builder passed through to the processor.
        cache: Optional processor cache passed through to the processor.

    Returns:
        A ``LightOnOCRMultiModalProcessor`` wrapping the given arguments.
    """
    assert isinstance(info, Mistral3ProcessingInfo)
    processor = LightOnOCRMultiModalProcessor(info, dummy_inputs, cache=cache)
    return processor
@MULTIMODAL_REGISTRY.register_processor( @MULTIMODAL_REGISTRY.register_processor(
_build_LightOnOCR_processor, LightOnOCRMultiModalProcessor,
info=_build_mistral3_info, info=Mistral3ProcessingInfo,
dummy_inputs=Mistral3DummyInputsBuilder, dummy_inputs=Mistral3DummyInputsBuilder,
) )
class LightOnOCRForConditionalGeneration(Mistral3ForConditionalGeneration): class LightOnOCRForConditionalGeneration(Mistral3ForConditionalGeneration):
...@@ -164,7 +151,7 @@ class LightOnOCRForConditionalGeneration(Mistral3ForConditionalGeneration): ...@@ -164,7 +151,7 @@ class LightOnOCRForConditionalGeneration(Mistral3ForConditionalGeneration):
self.multimodal_config = multimodal_config self.multimodal_config = multimodal_config
with self._mark_tower_model(vllm_config, "image"): with self._mark_tower_model(vllm_config, "image"):
self.vision_tower = init_vision_tower_for_llava( self.vision_tower = init_vision_tower_for_mistral3(
config, config,
quant_config=quant_config, quant_config=quant_config,
require_post_norm=False, require_post_norm=False,
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import abstractmethod
from collections.abc import Iterable, Mapping, Sequence from collections.abc import Iterable, Mapping, Sequence
from typing import Annotated, Final, Literal, Protocol, TypeVar from typing import Annotated, Literal
import torch import torch
import torch.nn as nn import torch.nn as nn
from transformers import ( from transformers import BatchFeature, Mistral3Config, PixtralVisionConfig
BatchFeature,
Mistral3Config,
PixtralVisionConfig,
PretrainedConfig,
)
from transformers.models.pixtral import PixtralProcessor from transformers.models.pixtral import PixtralProcessor
from vllm.config import VllmConfig from vllm.config import VllmConfig
...@@ -23,7 +17,6 @@ from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelL ...@@ -23,7 +17,6 @@ from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelL
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.cache import BaseMultiModalProcessorCache
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict, MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
...@@ -34,7 +27,6 @@ from vllm.multimodal.processing import ( ...@@ -34,7 +27,6 @@ from vllm.multimodal.processing import (
BaseDummyInputsBuilder, BaseDummyInputsBuilder,
BaseMultiModalProcessor, BaseMultiModalProcessor,
BaseProcessingInfo, BaseProcessingInfo,
InputProcessingContext,
PromptReplacement, PromptReplacement,
PromptUpdate, PromptUpdate,
PromptUpdateDetails, PromptUpdateDetails,
...@@ -178,27 +170,15 @@ class Mistral3MultiModalProjector(nn.Module): ...@@ -178,27 +170,15 @@ class Mistral3MultiModalProjector(nn.Module):
return hidden_states return hidden_states
class LlavaLikeConfig(Protocol): class Mistral3ProcessingInfo(BaseProcessingInfo):
vision_config: Final[PretrainedConfig] def get_hf_config(self) -> Mistral3Config:
image_token_index: Final[int]
vision_feature_select_strategy: Final[str]
vision_feature_layer: Final[int | list[int]]
class LlavaLikeProcessor(Protocol):
image_token: Final[str]
class BaseLlavaProcessingInfo(BaseProcessingInfo):
def get_hf_config(self) -> LlavaLikeConfig:
return self.ctx.get_hf_config(Mistral3Config) return self.ctx.get_hf_config(Mistral3Config)
def get_vision_encoder_info(self): def get_vision_encoder_info(self):
return get_vision_encoder_info(self.get_hf_config()) return get_vision_encoder_info(self.get_hf_config())
@abstractmethod def get_hf_processor(self, **kwargs: object):
def get_hf_processor(self, **kwargs: object) -> LlavaLikeProcessor: return self.ctx.get_hf_processor(PixtralProcessor, **kwargs)
raise NotImplementedError
def get_supported_mm_limits(self) -> Mapping[str, int | None]: def get_supported_mm_limits(self) -> Mapping[str, int | None]:
return {"image": None} return {"image": None}
...@@ -221,10 +201,7 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo): ...@@ -221,10 +201,7 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo):
return ImageSize(width=width, height=height) return ImageSize(width=width, height=height)
_I = TypeVar("_I", bound=BaseLlavaProcessingInfo) class Mistral3DummyInputsBuilder(BaseDummyInputsBuilder[Mistral3ProcessingInfo]):
class Mistral3DummyInputsBuilder(BaseDummyInputsBuilder[_I]):
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
...@@ -255,11 +232,6 @@ class Mistral3DummyInputsBuilder(BaseDummyInputsBuilder[_I]): ...@@ -255,11 +232,6 @@ class Mistral3DummyInputsBuilder(BaseDummyInputsBuilder[_I]):
} }
class Mistral3ProcessingInfo(BaseLlavaProcessingInfo):
    """Processing info that resolves its HF processor as ``PixtralProcessor``."""

    def get_hf_processor(self, **kwargs: object):
        """Return the processor obtained from the context.

        ``kwargs`` are forwarded unchanged to ``self.ctx.get_hf_processor``.
        """
        processor = self.ctx.get_hf_processor(PixtralProcessor, **kwargs)
        return processor
class Mistral3MultiModalProcessor(BaseMultiModalProcessor[Mistral3ProcessingInfo]): class Mistral3MultiModalProcessor(BaseMultiModalProcessor[Mistral3ProcessingInfo]):
def _call_hf_processor( def _call_hf_processor(
self, self,
...@@ -339,29 +311,7 @@ class Mistral3MultiModalProcessor(BaseMultiModalProcessor[Mistral3ProcessingInfo ...@@ -339,29 +311,7 @@ class Mistral3MultiModalProcessor(BaseMultiModalProcessor[Mistral3ProcessingInfo
] ]
def _build_mistral3_info( def _get_num_hidden_layers(hf_config: Mistral3Config) -> int:
ctx: InputProcessingContext,
) -> BaseLlavaProcessingInfo:
hf_config = ctx.get_hf_config(Mistral3Config)
assert isinstance(hf_config.vision_config, PixtralVisionConfig)
return Mistral3ProcessingInfo(ctx)
def _build_mistral3_processor(
    info: _I,
    dummy_inputs: BaseDummyInputsBuilder[_I],
    *,
    cache: BaseMultiModalProcessorCache | None = None,
) -> BaseMultiModalProcessor:
    """Build the Mistral3 multimodal processor.

    Args:
        info: Processing info; asserted to be a ``Mistral3ProcessingInfo``.
        dummy_inputs: Dummy-inputs builder passed through to the processor.
        cache: Optional processor cache passed through to the processor.

    Returns:
        A ``Mistral3MultiModalProcessor`` wrapping the given arguments.
    """
    assert isinstance(info, Mistral3ProcessingInfo)
    processor = Mistral3MultiModalProcessor(
        info,
        dummy_inputs,  # type: ignore
        cache=cache,
    )
    return processor
def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int:
"""Determine the number of hidden layers to initialize up to in the """Determine the number of hidden layers to initialize up to in the
visual encoder. visual encoder.
...@@ -381,8 +331,8 @@ def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int: ...@@ -381,8 +331,8 @@ def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int:
) )
def init_vision_tower_for_llava( def init_vision_tower_for_mistral3(
hf_config: LlavaLikeConfig, hf_config: Mistral3Config,
quant_config: QuantizationConfig | None, quant_config: QuantizationConfig | None,
*, *,
require_post_norm: bool | None = None, require_post_norm: bool | None = None,
...@@ -405,8 +355,8 @@ def init_vision_tower_for_llava( ...@@ -405,8 +355,8 @@ def init_vision_tower_for_llava(
@MULTIMODAL_REGISTRY.register_processor( @MULTIMODAL_REGISTRY.register_processor(
_build_mistral3_processor, Mistral3MultiModalProcessor,
info=_build_mistral3_info, info=Mistral3ProcessingInfo,
dummy_inputs=Mistral3DummyInputsBuilder, dummy_inputs=Mistral3DummyInputsBuilder,
) )
class Mistral3ForConditionalGeneration( class Mistral3ForConditionalGeneration(
...@@ -466,7 +416,7 @@ class Mistral3ForConditionalGeneration( ...@@ -466,7 +416,7 @@ class Mistral3ForConditionalGeneration(
config.projector_hidden_act = "gelu" config.projector_hidden_act = "gelu"
with self._mark_tower_model(vllm_config, "image"): with self._mark_tower_model(vllm_config, "image"):
self.vision_tower = init_vision_tower_for_llava( self.vision_tower = init_vision_tower_for_mistral3(
config, config,
quant_config=quant_config, quant_config=quant_config,
require_post_norm=False, require_post_norm=False,
......
...@@ -52,7 +52,7 @@ from vllm.model_executor.model_loader.weight_utils import ( ...@@ -52,7 +52,7 @@ from vllm.model_executor.model_loader.weight_utils import (
maybe_remap_kv_scale_name, maybe_remap_kv_scale_name,
) )
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs import NemotronConfig from vllm.transformers_utils.configs.nemotron import NemotronConfig
from .interfaces import SupportsLoRA, SupportsPP from .interfaces import SupportsLoRA, SupportsPP
from .utils import ( from .utils import (
......
...@@ -81,7 +81,7 @@ from vllm.model_executor.models.utils import ( ...@@ -81,7 +81,7 @@ from vllm.model_executor.models.utils import (
sequence_parallel_chunk, sequence_parallel_chunk,
) )
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs import NemotronHConfig from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig
class NemotronHMLP(nn.Module): class NemotronHMLP(nn.Module):
......
...@@ -26,7 +26,7 @@ from vllm.model_executor.models.utils import ( ...@@ -26,7 +26,7 @@ from vllm.model_executor.models.utils import (
maybe_prefix, maybe_prefix,
) )
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs import NemotronHConfig from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig
from .interfaces import SupportsPP from .interfaces import SupportsPP
from .nemotron_h import ( from .nemotron_h import (
......
...@@ -63,7 +63,7 @@ from vllm.model_executor.models.utils import ( ...@@ -63,7 +63,7 @@ from vllm.model_executor.models.utils import (
maybe_prefix, maybe_prefix,
) )
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs import Olmo3Config from vllm.transformers_utils.configs.olmo3 import Olmo3Config
class Olmo2Attention(nn.Module): class Olmo2Attention(nn.Module):
......
...@@ -80,7 +80,7 @@ from vllm.model_executor.models.utils import sequence_parallel_chunk ...@@ -80,7 +80,7 @@ from vllm.model_executor.models.utils import sequence_parallel_chunk
from vllm.model_executor.utils import set_weight_attrs from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs import Qwen3NextConfig from vllm.transformers_utils.configs.qwen3_next import Qwen3NextConfig
from vllm.triton_utils import tl, triton from vllm.triton_utils import tl, triton
from vllm.utils.multi_stream_utils import maybe_execute_in_parallel from vllm.utils.multi_stream_utils import maybe_execute_in_parallel
from vllm.utils.torch_utils import ( from vllm.utils.torch_utils import (
......
...@@ -25,7 +25,7 @@ from vllm.model_executor.models.qwen3_next import ( ...@@ -25,7 +25,7 @@ from vllm.model_executor.models.qwen3_next import (
QwenNextMixtureOfExperts, QwenNextMixtureOfExperts,
) )
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs import Qwen3NextConfig from vllm.transformers_utils.configs.qwen3_next import Qwen3NextConfig
from .utils import ( from .utils import (
AutoWeightsLoader, AutoWeightsLoader,
......
...@@ -2,18 +2,13 @@ ...@@ -2,18 +2,13 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math import math
from collections.abc import Iterable, Mapping, Sequence from collections.abc import Iterable, Mapping, Sequence
from itertools import product from math import sqrt
from math import ceil, sqrt
from typing import Annotated, Any, Literal, TypeAlias from typing import Annotated, Any, Literal, TypeAlias
import numpy as np
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
from PIL import Image from transformers import BatchFeature
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from transformers import BatchFeature, PretrainedConfig, TensorType
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
...@@ -43,8 +38,8 @@ from vllm.multimodal.processing import ( ...@@ -43,8 +38,8 @@ from vllm.multimodal.processing import (
PromptUpdateDetails, PromptUpdateDetails,
) )
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.tokenizers import TokenizerLike from vllm.transformers_utils.configs.step3_vl import Step3VisionEncoderConfig
from vllm.transformers_utils.configs import Step3VisionEncoderConfig from vllm.transformers_utils.processors.step3_vl import Step3VLProcessor
from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
...@@ -89,430 +84,6 @@ class Step3VLImageEmbeddingInputs(TensorSchema): ...@@ -89,430 +84,6 @@ class Step3VLImageEmbeddingInputs(TensorSchema):
Step3VLImageInputs: TypeAlias = Step3VLImagePixelInputs | Step3VLImageEmbeddingInputs Step3VLImageInputs: TypeAlias = Step3VLImagePixelInputs | Step3VLImageEmbeddingInputs
# Triple of (image, list of images, optional list of bools).
# NOTE(review): presumably the source image, the patches cut from it, and
# per-patch flags — the flag semantics are not visible here; confirm at use sites.
ImageWithPatches = tuple[Image.Image, list[Image.Image], list[bool] | None]
# Threshold compared against max(img_height, img_width) in
# ImagePatcher.get_image_size_for_preprocess (presumably in pixels).
MAX_IMAGE_SIZE: int = 3024
class Step3VisionProcessor:
    """Converts images to normalized, resized tensors.

    Two pipelines are kept: ``transform`` resizes to ``(size, size)`` and
    ``patch_transform`` resizes to ``(patch_size, patch_size)``. Both apply
    ``ToTensor`` and ``Normalize`` before the resize.
    """

    def __init__(self, size, interpolation_mode="bicubic", patch_size=None):
        """Build the full-image and patch pipelines.

        Args:
            size: Side length used for the full-image resize.
            interpolation_mode: ``"bicubic"`` selects bicubic resizing; any
                other value selects bilinear.
            patch_size: Side length used for the patch resize; falls back to
                ``size`` when ``None``.
        """
        # Per-channel statistics passed to Normalize.
        mean = [0.48145466, 0.4578275, 0.40821073]
        std = [0.26862954, 0.26130258, 0.27577711]
        patch_size = patch_size if patch_size is not None else size

        if interpolation_mode == "bicubic":
            resample = InterpolationMode.BICUBIC
        else:
            resample = InterpolationMode.BILINEAR

        def build_pipeline(side):
            # Same three stages for both pipelines; only the target side differs.
            return transforms.Compose(
                [
                    transforms.ToTensor(),
                    transforms.Normalize(mean, std),
                    transforms.Resize(
                        (side, side),
                        interpolation=resample,
                        antialias=True,
                    ),
                ]
            )

        self.transform = build_pipeline(size)
        # patch_size can only still be None here when size is None as well.
        self.patch_transform = (
            build_pipeline(patch_size) if patch_size is not None else None
        )

    def __call__(self, image, is_patch=False):
        """Run the selected pipeline on ``image``.

        Returns:
            A dict with key ``"pixel_values"`` holding the pipeline output
            with a leading dimension added via ``unsqueeze(0)``.
        """
        pipeline = self.patch_transform if is_patch else self.transform
        return {"pixel_values": pipeline(image).unsqueeze(0)}
class ImagePatcher:
    """Split an image into a resized global view plus square window patches.

    Args:
        enable_patch: When ``False`` no patches are produced and only the
            resized global image is returned.
    """

    def __init__(self, enable_patch: bool = True) -> None:
        self.enable_patch = enable_patch

    def determine_window_size(self, long: int, short: int) -> int:
        """Return the square window side length, or 0 to disable patching.

        Args:
            long: Length of the longer image side.
            short: Length of the shorter image side.
        """
        if long < 728:
            return short if long / short > 1.5 else 0
        return min(short, 504) if long / short > 4 else 504

    def slide_window(
        self,
        width: int,
        height: int,
        sizes: list[tuple[int, int]],
        steps: list[tuple[int, int]],
        img_rate_thr: float = 0.6,
    ) -> tuple[list[tuple[int, int, int, int]], tuple[int, int]]:
        """Enumerate sliding windows over a ``width`` x ``height`` area.

        Args:
            width: Width of the area to cover.
            height: Height of the area to cover.
            sizes: ``(window_width, window_height)`` per window configuration.
            steps: ``(step_x, step_y)`` per window configuration.
            img_rate_thr: Must lie in ``[0, 1]``; it is only validated here.

        Returns:
            A list of ``(x, y, w, h)`` boxes in row-major order, and the
            ``(x_num, y_num)`` grid shape of the last ``sizes``/``steps`` pair.
        """
        assert 1 >= img_rate_thr >= 0, "The `img_rate_thr` should lie in 0~1"

        windows = []
        # Sliding windows.
        for size, step in zip(sizes, steps):
            size_w, size_h = size
            step_w, step_h = step

            x_num = 1 if width <= size_w else ceil((width - size_w) / step_w + 1)
            x_start = [step_w * i for i in range(x_num)]
            # Pull the last column back so it does not run past the right edge.
            if len(x_start) > 1 and x_start[-1] + size_w > width:
                x_start[-1] = width - size_w

            y_num = 1 if height <= size_h else ceil((height - size_h) / step_h + 1)
            y_start = [step_h * i for i in range(y_num)]
            # Same clamping for the last row.
            if len(y_start) > 1 and y_start[-1] + size_h > height:
                y_start[-1] = height - size_h

            # product() yields (y, x) pairs in row-major order; swap the
            # columns so that every row becomes (x, y).
            start = np.array(list(product(y_start, x_start)), dtype=int)
            start[:, [0, 1]] = start[:, [1, 0]]
            windows.append(np.concatenate([start, start + size], axis=1))
        windows = np.concatenate(windows, axis=0)

        return [
            (int(box[0]), int(box[1]), int(box[2] - box[0]), int(box[3] - box[1]))
            for box in windows
        ], (x_num, y_num)

    def square_pad(self, img: Image.Image) -> Image.Image:
        """Pad ``img`` with zeros to a square, keeping it at the top-left."""
        w, h = img.size
        if w == h:
            return img
        size = max(w, h)
        padded = Image.new(img.mode, (size, size), 0)
        padded.paste(img, (0, 0))
        return padded

    def get_image_size_for_padding(
        self, img_width: int, img_height: int
    ) -> tuple[int, int]:
        """Return the size after optional square padding.

        Only images whose shorter side is below 32 and whose aspect ratio
        exceeds 4:1 (either way) are padded to a square.
        """
        ratio = img_width / img_height
        if min(img_height, img_width) < 32 and (ratio > 4 or ratio < 1 / 4):
            new_size = max(img_height, img_width)
            return new_size, new_size
        return img_width, img_height

    def get_image_size_for_preprocess(
        self, img_width: int, img_height: int
    ) -> tuple[int, int]:
        """Scale the size down so the longer side is at most MAX_IMAGE_SIZE."""
        if max(img_height, img_width) > MAX_IMAGE_SIZE:
            scale_factor = MAX_IMAGE_SIZE / max(img_height, img_width)
            img_width = int(img_width * scale_factor)
            img_height = int(img_height * scale_factor)
        return img_width, img_height

    def get_image_size_for_crop(
        self, img_width: int, img_height: int, window_size: int
    ) -> tuple[int, int]:
        """Snap each side to a multiple of ``window_size`` for cropping.

        A side shorter than the window is kept unchanged. Otherwise it is
        rounded up to the next multiple when the fractional part of
        ``side / window_size`` exceeds 0.2, and rounded down if not.
        """

        def snap(length: int) -> int:
            ratio = length / window_size
            if ratio < 1:
                return length
            decimal = ratio - length // window_size
            count = int(ratio) + 1 if decimal > 0.2 else int(ratio)
            return window_size * count

        return int(snap(img_width)), int(snap(img_height))

    def patch_crop(self, img: Image.Image, i: int, j: int, th: int, tw: int):
        """Crop the ``tw`` x ``th`` box whose top-left corner is row ``i``, column ``j``."""
        target = img.crop((j, i, j + tw, i + th))
        return target

    def get_num_patches(self, img_width: int, img_height: int) -> tuple[int, int]:
        """Compute patch counts for an image size without touching pixels.

        Returns:
            ``(num_patches, num_newlines)``; ``(0, 0)`` when patching is
            disabled or the image does not qualify for patching.
        """
        img_width, img_height = self.get_image_size_for_padding(img_width, img_height)
        img_width, img_height = self.get_image_size_for_preprocess(
            img_width, img_height
        )
        window_size = self.determine_window_size(
            max(img_height, img_width), min(img_height, img_width)
        )
        if window_size == 0 or not self.enable_patch:
            return 0, 0

        img_width, img_height = self.get_image_size_for_crop(
            img_width, img_height, window_size
        )
        center_list, (x_num, y_num) = self.slide_window(
            img_width,
            img_height,
            [(window_size, window_size)],
            [(window_size, window_size)],
        )
        # One newline per completed row, except after the very last patch.
        full_rows = (len(center_list) - 1) // x_num + 1
        if len(center_list) > 0 and len(center_list) % x_num == 0:
            full_rows -= 1
        return len(center_list), full_rows

    def __call__(
        self, img: Image.Image
    ) -> tuple[Image.Image, list[Image.Image], list[bool] | None]:
        """Split ``img`` into a global view and window patches.

        Returns:
            ``(image, patches, newline_mask)``. ``image`` is the padded and
            resized global view. ``patches`` is empty and ``newline_mask`` is
            ``None`` when no patching takes place; otherwise ``newline_mask[i]``
            is ``True`` when patch ``i`` ends a row that is not the last row.
        """
        img_width, img_height = img.size
        new_img_width, new_img_height = self.get_image_size_for_padding(
            img_width, img_height
        )
        if new_img_width != img_width or new_img_height != img_height:
            img = self.square_pad(img)
            img_width, img_height = img.size

        new_img_width, new_img_height = self.get_image_size_for_preprocess(
            img_width, img_height
        )
        img = img.resize((new_img_width, new_img_height), Image.Resampling.BILINEAR)
        window_size = self.determine_window_size(
            max(new_img_height, new_img_width), min(new_img_height, new_img_width)
        )
        if window_size == 0 or not self.enable_patch:
            return img, [], None

        new_img_width, new_img_height = self.get_image_size_for_crop(
            new_img_width, new_img_height, window_size
        )
        # Compare against the current (already resized) image rather than the
        # size it had before preprocessing, so a same-size resize is skipped.
        if (new_img_width, new_img_height) != img.size:
            img_for_crop = img.resize(
                (new_img_width, new_img_height), Image.Resampling.BILINEAR
            )
        else:
            img_for_crop = img

        center_list, (x_num, y_num) = self.slide_window(
            new_img_width,
            new_img_height,
            [(window_size, window_size)],
            [(window_size, window_size)],
        )
        patches = [
            self.patch_crop(img_for_crop, y, x, patch_h, patch_w)
            for x, y, patch_w, patch_h in center_list
        ]

        # A newline follows the last patch of every row except the final row.
        newline_mask = [
            (patch_id + 1) % x_num == 0 for patch_id in range(len(patches))
        ]
        if newline_mask and newline_mask[-1]:
            newline_mask[-1] = False

        return img, patches, newline_mask if len(patches) > 0 else None
class Step3VLProcessor:
    """Combine tokenization with Step3-VL image preprocessing.

    Each image placeholder in the prompt is expanded into the patch and image
    token sequences, and the matching pixel tensors are produced alongside
    the tokenized text.

    Args:
        config: Model config; ``config.vision_config.enable_patch`` (default
            ``True``) toggles patch extraction.
        tokenizer: Tokenizer used for the text and for special-token lookups.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
    ) -> None:
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        self.image_size = 728
        self.patch_size = 504
        self.image_preprocessor = Step3VisionProcessor(
            self.image_size, "bilinear", self.patch_size
        )

        self.num_image_feature_size = 169
        self.num_patch_feature_size = 81
        self.image_token = "<im_patch>"
        self.image_feature_placeholder = self.image_token * self.num_image_feature_size
        self.patch_feature_placeholder = self.image_token * self.num_patch_feature_size

        # Respect vision config switch to enable/disable patch extraction.
        # For video understanding, it's preferable to disable patch.
        enable_patch = getattr(self.config.vision_config, "enable_patch", True)
        self.patcher = ImagePatcher(enable_patch=enable_patch)

    @property
    def image_token_id(self) -> int:
        """Token id of the image placeholder token, read from the vocabulary."""
        return self.tokenizer.get_vocab()[self.image_token]

    def get_num_image_tokens(self, img_width: int, img_height: int) -> int:
        """Return how many prompt tokens an image of the given size expands to."""
        num_patches, num_newlines = self.patcher.get_num_patches(img_width, img_height)

        # Each patch and the global image carry a start and an end token.
        return (
            num_patches * (self.num_patch_feature_size + 2)
            + self.num_image_feature_size
            + 2
            + num_newlines
        )

    def _split_images(self, images: list[Image.Image]) -> list[ImageWithPatches]:
        """Run the patcher over every image."""
        return [self.patcher(img) for img in images]

    def _convert_images_to_pixel_values(
        self,
        images: list[Image.Image],
        is_patch: bool = False,
    ) -> list[torch.Tensor]:
        """Preprocess images (or patches) into one pixel tensor each."""
        return [
            self.image_preprocessor(img, is_patch=is_patch)["pixel_values"]
            for img in images
        ]

    def _get_patch_repl(
        self,
        num_patches: int,
        patch_newline_mask: list[bool] | None,
    ) -> tuple[str, list[int]]:
        """Build the replacement text and token ids for an image's patches.

        Args:
            num_patches: Number of patches to emit.
            patch_newline_mask: Per-patch flags; a ``True`` entry appends
                ``<patch_newline>`` after that patch. ``None`` means no
                newlines.

        Returns:
            ``(text, token_ids)`` covering all patches.
        """
        if num_patches <= 0:
            return "", []
        # Validate once up front; a None mask is allowed by the signature and
        # must not reach len().
        if patch_newline_mask is not None:
            assert len(patch_newline_mask) == num_patches

        # Token lookups do not depend on the patch, so resolve them once.
        patch_text = f"<patch_start>{self.patch_feature_placeholder}<patch_end>"
        patch_ids = (
            [self.tokenizer.convert_tokens_to_ids("<patch_start>")]
            + [self.image_token_id] * self.num_patch_feature_size
            + [self.tokenizer.convert_tokens_to_ids("<patch_end>")]
        )
        newline_id = None

        text_parts: list[str] = []
        token_ids: list[int] = []
        for i in range(num_patches):
            text_parts.append(patch_text)
            token_ids.extend(patch_ids)
            if patch_newline_mask and patch_newline_mask[i]:
                if newline_id is None:
                    newline_id = self.tokenizer.convert_tokens_to_ids(
                        "<patch_newline>"
                    )
                text_parts.append("<patch_newline>")
                token_ids.append(newline_id)
        return "".join(text_parts), token_ids

    def _get_image_repl(
        self,
        num_images: int,
    ) -> tuple[str, list[int]]:
        """Build the replacement text and token ids for ``num_images`` global views."""
        text = f"<im_start>{self.image_feature_placeholder}<im_end>"
        token_ids = (
            [self.tokenizer.convert_tokens_to_ids("<im_start>")]
            + [self.image_token_id] * self.num_image_feature_size
            + [self.tokenizer.convert_tokens_to_ids("<im_end>")]
        )
        return text * num_images, token_ids * num_images

    def _get_image_repl_features(
        self,
        num_images: int,
        num_patches: int,
        patch_new_line_idx: list[bool] | None,
    ) -> tuple[str, list[int]]:
        """Build the full replacement: patch tokens followed by image tokens."""
        if num_patches > 0:
            patch_repl, patch_repl_ids = self._get_patch_repl(
                num_patches, patch_new_line_idx
            )
        else:
            patch_repl = ""
            patch_repl_ids = []
        image_repl, image_repl_ids = self._get_image_repl(num_images)
        return patch_repl + image_repl, patch_repl_ids + image_repl_ids

    def replace_placeholder(self, text: str, placeholder: str, repls: list[str]) -> str:
        """Replace the i-th occurrence of ``placeholder`` with ``repls[i]``.

        Raises:
            ValueError: If the number of placeholders in ``text`` differs from
                ``len(repls)``.
        """
        parts = text.split(placeholder)

        if len(parts) - 1 != len(repls):
            raise ValueError(
                "The number of placeholders does not match the number of replacements."
            )

        result = [parts[0]]
        for repl, tail in zip(repls, parts[1:]):
            result.append(repl)
            result.append(tail)

        return "".join(result)

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        return_tensors: str | TensorType | None = None,
    ) -> BatchFeature:
        """Tokenize ``text`` and preprocess ``images``.

        Every text receives the replacements for all images, so each text must
        contain exactly one image placeholder per image.

        Args:
            text: One prompt or a list of prompts.
            images: One image or a list of images.
            return_tensors: Forwarded to ``BatchFeature`` as ``tensor_type``.

        Returns:
            A ``BatchFeature`` with the tokenizer outputs and, when images are
            given, ``pixel_values``, ``num_patches``, ``patch_pixel_values``
            and ``patch_newline_mask``.
        """
        if text is None:
            text = []
        if not isinstance(text, list):
            text = [text]
        if images is None:
            images = []
        if not isinstance(images, list):
            images = [images]

        image_inputs = {}
        if len(images) > 0:
            pixel_values_lst = []
            patch_pixel_values_lst = []
            patch_newline_mask_lst = []
            image_repl_str_lst = []
            num_patches = []
            for raw_img, img_patches, patch_newline_mask in self._split_images(images):
                pixel_values_lst.extend(self._convert_images_to_pixel_values([raw_img]))

                if len(img_patches) > 0:
                    patch_pixel_values_lst.extend(
                        self._convert_images_to_pixel_values(img_patches, is_patch=True)
                    )
                num_patches.append(len(img_patches))

                # Only the replacement text is needed; the tokenizer derives
                # the ids from the expanded prompt below.
                image_repl_str, _ = self._get_image_repl_features(
                    1, len(img_patches), patch_newline_mask
                )
                image_repl_str_lst.append(image_repl_str)

                if patch_newline_mask is not None:
                    patch_newline_mask_lst.extend(patch_newline_mask)

            pixel_values = torch.cat(pixel_values_lst)
            patch_size = self.patch_size
            image_inputs = {
                "pixel_values": pixel_values,
                "num_patches": num_patches,
                # Keep a correctly shaped empty tensor when no image has patches.
                "patch_pixel_values": (
                    torch.cat(patch_pixel_values_lst)
                    if patch_pixel_values_lst
                    else pixel_values.new_empty((0, 3, patch_size, patch_size))
                ),
                "patch_newline_mask": torch.tensor(
                    patch_newline_mask_lst, dtype=torch.bool
                ),
            }

            text = [
                self.replace_placeholder(t, self.image_token, image_repl_str_lst)
                for t in text
            ]

        text_inputs = self.tokenizer(text)

        return BatchFeature(
            {
                **text_inputs,
                **image_inputs,
            },
            tensor_type=return_tensors,
        )
class Step3VLProcessingInfo(BaseProcessingInfo): class Step3VLProcessingInfo(BaseProcessingInfo):
def get_hf_processor(self) -> Step3VLProcessor: def get_hf_processor(self) -> Step3VLProcessor:
......
...@@ -25,7 +25,6 @@ from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelL ...@@ -25,7 +25,6 @@ from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelL
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.models.llava import LlavaDummyInputsBuilder from vllm.model_executor.models.llava import LlavaDummyInputsBuilder
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.cache import BaseMultiModalProcessorCache
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems
from vllm.multimodal.parse import ( from vllm.multimodal.parse import (
ImageEmbeddingItems, ImageEmbeddingItems,
...@@ -34,10 +33,8 @@ from vllm.multimodal.parse import ( ...@@ -34,10 +33,8 @@ from vllm.multimodal.parse import (
MultiModalDataItems, MultiModalDataItems,
) )
from vllm.multimodal.processing import ( from vllm.multimodal.processing import (
BaseDummyInputsBuilder,
BaseMultiModalProcessor, BaseMultiModalProcessor,
BaseProcessingInfo, BaseProcessingInfo,
InputProcessingContext,
PromptReplacement, PromptReplacement,
PromptUpdate, PromptUpdate,
) )
...@@ -329,25 +326,6 @@ class TarsierMultiModalProcessor(BaseMultiModalProcessor[_I_Tarsier]): ...@@ -329,25 +326,6 @@ class TarsierMultiModalProcessor(BaseMultiModalProcessor[_I_Tarsier]):
] ]
def _build_tarsier_hf_info(ctx: InputProcessingContext) -> TarsierProcessingInfo:
    """Create the Tarsier processing info for the given input processing context."""
    return TarsierProcessingInfo(ctx)
def _build_tarsier_hf_processor(
    info: _I_Tarsier,
    dummy_inputs: BaseDummyInputsBuilder[_I_Tarsier],
    *,
    cache: BaseMultiModalProcessorCache | None = None,
) -> BaseMultiModalProcessor:
    """Create the multimodal processor that matches ``info``.

    Args:
        info: Processing info; only ``TarsierProcessingInfo`` is supported.
        dummy_inputs: Dummy inputs builder handed to the processor.
        cache: Optional processor cache handed to the processor.

    Raises:
        NotImplementedError: If ``info`` is not a ``TarsierProcessingInfo``;
            the offending type is passed as the exception argument.
    """
    # Reject unsupported info types first so the happy path stays flat.
    if not isinstance(info, TarsierProcessingInfo):
        raise NotImplementedError(type(info))
    return TarsierMultiModalProcessor(info, dummy_inputs, cache=cache)
def init_vision_tower_for_tarsier( def init_vision_tower_for_tarsier(
hf_config: TarsierHfConfig, # Use the Tarsier specific config protocol hf_config: TarsierHfConfig, # Use the Tarsier specific config protocol
quant_config: QuantizationConfig | None, quant_config: QuantizationConfig | None,
...@@ -395,8 +373,8 @@ def init_vision_tower_for_tarsier( ...@@ -395,8 +373,8 @@ def init_vision_tower_for_tarsier(
@MULTIMODAL_REGISTRY.register_processor( @MULTIMODAL_REGISTRY.register_processor(
_build_tarsier_hf_processor, TarsierMultiModalProcessor,
info=_build_tarsier_hf_info, info=TarsierProcessingInfo,
dummy_inputs=TarsierDummyInputsBuilder, dummy_inputs=TarsierDummyInputsBuilder,
) )
class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment