Unverified Commit 4e256cad authored by Harry Mellor, committed by GitHub
Browse files

Remove all references to `yapf` as it's no longer used (#26251)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent d6953beb
......@@ -6,14 +6,16 @@ from typing import Annotated, Any, Literal, Optional, Union, cast
import numpy as np
import torch
# yapf: disable
from torch import nn
from transformers import AutoModel, BatchFeature
from transformers.models.gemma3n import (Gemma3nAudioConfig,
Gemma3nAudioFeatureExtractor,
Gemma3nConfig, Gemma3nProcessor,
Gemma3nTextConfig,
Gemma3nVisionConfig)
from transformers.models.gemma3n import (
Gemma3nAudioConfig,
Gemma3nAudioFeatureExtractor,
Gemma3nConfig,
Gemma3nProcessor,
Gemma3nTextConfig,
Gemma3nVisionConfig,
)
from transformers.models.siglip import SiglipImageProcessorFast
from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
......@@ -22,25 +24,32 @@ from vllm.inputs.data import PromptType
from vllm.logger import init_logger
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import RowParallelLinear
from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding)
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
from vllm.model_executor.models.gemma3n import Gemma3nForCausalLM
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.models.whisper import ISO639_1_SUPPORTED_LANGS
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargsItems)
from vllm.multimodal.parse import (ImageProcessorItems, MultiModalDataItems,
MultiModalDataParser)
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo,
MultiModalPromptUpdates,
MultiModalPromptUpdatesApplyResult,
PlaceholderFeaturesInfo,
PromptReplacement, PromptUpdate,
PromptUpdateDetails,
replace_token_matches)
# yapf: enable
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
MultiModalKwargsItems,
)
from vllm.multimodal.parse import (
ImageProcessorItems,
MultiModalDataItems,
MultiModalDataParser,
)
from vllm.multimodal.processing import (
BaseMultiModalProcessor,
BaseProcessingInfo,
MultiModalPromptUpdates,
MultiModalPromptUpdatesApplyResult,
PlaceholderFeaturesInfo,
PromptReplacement,
PromptUpdate,
PromptUpdateDetails,
replace_token_matches,
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.utils.tensor_schema import TensorSchema, TensorShape
......
......@@ -43,9 +43,6 @@ from vllm.multimodal.inputs import (
MultiModalKwargsItems,
)
from vllm.multimodal.parse import ImageProcessorItems, ImageSize
# yapf conflicts with isort for this block
# yapf: disable
from vllm.multimodal.processing import (
BaseMultiModalProcessor,
BaseProcessingInfo,
......@@ -54,18 +51,13 @@ from vllm.multimodal.processing import (
PromptUpdate,
PromptUpdateDetails,
)
# yapf: enable
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.utils.tensor_schema import TensorSchema, TensorShape
# yapf: disable
from .idefics2_vision_model import (
Idefics2VisionTransformer as Idefics3VisionTransformer,
)
# yapf: enable
from .interfaces import MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal
from .llama import LlamaModel
from .utils import AutoWeightsLoader, maybe_prefix
......
......@@ -45,9 +45,6 @@ from vllm.multimodal.parse import (
ImageSize,
MultiModalDataItems,
)
# yapf conflicts with isort for this block
# yapf: disable
from vllm.multimodal.processing import (
BaseMultiModalProcessor,
BaseProcessingInfo,
......@@ -57,8 +54,6 @@ from vllm.multimodal.processing import (
PromptUpdate,
ResolvedPromptUpdate,
)
# yapf: enable
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.utils import is_list_of
......
......@@ -52,16 +52,12 @@ from vllm.distributed import utils as dist_utils
from vllm.logger import init_logger
from vllm.model_executor.layers.activation import get_act_and_mul_fn
from vllm.model_executor.layers.layernorm import RMSNorm
# yapf: disable
from vllm.model_executor.layers.linear import (
ColumnParallelLinear,
MergedColumnParallelLinear,
QKVParallelLinear,
RowParallelLinear,
)
# yapf: enable
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.module_mapping import MultiModelKeys
......
......@@ -37,12 +37,7 @@ from vllm.model_executor.layers.fla.ops import (
fused_recurrent_gated_delta_rule,
)
from vllm.model_executor.layers.fused_moe import FusedMoE
# yapf conflicts with isort for this block
# yapf: disable
from vllm.model_executor.layers.layernorm import GemmaRMSNorm as Qwen3NextRMSNorm
# yapf: enable
from vllm.model_executor.layers.linear import (
ColumnParallelLinear,
QKVParallelLinear,
......
......@@ -54,7 +54,6 @@ from .interfaces_base import (
logger = init_logger(__name__)
# yapf: disable
_TEXT_GENERATION_MODELS = {
# [Decoder-only]
"ApertusForCausalLM": ("apertus", "ApertusForCausalLM"),
......@@ -106,8 +105,8 @@ _TEXT_GENERATION_MODELS = {
"GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"),
"GraniteForCausalLM": ("granite", "GraniteForCausalLM"),
"GraniteMoeForCausalLM": ("granitemoe", "GraniteMoeForCausalLM"),
"GraniteMoeHybridForCausalLM": ("granitemoehybrid", "GraniteMoeHybridForCausalLM"), # noqa: E501
"GraniteMoeSharedForCausalLM": ("granitemoeshared", "GraniteMoeSharedForCausalLM"), # noqa: E501
"GraniteMoeHybridForCausalLM": ("granitemoehybrid", "GraniteMoeHybridForCausalLM"), # noqa: E501
"GraniteMoeSharedForCausalLM": ("granitemoeshared", "GraniteMoeSharedForCausalLM"), # noqa: E501
"GritLM": ("gritlm", "GritLM"),
"Grok1ModelForCausalLM": ("grok1", "Grok1ForCausalLM"),
"HunYuanMoEV1ForCausalLM": ("hunyuan_v1", "HunYuanMoEV1ForCausalLM"),
......@@ -127,7 +126,7 @@ _TEXT_GENERATION_MODELS = {
"LongcatFlashForCausalLM": ("longcat_flash", "LongcatFlashForCausalLM"),
"MambaForCausalLM": ("mamba", "MambaForCausalLM"),
"FalconMambaForCausalLM": ("mamba", "MambaForCausalLM"),
"FalconH1ForCausalLM":("falcon_h1", "FalconH1ForCausalLM"),
"FalconH1ForCausalLM": ("falcon_h1", "FalconH1ForCausalLM"),
"Mamba2ForCausalLM": ("mamba2", "Mamba2ForCausalLM"),
"MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"),
"MiniCPM3ForCausalLM": ("minicpm3", "MiniCPM3ForCausalLM"),
......@@ -184,7 +183,8 @@ _EMBEDDING_MODELS = {
"LlamaModel": ("llama", "LlamaForCausalLM"),
**{
# Multiple models share the same architecture, so we include them all
k: (mod, arch) for k, (mod, arch) in _TEXT_GENERATION_MODELS.items()
k: (mod, arch)
for k, (mod, arch) in _TEXT_GENERATION_MODELS.items()
if arch == "LlamaForCausalLM"
},
"MistralModel": ("llama", "LlamaForCausalLM"),
......@@ -201,7 +201,10 @@ _EMBEDDING_MODELS = {
"XLMRobertaModel": ("roberta", "RobertaEmbeddingModel"),
# [Multimodal]
"CLIPModel": ("clip", "CLIPEmbeddingModel"),
"LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501
"LlavaNextForConditionalGeneration": (
"llava_next",
"LlavaNextForConditionalGeneration",
), # noqa: E501
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
"Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501
# Technically Terratorch models work on images, both in
......@@ -214,79 +217,150 @@ _EMBEDDING_MODELS = {
_CROSS_ENCODER_MODELS = {
"BertForSequenceClassification": ("bert", "BertForSequenceClassification"),
"BertForTokenClassification": ("bert", "BertForTokenClassification"),
"GteNewForSequenceClassification": ("bert_with_rope",
"GteNewForSequenceClassification"),
"ModernBertForSequenceClassification": ("modernbert",
"ModernBertForSequenceClassification"),
"RobertaForSequenceClassification": ("roberta",
"RobertaForSequenceClassification"),
"XLMRobertaForSequenceClassification": ("roberta",
"RobertaForSequenceClassification"),
"GteNewForSequenceClassification": (
"bert_with_rope",
"GteNewForSequenceClassification",
),
"ModernBertForSequenceClassification": (
"modernbert",
"ModernBertForSequenceClassification",
),
"RobertaForSequenceClassification": ("roberta", "RobertaForSequenceClassification"),
"XLMRobertaForSequenceClassification": (
"roberta",
"RobertaForSequenceClassification",
),
# [Auto-converted (see adapters.py)]
"JinaVLForRanking": ("jina_vl", "JinaVLForSequenceClassification"), # noqa: E501,
"JinaVLForRanking": ("jina_vl", "JinaVLForSequenceClassification"), # noqa: E501,
}
_MULTIMODAL_MODELS = {
# [Decoder-only]
"AriaForConditionalGeneration": ("aria", "AriaForConditionalGeneration"),
"AyaVisionForConditionalGeneration": ("aya_vision", "AyaVisionForConditionalGeneration"), # noqa: E501
"AyaVisionForConditionalGeneration": (
"aya_vision",
"AyaVisionForConditionalGeneration",
), # noqa: E501
"Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"),
"ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), # noqa: E501
"Cohere2VisionForConditionalGeneration": ("cohere2_vision", "Cohere2VisionForConditionalGeneration"), # noqa: E501
"ChameleonForConditionalGeneration": (
"chameleon",
"ChameleonForConditionalGeneration",
), # noqa: E501
"Cohere2VisionForConditionalGeneration": (
"cohere2_vision",
"Cohere2VisionForConditionalGeneration",
), # noqa: E501
"DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"),
"DotsOCRForCausalLM": ("dots_ocr", "DotsOCRForCausalLM"),
"Ernie4_5_VLMoeForConditionalGeneration": ("ernie45_vl", "Ernie4_5_VLMoeForConditionalGeneration"), # noqa: E501
"Ernie4_5_VLMoeForConditionalGeneration": (
"ernie45_vl",
"Ernie4_5_VLMoeForConditionalGeneration",
), # noqa: E501
"FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
"Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"), # noqa: E501
"Gemma3nForConditionalGeneration": ("gemma3n_mm", "Gemma3nForConditionalGeneration"), # noqa: E501
"Gemma3nForConditionalGeneration": (
"gemma3n_mm",
"Gemma3nForConditionalGeneration",
), # noqa: E501
"GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"),
"Glm4vForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"), # noqa: E501
"Glm4vMoeForConditionalGeneration": ("glm4_1v", "Glm4vMoeForConditionalGeneration"), # noqa: E501
"GraniteSpeechForConditionalGeneration": ("granite_speech", "GraniteSpeechForConditionalGeneration"), # noqa: E501
"GraniteSpeechForConditionalGeneration": (
"granite_speech",
"GraniteSpeechForConditionalGeneration",
), # noqa: E501
"H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
"InternVLChatModel": ("internvl", "InternVLChatModel"),
"NemotronH_Nano_VL_V2": ("nano_nemotron_vl", "NemotronH_Nano_VL_V2"),
"InternS1ForConditionalGeneration": ("interns1", "InternS1ForConditionalGeneration"), # noqa: E501
"InternVLForConditionalGeneration": ("interns1", "InternS1ForConditionalGeneration"), # noqa: E501
"Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"),
"SmolVLMForConditionalGeneration": ("smolvlm","SmolVLMForConditionalGeneration"), # noqa: E501
"InternS1ForConditionalGeneration": (
"interns1",
"InternS1ForConditionalGeneration",
), # noqa: E501
"InternVLForConditionalGeneration": (
"interns1",
"InternS1ForConditionalGeneration",
), # noqa: E501
"Idefics3ForConditionalGeneration": (
"idefics3",
"Idefics3ForConditionalGeneration",
),
"SmolVLMForConditionalGeneration": ("smolvlm", "SmolVLMForConditionalGeneration"), # noqa: E501
"KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"),
"KeyeVL1_5ForConditionalGeneration": ("keye_vl1_5", "KeyeVL1_5ForConditionalGeneration"), # noqa: E501
"KeyeVL1_5ForConditionalGeneration": (
"keye_vl1_5",
"KeyeVL1_5ForConditionalGeneration",
), # noqa: E501
"RForConditionalGeneration": ("rvl", "RForConditionalGeneration"),
"KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"), # noqa: E501
"Llama_Nemotron_Nano_VL": ("nemotron_vl", "LlamaNemotronVLChatModel"),
"Llama4ForConditionalGeneration": ("mllama4", "Llama4ForConditionalGeneration"), # noqa: E501
"LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
"LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501
"LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501
"LlavaOnevisionForConditionalGeneration": ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), # noqa: E501
"LlavaNextForConditionalGeneration": (
"llava_next",
"LlavaNextForConditionalGeneration",
), # noqa: E501
"LlavaNextVideoForConditionalGeneration": (
"llava_next_video",
"LlavaNextVideoForConditionalGeneration",
), # noqa: E501
"LlavaOnevisionForConditionalGeneration": (
"llava_onevision",
"LlavaOnevisionForConditionalGeneration",
), # noqa: E501
"MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"), # noqa: E501
"MiDashengLMModel": ("midashenglm", "MiDashengLMModel"),
"MiniMaxVL01ForConditionalGeneration": ("minimax_vl_01", "MiniMaxVL01ForConditionalGeneration"), # noqa: E501
"MiniMaxVL01ForConditionalGeneration": (
"minimax_vl_01",
"MiniMaxVL01ForConditionalGeneration",
), # noqa: E501
"MiniCPMO": ("minicpmo", "MiniCPMO"),
"MiniCPMV": ("minicpmv", "MiniCPMV"),
"Mistral3ForConditionalGeneration": ("mistral3", "Mistral3ForConditionalGeneration"), # noqa: E501
"Mistral3ForConditionalGeneration": (
"mistral3",
"Mistral3ForConditionalGeneration",
), # noqa: E501
"MolmoForCausalLM": ("molmo", "MolmoForCausalLM"),
"NVLM_D": ("nvlm_d", "NVLM_D_Model"),
"Ovis": ("ovis", "Ovis"),
"Ovis2_5": ("ovis2_5", "Ovis2_5"),
"PaliGemmaForConditionalGeneration": ("paligemma", "PaliGemmaForConditionalGeneration"), # noqa: E501
"PaliGemmaForConditionalGeneration": (
"paligemma",
"PaliGemmaForConditionalGeneration",
), # noqa: E501
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
"Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
"Phi4MultimodalForCausalLM": ("phi4_multimodal", "Phi4MultimodalForCausalLM"), # noqa: E501
"PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"), # noqa: E501
"QwenVLForConditionalGeneration": ("qwen_vl", "QwenVLForConditionalGeneration"), # noqa: E501
"Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501
"Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), # noqa: E501
"Qwen2AudioForConditionalGeneration": ("qwen2_audio", "Qwen2AudioForConditionalGeneration"), # noqa: E501
"Qwen2_5OmniModel": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501
"Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501
"Qwen2_5_VLForConditionalGeneration": (
"qwen2_5_vl",
"Qwen2_5_VLForConditionalGeneration",
), # noqa: E501
"Qwen2AudioForConditionalGeneration": (
"qwen2_audio",
"Qwen2AudioForConditionalGeneration",
), # noqa: E501
"Qwen2_5OmniModel": (
"qwen2_5_omni_thinker",
"Qwen2_5OmniThinkerForConditionalGeneration",
), # noqa: E501
"Qwen2_5OmniForConditionalGeneration": (
"qwen2_5_omni_thinker",
"Qwen2_5OmniThinkerForConditionalGeneration",
), # noqa: E501
"Qwen3VLForConditionalGeneration": ("qwen3_vl", "Qwen3VLForConditionalGeneration"), # noqa: E501
"Qwen3VLMoeForConditionalGeneration": ("qwen3_vl_moe", "Qwen3VLMoeForConditionalGeneration"), # noqa: E501
"Qwen3VLMoeForConditionalGeneration": (
"qwen3_vl_moe",
"Qwen3VLMoeForConditionalGeneration",
), # noqa: E501
"SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"),
"Step3VLForConditionalGeneration": ("step3_vl", "Step3VLForConditionalGeneration"), # noqa: E501
"TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"), # noqa: E501
"Tarsier2ForConditionalGeneration": ("qwen2_vl", "Tarsier2ForConditionalGeneration"), # noqa: E501
"Tarsier2ForConditionalGeneration": (
"qwen2_vl",
"Tarsier2ForConditionalGeneration",
), # noqa: E501
"UltravoxModel": ("ultravox", "UltravoxModel"),
"VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501
# [Encoder-decoder]
......@@ -324,13 +398,27 @@ _TRANSFORMERS_BACKEND_MODELS = {
"TransformersForCausalLM": ("transformers", "TransformersForCausalLM"),
"TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), # noqa: E501
"TransformersMoEForCausalLM": ("transformers_moe", "TransformersMoEForCausalLM"), # noqa: E501
"TransformersMoEForMultimodalLM": ("transformers_moe", "TransformersMoEForMultimodalLM"), # noqa: E501
"TransformersEmbeddingModel": ("transformers_pooling", "TransformersEmbeddingModel"), # noqa: E501
"TransformersForSequenceClassification": ("transformers_pooling", "TransformersForSequenceClassification"), # noqa: E501
"TransformersMoEForSequenceClassification": ("transformers_pooling", "TransformersMoEForSequenceClassification"), # noqa: E501
"TransformersMoEEmbeddingModel": ("transformers_pooling", "TransformersMoEEmbeddingModel"), # noqa: E501
"TransformersMoEForMultimodalLM": (
"transformers_moe",
"TransformersMoEForMultimodalLM",
), # noqa: E501
"TransformersEmbeddingModel": (
"transformers_pooling",
"TransformersEmbeddingModel",
), # noqa: E501
"TransformersForSequenceClassification": (
"transformers_pooling",
"TransformersForSequenceClassification",
), # noqa: E501
"TransformersMoEForSequenceClassification": (
"transformers_pooling",
"TransformersMoEForSequenceClassification",
), # noqa: E501
"TransformersMoEEmbeddingModel": (
"transformers_pooling",
"TransformersMoEEmbeddingModel",
), # noqa: E501
}
# yapf: enable
_VLLM_MODELS = {
**_TEXT_GENERATION_MODELS,
......
......@@ -8,13 +8,10 @@ from transformers import SmolVLMProcessor
from vllm.config import VllmConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
# yapf: disable
from .idefics3 import Idefics3DummyInputsBuilder as SmolVLMDummyInputsBuilder
from .idefics3 import Idefics3ForConditionalGeneration, Idefics3ProcessingInfo
from .idefics3 import Idefics3MultiModalProcessor as SmolVLMMultiModalProcessor
# yapf: enable
class SmolVLMProcessingInfo(Idefics3ProcessingInfo):
def get_hf_processor(self, **kwargs: object) -> SmolVLMProcessor:
......
......@@ -32,11 +32,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models import SupportsPP
from vllm.model_executor.models.module_mapping import MultiModelKeys
# yapf: disable
from vllm.model_executor.models.whisper import WhisperEncoder
# yapf: enable
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
MultiModalDataDict,
......
......@@ -28,7 +28,6 @@ def _get_minicpmv_chat_template_fallback(tokenizer_name_or_path: str) -> Optiona
return CHAT_TEMPLATES_DIR / "template_chatml.jinja"
# yapf: disable
_MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
"blip-2": CHAT_TEMPLATES_DIR / "template_blip2.jinja",
"clip": CHAT_TEMPLATES_DIR / "template_basic.jinja",
......@@ -39,7 +38,6 @@ _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
"paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja",
"qwen": _get_qwen_chat_template_fallback,
}
# yapf: enable
def register_chat_template_fallback_path(
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# yapf: disable
# ruff: noqa: E501
# coding=utf-8
# Copied from
# https://huggingface.co/Snowflake/snowflake-arctic-instruct/blob/main/configuration_arctic.py
""" Arctic model configuration"""
"""Arctic model configuration"""
from dataclasses import asdict, dataclass
from typing import Any
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# yapf: disable
# ruff: noqa: E501
# Adapted from
# https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/configuration.py
......@@ -16,7 +15,7 @@ from transformers.dynamic_module_utils import get_class_from_dynamic_module
class Nemotron_Nano_VL_Config(PretrainedConfig):
model_type = 'Llama_Nemotron_Nano_VL'
model_type = "Llama_Nemotron_Nano_VL"
is_composition = True
def __init__(
......@@ -26,17 +25,22 @@ class Nemotron_Nano_VL_Config(PretrainedConfig):
force_image_size=None,
downsample_ratio=0.5,
template=None,
ps_version='v1',
ps_version="v1",
image_tag_type="internvl",
projector_hidden_size=4096,
vit_hidden_size=1280,
**kwargs
**kwargs,
):
super().__init__(**kwargs)
if vision_config is not None:
assert "auto_map" in vision_config and "AutoConfig" in vision_config["auto_map"]
vision_auto_config = get_class_from_dynamic_module(*vision_config["auto_map"]["AutoConfig"].split("--")[::-1])
assert (
"auto_map" in vision_config
and "AutoConfig" in vision_config["auto_map"]
)
vision_auto_config = get_class_from_dynamic_module(
*vision_config["auto_map"]["AutoConfig"].split("--")[::-1]
)
self.vision_config = vision_auto_config(**vision_config)
else:
self.vision_config = PretrainedConfig()
......@@ -51,6 +55,6 @@ class Nemotron_Nano_VL_Config(PretrainedConfig):
self.downsample_ratio = downsample_ratio
self.template = template # TODO move out of here and into the tokenizer
self.ps_version = ps_version # Pixel shuffle version
self.image_tag_type = image_tag_type # TODO: into the tokenizer too?
self.image_tag_type = image_tag_type # TODO: into the tokenizer too?
self.projector_hidden_size = projector_hidden_size
self.vit_hidden_size = vit_hidden_size
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# yapf: disable
# ruff: noqa: E501
# adapted from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py
# and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py
......@@ -70,34 +69,37 @@ class AIMv2Config(PretrainedConfig):
# Visual Tokenizer Configuration
# ----------------------------------------------------------------------
class BaseVisualTokenizerConfig(PretrainedConfig):
def __init__(self,
vocab_size=16384,
tokenize_function="softmax",
tau=1.0,
depths=None,
drop_cls_token=False,
backbone_config: Optional[Union[PretrainedConfig,
dict]] = None,
hidden_stride: int = 1,
**kwargs):
def __init__(
self,
vocab_size=16384,
tokenize_function="softmax",
tau=1.0,
depths=None,
drop_cls_token=False,
backbone_config: Optional[Union[PretrainedConfig, dict]] = None,
hidden_stride: int = 1,
**kwargs,
):
super().__init__(**kwargs)
self.vocab_size = vocab_size
self.tokenize_function = tokenize_function
self.tau = tau
if isinstance(depths, str):
depths = [int(x) for x in depths.split('|')]
depths = [int(x) for x in depths.split("|")]
self.depths = depths
self.backbone_kwargs = dict[str, Any]()
self.drop_cls_token = drop_cls_token
if backbone_config is not None:
assert isinstance(backbone_config, (PretrainedConfig, dict)), \
assert isinstance(backbone_config, (PretrainedConfig, dict)), (
f"expect `backbone_config` to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type"
)
if not isinstance(backbone_config, PretrainedConfig):
model_type = backbone_config['model_type']
model_type = backbone_config["model_type"]
if model_type != "aimv2":
backbone_config.pop('model_type')
backbone_config = AutoConfig.for_model(model_type, **backbone_config)
backbone_config.pop("model_type")
backbone_config = AutoConfig.for_model(
model_type, **backbone_config
)
else:
backbone_config = AIMv2Config(**backbone_config)
self.backbone_config = backbone_config
......@@ -113,7 +115,7 @@ class Aimv2VisualTokenizerConfig(BaseVisualTokenizerConfig):
self.drop_cls_token = False
if self.depths:
assert len(self.depths) == 1
self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
self.backbone_kwargs["num_hidden_layers"] = self.depths[0]
class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
......@@ -125,7 +127,7 @@ class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
self.drop_cls_token = False
if self.depths:
assert len(self.depths) == 1
self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
self.backbone_kwargs["num_hidden_layers"] = self.depths[0]
AutoConfig.register("siglip_visual_tokenizer", SiglipVisualTokenizerConfig)
......@@ -138,35 +140,39 @@ AutoConfig.register("aimv2_visual_tokenizer", Aimv2VisualTokenizerConfig)
class OvisConfig(PretrainedConfig):
model_type = "ovis"
def __init__(self,
llm_config: Optional[Union[PretrainedConfig, dict]] = None,
visual_tokenizer_config: Optional[Union[PretrainedConfig,
dict]] = None,
multimodal_max_length=8192,
hidden_size=None,
conversation_formatter_class=None,
llm_attn_implementation=None,
disable_tie_weight=False,
**kwargs):
def __init__(
self,
llm_config: Optional[Union[PretrainedConfig, dict]] = None,
visual_tokenizer_config: Optional[Union[PretrainedConfig, dict]] = None,
multimodal_max_length=8192,
hidden_size=None,
conversation_formatter_class=None,
llm_attn_implementation=None,
disable_tie_weight=False,
**kwargs,
):
super().__init__(**kwargs)
if llm_config is not None:
assert isinstance(llm_config, (PretrainedConfig, dict)), \
assert isinstance(llm_config, (PretrainedConfig, dict)), (
f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type"
)
if not isinstance(llm_config, PretrainedConfig):
model_type = llm_config['model_type']
llm_config.pop('model_type')
model_type = llm_config["model_type"]
llm_config.pop("model_type")
llm_config = AutoConfig.for_model(model_type, **llm_config)
# map llm_config to text_config
self.text_config = llm_config
if visual_tokenizer_config is not None:
assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \
assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), (
f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type"
)
if not isinstance(visual_tokenizer_config, PretrainedConfig):
model_type = visual_tokenizer_config['model_type']
visual_tokenizer_config.pop('model_type')
model_type = visual_tokenizer_config["model_type"]
visual_tokenizer_config.pop("model_type")
visual_tokenizer_config = AutoConfig.for_model(
model_type, **visual_tokenizer_config)
model_type, **visual_tokenizer_config
)
self.visual_tokenizer_config = visual_tokenizer_config
self.multimodal_max_length = multimodal_max_length
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# yapf: disable
# ruff: noqa: E501
# coding=utf-8
# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py
......@@ -35,11 +34,12 @@ from transformers.processing_utils import ProcessorMixin
class ImageTransform:
def __init__(self,
mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
std: tuple[float, float, float] = (0.5, 0.5, 0.5),
normalize: bool = True):
def __init__(
self,
mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
std: tuple[float, float, float] = (0.5, 0.5, 0.5),
normalize: bool = True,
):
self.mean = mean
self.std = std
self.normalize = normalize
......@@ -77,7 +77,6 @@ class DeepseekVLV2Processor(ProcessorMixin):
ignore_id: int = -100,
**kwargs,
):
self.candidate_resolutions = candidate_resolutions
self.image_size = candidate_resolutions[0][0]
self.patch_size = patch_size
......@@ -86,13 +85,15 @@ class DeepseekVLV2Processor(ProcessorMixin):
self.normalize = normalize
self.downsample_ratio = downsample_ratio
self.image_transform = ImageTransform(mean=image_mean, std=image_std, normalize=normalize)
self.image_transform = ImageTransform(
mean=image_mean, std=image_std, normalize=normalize
)
self.tokenizer = tokenizer
self.tokenizer.padding_side = 'left' # must set this,padding side with make a difference in batch inference
self.tokenizer.padding_side = "left" # must set this,padding side with make a difference in batch inference
# add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
if tokenizer.pad_token is None:
self.tokenizer.add_special_tokens({'pad_token': pad_token})
self.tokenizer.add_special_tokens({"pad_token": pad_token})
# add image token
image_token_id = self.tokenizer.vocab.get(image_token)
......@@ -104,7 +105,7 @@ class DeepseekVLV2Processor(ProcessorMixin):
# add five special tokens for grounding-related tasks
# <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
special_tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>']
special_tokens = ["<|ref|>", "<|/ref|>", "<|det|>", "<|/det|>", "<|grounding|>"]
special_tokens_dict = {"additional_special_tokens": special_tokens}
self.tokenizer.add_special_tokens(special_tokens_dict)
......@@ -134,15 +135,19 @@ class DeepseekVLV2Processor(ProcessorMixin):
for width, height in self.candidate_resolutions:
scale = min(width / original_width, height / original_height)
downscaled_width, downscaled_height = int(
original_width * scale), int(original_height * scale)
effective_resolution = min(downscaled_width * downscaled_height,
original_width * original_height)
downscaled_width, downscaled_height = (
int(original_width * scale),
int(original_height * scale),
)
effective_resolution = min(
downscaled_width * downscaled_height, original_width * original_height
)
wasted_resolution = (width * height) - effective_resolution
if effective_resolution > max_effective_resolution or (
effective_resolution == max_effective_resolution
and wasted_resolution < min_wasted_resolution):
effective_resolution == max_effective_resolution
and wasted_resolution < min_wasted_resolution
):
max_effective_resolution = effective_resolution
min_wasted_resolution = wasted_resolution
best_fit = (width, height)
......@@ -198,12 +203,20 @@ class DeepseekVLV2Processor(ProcessorMixin):
- num_image_tokens (list[int]): the number of image tokens
"""
assert (prompt is not None and images is not None
), "prompt and images must be used at the same time."
assert prompt is not None and images is not None, (
"prompt and images must be used at the same time."
)
sft_format = prompt
tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens = self.tokenize_with_images(
sft_format, images, bos=True, eos=True, cropping=len(images) <= 2)
(
tokenized_str,
images_list,
images_seq_mask,
images_spatial_crop,
num_image_tokens,
) = self.tokenize_with_images(
sft_format, images, bos=True, eos=True, cropping=len(images) <= 2
)
masked_tokenized_str = []
for token_index in tokenized_str:
if token_index != self.image_token_id:
......@@ -211,17 +224,21 @@ class DeepseekVLV2Processor(ProcessorMixin):
else:
masked_tokenized_str.append(self.ignore_id)
assert len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str), \
(f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
f"imags_seq_mask's length {len(images_seq_mask)}, are not equal")
assert (
len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str)
), (
f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
f"imags_seq_mask's length {len(images_seq_mask)}, are not equal"
)
input_ids = torch.LongTensor(tokenized_str)
target_ids = torch.LongTensor(masked_tokenized_str)
images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
# set input_ids < 0 | input_ids == self.image_token_id as ignore_id
target_ids[(input_ids < 0) |
(input_ids == self.image_token_id)] = self.ignore_id
target_ids[(input_ids < 0) | (input_ids == self.image_token_id)] = (
self.ignore_id
)
input_ids[input_ids < 0] = self.pad_id
if inference_mode:
......@@ -311,30 +328,50 @@ class DeepseekVLV2Processor(ProcessorMixin):
best_width, best_height = self.image_size, self.image_size
"""process the global view"""
global_view = ImageOps.pad(image, (self.image_size, self.image_size),
color=tuple(int(x * 255) for x in self.image_transform.mean))
global_view = ImageOps.pad(
image,
(self.image_size, self.image_size),
color=tuple(int(x * 255) for x in self.image_transform.mean),
)
images_list.append(self.image_transform(global_view))
"""process the local views"""
local_view = ImageOps.pad(image, (best_width, best_height),
color=tuple(int(x * 255) for x in self.image_transform.mean))
local_view = ImageOps.pad(
image,
(best_width, best_height),
color=tuple(int(x * 255) for x in self.image_transform.mean),
)
for i in range(0, best_height, self.image_size):
for j in range(0, best_width, self.image_size):
images_list.append(
self.image_transform(local_view.crop((j, i, j + self.image_size, i + self.image_size))))
self.image_transform(
local_view.crop(
(j, i, j + self.image_size, i + self.image_size)
)
)
)
"""record height / width crop num"""
num_width_tiles, num_height_tiles = best_width // self.image_size, best_height // self.image_size
num_width_tiles, num_height_tiles = (
best_width // self.image_size,
best_height // self.image_size,
)
images_spatial_crop.append([num_width_tiles, num_height_tiles])
"""add image tokens"""
h = w = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio)
h = w = math.ceil(
(self.image_size // self.patch_size) / self.downsample_ratio
)
# global views tokens h * (w + 1), 1 is for line separator
tokenized_image = [self.image_token_id] * h * (w + 1)
# add a separator between global and local views
tokenized_image += [self.image_token_id]
# local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
tokenized_image += [self.image_token_id] * (num_height_tiles * h) * (num_width_tiles * w + 1)
tokenized_image += (
[self.image_token_id]
* (num_height_tiles * h)
* (num_width_tiles * w + 1)
)
tokenized_str += tokenized_image
images_seq_mask += [True] * len(tokenized_image)
......@@ -353,10 +390,17 @@ class DeepseekVLV2Processor(ProcessorMixin):
tokenized_str = tokenized_str + [self.eos_id]
images_seq_mask = images_seq_mask + [False]
assert len(tokenized_str) == len(
images_seq_mask), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
assert len(tokenized_str) == len(images_seq_mask), (
f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
)
return tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens
return (
tokenized_str,
images_list,
images_seq_mask,
images_spatial_crop,
num_image_tokens,
)
AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# yapf: disable
# ruff: noqa: E501
# coding=utf-8
# adapted from https://github.com/AIDC-AI/Ovis/blob/35ab51a1a1e3542fa6db260a1084cefbc8f164bb/ovis/vllm/processing_ovis.py
......@@ -35,23 +34,24 @@ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from vllm.multimodal.image import convert_image_mode
__all__ = ['OvisProcessor']
__all__ = ["OvisProcessor"]
IGNORE_ID = -100
class OvisProcessorKwargs(ProcessingKwargs, total=False):  # type: ignore[call-arg]
    # Default keyword arguments, keyed by kwarg group ("text_kwargs",
    # "images_kwargs"). The image defaults carry the same names as the
    # parameters of OvisProcessor.preprocess_image (max_partition,
    # covering_threshold, convert_to_rgb, return_tensors).
    _defaults = {
        "text_kwargs": {
            "padding": False,
        },
        "images_kwargs": {
            "max_partition": 9,
            "covering_threshold": 0.9,
            "convert_to_rgb": True,
            "return_tensors": "pt",
        },
    }
class OvisProcessor(ProcessorMixin):
r"""
Constructs an Ovis processor which wraps an Ovis image processor and a Qwen2 tokenizer into a single processor.
......@@ -97,14 +97,16 @@ class OvisProcessor(ProcessorMixin):
"image_col_sep": -303,
"image_row_sep": -304,
"image_end": -305,
'image_pad': image_pad_token_id,
"image_pad": image_pad_token_id,
}
return extra_special_tokens
def __call__(
self,
images: ImageInput = None,
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
text: Union[
TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]
] = None,
**kwargs: Unpack[OvisProcessorKwargs],
) -> BatchFeature:
"""
......@@ -169,7 +171,6 @@ class OvisProcessor(ProcessorMixin):
# Process text input
if text is not None:
if not isinstance(text, list):
text = [text]
......@@ -178,7 +179,10 @@ class OvisProcessor(ProcessorMixin):
replaced_ids_list = []
idx = 0
for ids_tensor in tokenized_batched_text:
if image_token_id in ids_tensor and "image_placeholders" in image_features:
if (
image_token_id in ids_tensor
and "image_placeholders" in image_features
):
if idx < len(image_features["image_placeholders"]):
# Converts in list for ease of use
ids_list = ids_tensor.tolist()
......@@ -188,7 +192,9 @@ class OvisProcessor(ProcessorMixin):
# replace placeholders
for i, token_id in enumerate(ids_list):
if token_id == image_token_id:
placeholder_ids = image_features["image_placeholders"][idx]
placeholder_ids = image_features["image_placeholders"][
idx
]
new_ids.extend(placeholder_ids)
idx += 1
else:
......@@ -198,7 +204,8 @@ class OvisProcessor(ProcessorMixin):
ids_tensor = torch.tensor(new_ids, dtype=torch.long)
else:
raise RuntimeError(
'Mismatch between the images you provided and the number of placeholder present in the text')
"Mismatch between the images you provided and the number of placeholder present in the text"
)
replaced_ids_list.append(ids_tensor)
......@@ -217,7 +224,7 @@ class OvisProcessor(ProcessorMixin):
# Add image features if present
if image_features:
output["pixel_values"] = processed_images
output['grids'] = grids
output["grids"] = grids
return output
......@@ -227,8 +234,10 @@ class OvisProcessor(ProcessorMixin):
def _tokenize_with_image_symbol(self, text_list: list[str]) -> torch.LongTensor:
batch_token_ids = []
for text in text_list:
text_chunks = [self.tokenizer(chunk, add_special_tokens=False).input_ids for chunk in
text.split(self.image_token)]
text_chunks = [
self.tokenizer(chunk, add_special_tokens=False).input_ids
for chunk in text.split(self.image_token)
]
token_ids = []
num_chuck = len(text_chunks)
for i, chunk in enumerate(text_chunks):
......@@ -240,50 +249,60 @@ class OvisProcessor(ProcessorMixin):
def get_image_size(self):
    """Return the target image size as a ``(height, width)`` tuple.

    The size is read from ``self.image_processor.size``:

    * if it has a ``"shortest_edge"`` entry, that value is used for both
      height and width;
    * otherwise, if it has both ``"height"`` and ``"width"``, those are used.

    Raises:
        ValueError: if neither form is present in the size mapping.
    """
    size = self.image_processor.size
    # "shortest_edge" takes precedence over explicit height/width entries.
    if "shortest_edge" in size:
        width = height = size["shortest_edge"]
    elif "height" in size and "width" in size:
        width = size["width"]
        height = size["height"]
    else:
        raise ValueError("Can't parse image size from image_processor config.")
    return height, width
def get_token_value(self, tok):
    """Look up ``tok`` in ``self.extra_special_tokens`` and return its value.

    Raises:
        KeyError: if ``tok`` is not a key of ``self.extra_special_tokens``
            (assuming it is a plain mapping).
    """
    return self.extra_special_tokens[tok]
def construct_image_indicators(self, grid):
    """Build the list of indicator token values describing an image grid.

    The list always starts with the ``image_start``, ``image_atom`` and
    ``image_prefix`` values and ends with ``image_end``. When the grid has
    more than one cell (``grid[0] * grid[1] > 1``), one ``image_atom`` is
    added per cell, with ``image_col_sep`` between cells of the same row and
    ``image_row_sep`` between rows.

    Args:
        grid: indexable pair; ``grid[0]`` drives the outer (row) loop and
            ``grid[1]`` the inner (column) loop.

    Returns:
        list of token values obtained through ``self.get_token_value``.
    """
    image_placeholders = [
        self.get_token_value("image_start"),
        self.get_token_value("image_atom"),
        self.get_token_value("image_prefix"),
    ]
    if grid[0] * grid[1] > 1:
        for r in range(grid[0]):
            for c in range(grid[1]):
                image_placeholders.append(self.get_token_value("image_atom"))
                # Separator only between columns, not after the last one.
                if c < grid[1] - 1:
                    image_placeholders.append(self.get_token_value("image_col_sep"))
            # Separator only between rows, not after the last one.
            if r < grid[0] - 1:
                image_placeholders.append(self.get_token_value("image_row_sep"))
    image_placeholders.append(self.get_token_value("image_end"))
    return image_placeholders
def construct_image_placeholders(self, grid):
    """Expand the grid's indicator tokens into a run of padding tokens.

    Every indicator produced by ``self.construct_image_indicators(grid)``
    contributes one ``image_pad`` value; each indicator equal to the
    ``image_atom`` value additionally contributes ``self.image_segment_len``
    more ``image_pad`` values.

    Args:
        grid: forwarded unchanged to ``self.construct_image_indicators``.

    Returns:
        list containing only the ``image_pad`` token value.
    """
    image_placeholders = self.construct_image_indicators(grid)

    image_atom_token_id = self.get_token_value("image_atom")
    # Padding token value, looked up through get_token_value.
    image_padding_token_id = self.get_token_value("image_pad")

    # Create a new list with padding tokens inserted.
    padded_placeholder_tokens = []
    for token in image_placeholders:
        padded_placeholder_tokens.append(image_padding_token_id)
        if token == image_atom_token_id:
            padded_placeholder_tokens.extend(
                [image_padding_token_id] * self.image_segment_len
            )
    return padded_placeholder_tokens
def preprocess_image(self, image: PIL.Image.Image, max_partition, covering_threshold, convert_to_rgb, return_tensors):
def preprocess_image(
self,
image: PIL.Image.Image,
max_partition,
covering_threshold,
convert_to_rgb,
return_tensors,
):
def _preprocess(img: PIL.Image.Image, side):
# first resize and preprocess
w, h = img.size
......@@ -296,19 +315,27 @@ class OvisProcessor(ProcessorMixin):
new_height = side
new_width = int(w / h * new_height)
new_size = dict(height=new_height, width=new_width)
pixel_values = self.image_processor.preprocess(img, size=new_size, return_tensors=return_tensors)['pixel_values']
pixel_values = self.image_processor.preprocess(
img, size=new_size, return_tensors=return_tensors
)["pixel_values"]
# then pad to square
square_values = torch.zeros([1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device)
square_values = torch.zeros(
[1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device
)
new_height, new_width = pixel_values.shape[2:]
if new_height == new_width:
square_values[:, :, :, :] = pixel_values
elif new_height > new_width:
from_index = (side - new_width) // 2
square_values[:, :, :, from_index:from_index + new_width] = pixel_values
square_values[:, :, :, from_index : from_index + new_width] = (
pixel_values
)
else:
from_index = (side - new_height) // 2
square_values[:, :, from_index:from_index + new_height, :] = pixel_values
square_values[:, :, from_index : from_index + new_height, :] = (
pixel_values
)
return square_values
......@@ -350,7 +377,9 @@ class OvisProcessor(ProcessorMixin):
good_grids = []
for grid in candidate_grids:
partition = _partition(img, grid)
covering_ratio = sum([_covering_area(*p, side) for p in partition]) / img_area
covering_ratio = (
sum([_covering_area(*p, side) for p in partition]) / img_area
)
assert covering_ratio <= 1.0
all_grids.append((grid, covering_ratio))
if covering_ratio > covering_threshold:
......@@ -358,18 +387,19 @@ class OvisProcessor(ProcessorMixin):
if len(good_grids) > 0:
# pick the good partition with minimum #sub_images and break the tie using covering_ratio
return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][0]
return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][
0
]
else:
# pick the partition with maximum covering_ratio and break the tie using #sub_images
return sorted(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0][0]
if convert_to_rgb:
image = convert_image_mode(image, 'RGB')
image = convert_image_mode(image, "RGB")
sides = self.get_image_size()
if sides[0] != sides[1]:
raise ValueError('get_image_size() returns non-square size')
raise ValueError("get_image_size() returns non-square size")
side = sides[0]
grid = _get_best_grid(image, side)
partition = _partition(image, grid)
......@@ -405,14 +435,18 @@ class OvisProcessor(ProcessorMixin):
`list[str]`: The decoded text.
"""
return self.tokenizer.batch_decode(
generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
generated_outputs,
skip_special_tokens=True,
clean_up_tokenization_spaces=False,
)
@property
def model_input_names(self):
    """Return the combined model input names of tokenizer and image processor.

    The tokenizer's names come first, followed by the image processor's;
    duplicates are dropped while first-seen order is kept, and
    ``"second_per_grid_ts"`` is always appended at the end.
    """
    tokenizer_input_names = self.tokenizer.model_input_names
    image_processor_input_names = self.image_processor.model_input_names
    # dict.fromkeys de-duplicates while preserving insertion order.
    names_from_processor = list(
        dict.fromkeys(tokenizer_input_names + image_processor_input_names)
    )
    return names_from_processor + ["second_per_grid_ts"]
......
......@@ -40,9 +40,6 @@ from vllm.utils.flashinfer import (
supports_trtllm_attention,
use_trtllm_attention,
)
# yapf conflicts with isort for this block
# yapf: disable
from vllm.v1.attention.backends.utils import (
AttentionCGSupport,
AttentionMetadataBuilder,
......@@ -52,8 +49,6 @@ from vllm.v1.attention.backends.utils import (
infer_global_hyperparameters,
split_decodes_and_prefills,
)
# yapf: enable
from vllm.v1.kv_cache_interface import AttentionSpec
FLASHINFER_WORKSPACE_BUFFER_SIZE = 256 * 1024 * 1024
......
......@@ -11,9 +11,6 @@ from vllm.attention.backends.abstract import AttentionLayer
from vllm.attention.ops.rocm_aiter_mla import aiter_mla_decode_fwd
from vllm.config import VllmConfig
from vllm.utils import cdiv
# yapf conflicts with isort for this docstring
# yapf: disable
from vllm.v1.attention.backends.mla.common import (
MLACommonBackend,
MLACommonDecodeMetadata,
......@@ -24,8 +21,6 @@ from vllm.v1.attention.backends.mla.common import (
from vllm.v1.attention.backends.utils import AttentionCGSupport
from vllm.v1.kv_cache_interface import AttentionSpec
# yapf: enable
def is_aiter_mla_enabled() -> bool:
    """Return whether both ROCm AITER flags are set.

    Evaluates ``envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MLA``,
    so the result is truthy only when both settings are truthy.
    """
    return envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MLA
......
......@@ -18,8 +18,6 @@ from msgspec import msgpack
from vllm import envs
from vllm.logger import init_logger
# yapf: disable
from vllm.multimodal.inputs import (
BaseMultiModalField,
MultiModalBatchedField,
......@@ -32,8 +30,6 @@ from vllm.multimodal.inputs import (
MultiModalSharedField,
NestedTensors,
)
# yapf: enable
from vllm.v1.engine import UtilityResult
logger = init_logger(__name__)
......
......@@ -48,9 +48,6 @@ from vllm.model_executor.layers.mamba.abstract import MambaBase
from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader
from vllm.model_executor.models.deepseek_v2 import DeepseekV32IndexerCache
# yapf conflicts with isort for this block
# yapf: disable
from vllm.model_executor.models.interfaces import (
SupportsMultiModal,
is_mixture_of_experts,
......@@ -59,8 +56,6 @@ from vllm.model_executor.models.interfaces import (
supports_multimodal_pruning,
supports_transcription,
)
# yapf: enable
from vllm.model_executor.models.interfaces_base import (
VllmModelForPooling,
is_pooling_model,
......@@ -101,9 +96,6 @@ from vllm.v1.attention.backends.utils import (
split_attn_metadata,
)
from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
# yapf conflicts with isort for this block
# yapf: disable
from vllm.v1.kv_cache_interface import (
AttentionSpec,
ChunkedLocalAttentionSpec,
......@@ -118,8 +110,6 @@ from vllm.v1.kv_cache_interface import (
SlidingWindowSpec,
UniformTypeKVCacheSpecs,
)
# yapf: enable
from vllm.v1.outputs import (
EMPTY_MODEL_RUNNER_OUTPUT,
AsyncModelRunnerOutput,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment