Remove all references to `yapf` as it's no longer used (#26251)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

Remove all references to `yapf` as it's no longer used (#26251)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Commit 4e256cad (unverified signature) · author: Harry Mellor · committer: GitHub · also referenced: d6953beb (presumably the parent commit)
Authored by Harry Mellor on Oct 05, 2025; committed by GitHub on Oct 05, 2025.
18 changed files
--- a/vllm/model_executor/models/gemma3n_mm.py
+++ b/vllm/model_executor/models/gemma3n_mm.py
@@ -6,14 +6,16 @@ from typing import Annotated, Any, Literal, Optional, Union, cast
 import numpy as np
 import torch

-# yapf: disable
 from torch import nn
 from transformers import AutoModel, BatchFeature
-from transformers.models.gemma3n import (Gemma3nAudioConfig,
-                                         Gemma3nAudioFeatureExtractor,
-                                         Gemma3nConfig, Gemma3nProcessor,
-                                         Gemma3nTextConfig,
-                                         Gemma3nVisionConfig)
+from transformers.models.gemma3n import (
+    Gemma3nAudioConfig,
+    Gemma3nAudioFeatureExtractor,
+    Gemma3nConfig,
+    Gemma3nProcessor,
+    Gemma3nTextConfig,
+    Gemma3nVisionConfig,
+)
 from transformers.models.siglip import SiglipImageProcessorFast

 from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
@@ -22,25 +24,32 @@ from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import RowParallelLinear
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding)
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.models.gemma3n import Gemma3nForCausalLM
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.whisper import ISO639_1_SUPPORTED_LANGS
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems)
-from vllm.multimodal.parse import (ImageProcessorItems, MultiModalDataItems,
-                                   MultiModalDataParser)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo,
-                                        MultiModalPromptUpdates,
-                                        MultiModalPromptUpdatesApplyResult,
-                                        PlaceholderFeaturesInfo,
-                                        PromptReplacement, PromptUpdate,
-                                        PromptUpdateDetails,
-                                        replace_token_matches)
-# yapf: enable
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
+from vllm.multimodal.parse import (
+    ImageProcessorItems,
+    MultiModalDataItems,
+    MultiModalDataParser,
+)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    MultiModalPromptUpdates,
+    MultiModalPromptUpdatesApplyResult,
+    PlaceholderFeaturesInfo,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+    replace_token_matches,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape

--- a/vllm/model_executor/models/idefics3.py
+++ b/vllm/model_executor/models/idefics3.py
@@ -43,9 +43,6 @@ from vllm.multimodal.inputs import (
    MultiModalKwargsItems,
 )
 from vllm.multimodal.parse import ImageProcessorItems, ImageSize
-
-# yapf conflicts with isort for this block
-# yapf: disable
 from vllm.multimodal.processing import (
    BaseMultiModalProcessor,
    BaseProcessingInfo,
@@ -54,18 +51,13 @@ from vllm.multimodal.processing import (
    PromptUpdate,
    PromptUpdateDetails,
 )
-
-# yapf: enable
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape

-# yapf: disable
 from .idefics2_vision_model import (
    Idefics2VisionTransformer as Idefics3VisionTransformer,
 )
-
-# yapf: enable
 from .interfaces import MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal
 from .llama import LlamaModel
 from .utils import AutoWeightsLoader, maybe_prefix

--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -45,9 +45,6 @@ from vllm.multimodal.parse import (
    ImageSize,
    MultiModalDataItems,
 )
-
-# yapf conflicts with isort for this block
-# yapf: disable
 from vllm.multimodal.processing import (
    BaseMultiModalProcessor,
    BaseProcessingInfo,
@@ -57,8 +54,6 @@ from vllm.multimodal.processing import (
    PromptUpdate,
    ResolvedPromptUpdate,
 )
-
-# yapf: enable
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_list_of

--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -52,16 +52,12 @@ from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import get_act_and_mul_fn
 from vllm.model_executor.layers.layernorm import RMSNorm
-
-# yapf: disable
 from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
    MergedColumnParallelLinear,
    QKVParallelLinear,
    RowParallelLinear,
 )
-
-# yapf: enable
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.module_mapping import MultiModelKeys

--- a/vllm/model_executor/models/qwen3_next.py
+++ b/vllm/model_executor/models/qwen3_next.py
@@ -37,12 +37,7 @@ from vllm.model_executor.layers.fla.ops import (
    fused_recurrent_gated_delta_rule,
 )
 from vllm.model_executor.layers.fused_moe import FusedMoE
-
-# yapf conflicts with isort for this block
-# yapf: disable
 from vllm.model_executor.layers.layernorm import GemmaRMSNorm as Qwen3NextRMSNorm
-
-# yapf: enable
 from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
    QKVParallelLinear,

--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -54,7 +54,6 @@ from .interfaces_base import (

 logger = init_logger(__name__)

-# yapf: disable
 _TEXT_GENERATION_MODELS = {
    # [Decoder-only]
    "ApertusForCausalLM": ("apertus", "ApertusForCausalLM"),
@@ -106,8 +105,8 @@ _TEXT_GENERATION_MODELS = {
    "GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"),
    "GraniteForCausalLM": ("granite", "GraniteForCausalLM"),
    "GraniteMoeForCausalLM": ("granitemoe", "GraniteMoeForCausalLM"),
-    "GraniteMoeHybridForCausalLM": ("granitemoehybrid", "GraniteMoeHybridForCausalLM"),   # noqa: E501
-    "GraniteMoeSharedForCausalLM": ("granitemoeshared", "GraniteMoeSharedForCausalLM"),   # noqa: E501
+    "GraniteMoeHybridForCausalLM": ("granitemoehybrid", "GraniteMoeHybridForCausalLM"),  # noqa: E501
+    "GraniteMoeSharedForCausalLM": ("granitemoeshared", "GraniteMoeSharedForCausalLM"),  # noqa: E501
    "GritLM": ("gritlm", "GritLM"),
    "Grok1ModelForCausalLM": ("grok1", "Grok1ForCausalLM"),
    "HunYuanMoEV1ForCausalLM": ("hunyuan_v1", "HunYuanMoEV1ForCausalLM"),
@@ -127,7 +126,7 @@ _TEXT_GENERATION_MODELS = {
    "LongcatFlashForCausalLM": ("longcat_flash", "LongcatFlashForCausalLM"),
    "MambaForCausalLM": ("mamba", "MambaForCausalLM"),
    "FalconMambaForCausalLM": ("mamba", "MambaForCausalLM"),
-    "FalconH1ForCausalLM":("falcon_h1", "FalconH1ForCausalLM"),
+    "FalconH1ForCausalLM": ("falcon_h1", "FalconH1ForCausalLM"),
    "Mamba2ForCausalLM": ("mamba2", "Mamba2ForCausalLM"),
    "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"),
    "MiniCPM3ForCausalLM": ("minicpm3", "MiniCPM3ForCausalLM"),
@@ -184,7 +183,8 @@ _EMBEDDING_MODELS = {
    "LlamaModel": ("llama", "LlamaForCausalLM"),
    **{
        # Multiple models share the same architecture, so we include them all
-        k: (mod, arch) for k, (mod, arch) in _TEXT_GENERATION_MODELS.items()
+        k: (mod, arch)
+        for k, (mod, arch) in _TEXT_GENERATION_MODELS.items()
        if arch == "LlamaForCausalLM"
    },
    "MistralModel": ("llama", "LlamaForCausalLM"),
@@ -201,7 +201,10 @@ _EMBEDDING_MODELS = {
    "XLMRobertaModel": ("roberta", "RobertaEmbeddingModel"),
    # [Multimodal]
    "CLIPModel": ("clip", "CLIPEmbeddingModel"),
-    "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"),  # noqa: E501
+    "LlavaNextForConditionalGeneration": (
+        "llava_next",
+        "LlavaNextForConditionalGeneration",
+    ),  # noqa: E501
    "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
    "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),  # noqa: E501
    # Technically Terratorch models work on images, both in
@@ -214,79 +217,150 @@ _EMBEDDING_MODELS = {
 _CROSS_ENCODER_MODELS = {
    "BertForSequenceClassification": ("bert", "BertForSequenceClassification"),
    "BertForTokenClassification": ("bert", "BertForTokenClassification"),
-    "GteNewForSequenceClassification": ("bert_with_rope",
-                                        "GteNewForSequenceClassification"),
-    "ModernBertForSequenceClassification": ("modernbert",
-                                            "ModernBertForSequenceClassification"),
-    "RobertaForSequenceClassification": ("roberta",
-                                         "RobertaForSequenceClassification"),
-    "XLMRobertaForSequenceClassification": ("roberta",
-                                            "RobertaForSequenceClassification"),
+    "GteNewForSequenceClassification": (
+        "bert_with_rope",
+        "GteNewForSequenceClassification",
+    ),
+    "ModernBertForSequenceClassification": (
+        "modernbert",
+        "ModernBertForSequenceClassification",
+    ),
+    "RobertaForSequenceClassification": ("roberta", "RobertaForSequenceClassification"),
+    "XLMRobertaForSequenceClassification": (
+        "roberta",
+        "RobertaForSequenceClassification",
+    ),
    # [Auto-converted (see adapters.py)]
-    "JinaVLForRanking": ("jina_vl", "JinaVLForSequenceClassification"), # noqa: E501,
+    "JinaVLForRanking": ("jina_vl", "JinaVLForSequenceClassification"),  # noqa: E501,
 }

 _MULTIMODAL_MODELS = {
    # [Decoder-only]
    "AriaForConditionalGeneration": ("aria", "AriaForConditionalGeneration"),
-    "AyaVisionForConditionalGeneration": ("aya_vision", "AyaVisionForConditionalGeneration"),  # noqa: E501
+    "AyaVisionForConditionalGeneration": (
+        "aya_vision",
+        "AyaVisionForConditionalGeneration",
+    ),  # noqa: E501
    "Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"),
-    "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"),  # noqa: E501
-    "Cohere2VisionForConditionalGeneration": ("cohere2_vision", "Cohere2VisionForConditionalGeneration"),  # noqa: E501
+    "ChameleonForConditionalGeneration": (
+        "chameleon",
+        "ChameleonForConditionalGeneration",
+    ),  # noqa: E501
+    "Cohere2VisionForConditionalGeneration": (
+        "cohere2_vision",
+        "Cohere2VisionForConditionalGeneration",
+    ),  # noqa: E501
    "DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"),
    "DotsOCRForCausalLM": ("dots_ocr", "DotsOCRForCausalLM"),
-    "Ernie4_5_VLMoeForConditionalGeneration": ("ernie45_vl", "Ernie4_5_VLMoeForConditionalGeneration"),  # noqa: E501
+    "Ernie4_5_VLMoeForConditionalGeneration": (
+        "ernie45_vl",
+        "Ernie4_5_VLMoeForConditionalGeneration",
+    ),  # noqa: E501
    "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
    "Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"),  # noqa: E501
-    "Gemma3nForConditionalGeneration": ("gemma3n_mm", "Gemma3nForConditionalGeneration"),    # noqa: E501
+    "Gemma3nForConditionalGeneration": (
+        "gemma3n_mm",
+        "Gemma3nForConditionalGeneration",
+    ),  # noqa: E501
    "GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"),
    "Glm4vForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"),  # noqa: E501
    "Glm4vMoeForConditionalGeneration": ("glm4_1v", "Glm4vMoeForConditionalGeneration"),  # noqa: E501
-    "GraniteSpeechForConditionalGeneration": ("granite_speech", "GraniteSpeechForConditionalGeneration"),  # noqa: E501
+    "GraniteSpeechForConditionalGeneration": (
+        "granite_speech",
+        "GraniteSpeechForConditionalGeneration",
+    ),  # noqa: E501
    "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
    "InternVLChatModel": ("internvl", "InternVLChatModel"),
    "NemotronH_Nano_VL_V2": ("nano_nemotron_vl", "NemotronH_Nano_VL_V2"),
-    "InternS1ForConditionalGeneration": ("interns1", "InternS1ForConditionalGeneration"),  # noqa: E501
-    "InternVLForConditionalGeneration": ("interns1", "InternS1ForConditionalGeneration"),  # noqa: E501
-    "Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"),
-    "SmolVLMForConditionalGeneration": ("smolvlm","SmolVLMForConditionalGeneration"),  # noqa: E501
+    "InternS1ForConditionalGeneration": (
+        "interns1",
+        "InternS1ForConditionalGeneration",
+    ),  # noqa: E501
+    "InternVLForConditionalGeneration": (
+        "interns1",
+        "InternS1ForConditionalGeneration",
+    ),  # noqa: E501
+    "Idefics3ForConditionalGeneration": (
+        "idefics3",
+        "Idefics3ForConditionalGeneration",
+    ),
+    "SmolVLMForConditionalGeneration": ("smolvlm", "SmolVLMForConditionalGeneration"),  # noqa: E501
    "KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"),
-    "KeyeVL1_5ForConditionalGeneration": ("keye_vl1_5", "KeyeVL1_5ForConditionalGeneration"), # noqa: E501
+    "KeyeVL1_5ForConditionalGeneration": (
+        "keye_vl1_5",
+        "KeyeVL1_5ForConditionalGeneration",
+    ),  # noqa: E501
    "RForConditionalGeneration": ("rvl", "RForConditionalGeneration"),
    "KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"),  # noqa: E501
    "Llama_Nemotron_Nano_VL": ("nemotron_vl", "LlamaNemotronVLChatModel"),
    "Llama4ForConditionalGeneration": ("mllama4", "Llama4ForConditionalGeneration"),  # noqa: E501
    "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
-    "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"),  # noqa: E501
-    "LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"),  # noqa: E501
-    "LlavaOnevisionForConditionalGeneration": ("llava_onevision", "LlavaOnevisionForConditionalGeneration"),  # noqa: E501
+    "LlavaNextForConditionalGeneration": (
+        "llava_next",
+        "LlavaNextForConditionalGeneration",
+    ),  # noqa: E501
+    "LlavaNextVideoForConditionalGeneration": (
+        "llava_next_video",
+        "LlavaNextVideoForConditionalGeneration",
+    ),  # noqa: E501
+    "LlavaOnevisionForConditionalGeneration": (
+        "llava_onevision",
+        "LlavaOnevisionForConditionalGeneration",
+    ),  # noqa: E501
    "MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"),  # noqa: E501
    "MiDashengLMModel": ("midashenglm", "MiDashengLMModel"),
-    "MiniMaxVL01ForConditionalGeneration": ("minimax_vl_01", "MiniMaxVL01ForConditionalGeneration"),  # noqa: E501
+    "MiniMaxVL01ForConditionalGeneration": (
+        "minimax_vl_01",
+        "MiniMaxVL01ForConditionalGeneration",
+    ),  # noqa: E501
    "MiniCPMO": ("minicpmo", "MiniCPMO"),
    "MiniCPMV": ("minicpmv", "MiniCPMV"),
-    "Mistral3ForConditionalGeneration": ("mistral3", "Mistral3ForConditionalGeneration"),  # noqa: E501
+    "Mistral3ForConditionalGeneration": (
+        "mistral3",
+        "Mistral3ForConditionalGeneration",
+    ),  # noqa: E501
    "MolmoForCausalLM": ("molmo", "MolmoForCausalLM"),
    "NVLM_D": ("nvlm_d", "NVLM_D_Model"),
    "Ovis": ("ovis", "Ovis"),
    "Ovis2_5": ("ovis2_5", "Ovis2_5"),
-    "PaliGemmaForConditionalGeneration": ("paligemma", "PaliGemmaForConditionalGeneration"),  # noqa: E501
+    "PaliGemmaForConditionalGeneration": (
+        "paligemma",
+        "PaliGemmaForConditionalGeneration",
+    ),  # noqa: E501
    "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
    "Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
    "Phi4MultimodalForCausalLM": ("phi4_multimodal", "Phi4MultimodalForCausalLM"),  # noqa: E501
    "PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"),  # noqa: E501
    "QwenVLForConditionalGeneration": ("qwen_vl", "QwenVLForConditionalGeneration"),  # noqa: E501
    "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),  # noqa: E501
-    "Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"),  # noqa: E501
-    "Qwen2AudioForConditionalGeneration": ("qwen2_audio", "Qwen2AudioForConditionalGeneration"),  # noqa: E501
-    "Qwen2_5OmniModel": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"),  # noqa: E501
-    "Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"),  # noqa: E501
+    "Qwen2_5_VLForConditionalGeneration": (
+        "qwen2_5_vl",
+        "Qwen2_5_VLForConditionalGeneration",
+    ),  # noqa: E501
+    "Qwen2AudioForConditionalGeneration": (
+        "qwen2_audio",
+        "Qwen2AudioForConditionalGeneration",
+    ),  # noqa: E501
+    "Qwen2_5OmniModel": (
+        "qwen2_5_omni_thinker",
+        "Qwen2_5OmniThinkerForConditionalGeneration",
+    ),  # noqa: E501
+    "Qwen2_5OmniForConditionalGeneration": (
+        "qwen2_5_omni_thinker",
+        "Qwen2_5OmniThinkerForConditionalGeneration",
+    ),  # noqa: E501
    "Qwen3VLForConditionalGeneration": ("qwen3_vl", "Qwen3VLForConditionalGeneration"),  # noqa: E501
-    "Qwen3VLMoeForConditionalGeneration": ("qwen3_vl_moe", "Qwen3VLMoeForConditionalGeneration"),  # noqa: E501
+    "Qwen3VLMoeForConditionalGeneration": (
+        "qwen3_vl_moe",
+        "Qwen3VLMoeForConditionalGeneration",
+    ),  # noqa: E501
    "SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"),
    "Step3VLForConditionalGeneration": ("step3_vl", "Step3VLForConditionalGeneration"),  # noqa: E501
    "TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"),  # noqa: E501
-    "Tarsier2ForConditionalGeneration": ("qwen2_vl", "Tarsier2ForConditionalGeneration"),  # noqa: E501
+    "Tarsier2ForConditionalGeneration": (
+        "qwen2_vl",
+        "Tarsier2ForConditionalGeneration",
+    ),  # noqa: E501
    "UltravoxModel": ("ultravox", "UltravoxModel"),
    "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"),  # noqa: E501
    # [Encoder-decoder]
@@ -324,13 +398,27 @@ _TRANSFORMERS_BACKEND_MODELS = {
    "TransformersForCausalLM": ("transformers", "TransformersForCausalLM"),
    "TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"),  # noqa: E501
    "TransformersMoEForCausalLM": ("transformers_moe", "TransformersMoEForCausalLM"),  # noqa: E501
-    "TransformersMoEForMultimodalLM": ("transformers_moe", "TransformersMoEForMultimodalLM"),  # noqa: E501
-    "TransformersEmbeddingModel": ("transformers_pooling", "TransformersEmbeddingModel"),  # noqa: E501
-    "TransformersForSequenceClassification": ("transformers_pooling", "TransformersForSequenceClassification"),  # noqa: E501
-    "TransformersMoEForSequenceClassification": ("transformers_pooling", "TransformersMoEForSequenceClassification"),  # noqa: E501
-    "TransformersMoEEmbeddingModel": ("transformers_pooling", "TransformersMoEEmbeddingModel"),  # noqa: E501
+    "TransformersMoEForMultimodalLM": (
+        "transformers_moe",
+        "TransformersMoEForMultimodalLM",
+    ),  # noqa: E501
+    "TransformersEmbeddingModel": (
+        "transformers_pooling",
+        "TransformersEmbeddingModel",
+    ),  # noqa: E501
+    "TransformersForSequenceClassification": (
+        "transformers_pooling",
+        "TransformersForSequenceClassification",
+    ),  # noqa: E501
+    "TransformersMoEForSequenceClassification": (
+        "transformers_pooling",
+        "TransformersMoEForSequenceClassification",
+    ),  # noqa: E501
+    "TransformersMoEEmbeddingModel": (
+        "transformers_pooling",
+        "TransformersMoEEmbeddingModel",
+    ),  # noqa: E501
 }
-# yapf: enable

 _VLLM_MODELS = {
    **_TEXT_GENERATION_MODELS,

--- a/vllm/model_executor/models/smolvlm.py
+++ b/vllm/model_executor/models/smolvlm.py
@@ -8,13 +8,10 @@ from transformers import SmolVLMProcessor
 from vllm.config import VllmConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY

-# yapf: disable
 from .idefics3 import Idefics3DummyInputsBuilder as SmolVLMDummyInputsBuilder
 from .idefics3 import Idefics3ForConditionalGeneration, Idefics3ProcessingInfo
 from .idefics3 import Idefics3MultiModalProcessor as SmolVLMMultiModalProcessor

-# yapf: enable
-

 class SmolVLMProcessingInfo(Idefics3ProcessingInfo):
    def get_hf_processor(self, **kwargs: object) -> SmolVLMProcessor:

--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@@ -32,11 +32,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models import SupportsPP
 from vllm.model_executor.models.module_mapping import MultiModelKeys
-
-# yapf: disable
 from vllm.model_executor.models.whisper import WhisperEncoder
-
-# yapf: enable
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
    MultiModalDataDict,

--- a/vllm/transformers_utils/chat_templates/registry.py
+++ b/vllm/transformers_utils/chat_templates/registry.py
@@ -28,7 +28,6 @@ def _get_minicpmv_chat_template_fallback(tokenizer_name_or_path: str) -> Optiona
    return CHAT_TEMPLATES_DIR / "template_chatml.jinja"


-# yapf: disable
 _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
    "blip-2": CHAT_TEMPLATES_DIR / "template_blip2.jinja",
    "clip": CHAT_TEMPLATES_DIR / "template_basic.jinja",
@@ -39,7 +38,6 @@ _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
    "paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja",
    "qwen": _get_qwen_chat_template_fallback,
 }
-# yapf: enable


 def register_chat_template_fallback_path(

--- a/vllm/transformers_utils/configs/arctic.py
+++ b/vllm/transformers_utils/configs/arctic.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-# yapf: disable
 # ruff: noqa: E501
 # coding=utf-8
 # Copied from
 # https://huggingface.co/Snowflake/snowflake-arctic-instruct/blob/main/configuration_arctic.py
-""" Arctic model configuration"""
+"""Arctic model configuration"""

 from dataclasses import asdict, dataclass
 from typing import Any

--- a/vllm/transformers_utils/configs/nemotron_vl.py
+++ b/vllm/transformers_utils/configs/nemotron_vl.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-# yapf: disable
 # ruff: noqa: E501
 # Adapted from
 # https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/configuration.py
@@ -16,7 +15,7 @@ from transformers.dynamic_module_utils import get_class_from_dynamic_module


 class Nemotron_Nano_VL_Config(PretrainedConfig):
-    model_type = 'Llama_Nemotron_Nano_VL'
+    model_type = "Llama_Nemotron_Nano_VL"
    is_composition = True

    def __init__(
@@ -26,17 +25,22 @@ class Nemotron_Nano_VL_Config(PretrainedConfig):
        force_image_size=None,
        downsample_ratio=0.5,
        template=None,
-        ps_version='v1',
+        ps_version="v1",
        image_tag_type="internvl",
        projector_hidden_size=4096,
        vit_hidden_size=1280,
-        **kwargs
+        **kwargs,
    ):
        super().__init__(**kwargs)

        if vision_config is not None:
-            assert "auto_map" in vision_config and "AutoConfig" in vision_config["auto_map"]
-            vision_auto_config = get_class_from_dynamic_module(*vision_config["auto_map"]["AutoConfig"].split("--")[::-1])
+            assert (
+                "auto_map" in vision_config
+                and "AutoConfig" in vision_config["auto_map"]
+            )
+            vision_auto_config = get_class_from_dynamic_module(
+                *vision_config["auto_map"]["AutoConfig"].split("--")[::-1]
+            )
            self.vision_config = vision_auto_config(**vision_config)
        else:
            self.vision_config = PretrainedConfig()
@@ -51,6 +55,6 @@ class Nemotron_Nano_VL_Config(PretrainedConfig):
        self.downsample_ratio = downsample_ratio
        self.template = template  # TODO move out of here and into the tokenizer
        self.ps_version = ps_version  # Pixel shuffle version
-        self.image_tag_type = image_tag_type # TODO: into the tokenizer too?
+        self.image_tag_type = image_tag_type  # TODO: into the tokenizer too?
        self.projector_hidden_size = projector_hidden_size
        self.vit_hidden_size = vit_hidden_size
--- a/vllm/transformers_utils/configs/ovis.py
+++ b/vllm/transformers_utils/configs/ovis.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-# yapf: disable
 # ruff: noqa: E501
 # adapted from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py
 # and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py
@@ -70,34 +69,37 @@ class AIMv2Config(PretrainedConfig):
 #                     Visual Tokenizer Configuration
 # ----------------------------------------------------------------------
 class BaseVisualTokenizerConfig(PretrainedConfig):
-
-    def __init__(self,
-                 vocab_size=16384,
-                 tokenize_function="softmax",
-                 tau=1.0,
-                 depths=None,
-                 drop_cls_token=False,
-                 backbone_config: Optional[Union[PretrainedConfig,
-                                                 dict]] = None,
-                 hidden_stride: int = 1,
-                 **kwargs):
+    def __init__(
+        self,
+        vocab_size=16384,
+        tokenize_function="softmax",
+        tau=1.0,
+        depths=None,
+        drop_cls_token=False,
+        backbone_config: Optional[Union[PretrainedConfig, dict]] = None,
+        hidden_stride: int = 1,
+        **kwargs,
+    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.tokenize_function = tokenize_function
        self.tau = tau
        if isinstance(depths, str):
-            depths = [int(x) for x in depths.split('|')]
+            depths = [int(x) for x in depths.split("|")]
        self.depths = depths
        self.backbone_kwargs = dict[str, Any]()
        self.drop_cls_token = drop_cls_token
        if backbone_config is not None:
-            assert isinstance(backbone_config, (PretrainedConfig, dict)), \
+            assert isinstance(backbone_config, (PretrainedConfig, dict)), (
                f"expect `backbone_config` to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type"
+            )
            if not isinstance(backbone_config, PretrainedConfig):
-                model_type = backbone_config['model_type']
+                model_type = backbone_config["model_type"]
                if model_type != "aimv2":
-                    backbone_config.pop('model_type')
-                    backbone_config = AutoConfig.for_model(model_type, **backbone_config)
+                    backbone_config.pop("model_type")
+                    backbone_config = AutoConfig.for_model(
+                        model_type, **backbone_config
+                    )
                else:
                    backbone_config = AIMv2Config(**backbone_config)
        self.backbone_config = backbone_config
@@ -113,7 +115,7 @@ class Aimv2VisualTokenizerConfig(BaseVisualTokenizerConfig):
            self.drop_cls_token = False
        if self.depths:
            assert len(self.depths) == 1
-            self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
+            self.backbone_kwargs["num_hidden_layers"] = self.depths[0]


 class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
@@ -125,7 +127,7 @@ class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
            self.drop_cls_token = False
        if self.depths:
            assert len(self.depths) == 1
-            self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
+            self.backbone_kwargs["num_hidden_layers"] = self.depths[0]


 AutoConfig.register("siglip_visual_tokenizer", SiglipVisualTokenizerConfig)
@@ -138,35 +140,39 @@ AutoConfig.register("aimv2_visual_tokenizer", Aimv2VisualTokenizerConfig)
 class OvisConfig(PretrainedConfig):
    model_type = "ovis"

-    def __init__(self,
-                 llm_config: Optional[Union[PretrainedConfig, dict]] = None,
-                 visual_tokenizer_config: Optional[Union[PretrainedConfig,
-                                                         dict]] = None,
-                 multimodal_max_length=8192,
-                 hidden_size=None,
-                 conversation_formatter_class=None,
-                 llm_attn_implementation=None,
-                 disable_tie_weight=False,
-                 **kwargs):
+    def __init__(
+        self,
+        llm_config: Optional[Union[PretrainedConfig, dict]] = None,
+        visual_tokenizer_config: Optional[Union[PretrainedConfig, dict]] = None,
+        multimodal_max_length=8192,
+        hidden_size=None,
+        conversation_formatter_class=None,
+        llm_attn_implementation=None,
+        disable_tie_weight=False,
+        **kwargs,
+    ):
        super().__init__(**kwargs)
        if llm_config is not None:
-            assert isinstance(llm_config, (PretrainedConfig, dict)), \
+            assert isinstance(llm_config, (PretrainedConfig, dict)), (
                f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type"
+            )
            if not isinstance(llm_config, PretrainedConfig):
-                model_type = llm_config['model_type']
-                llm_config.pop('model_type')
+                model_type = llm_config["model_type"]
+                llm_config.pop("model_type")
                llm_config = AutoConfig.for_model(model_type, **llm_config)

        # map llm_config to text_config
        self.text_config = llm_config
        if visual_tokenizer_config is not None:
-            assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \
+            assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), (
                f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type"
+            )
            if not isinstance(visual_tokenizer_config, PretrainedConfig):
-                model_type = visual_tokenizer_config['model_type']
-                visual_tokenizer_config.pop('model_type')
+                model_type = visual_tokenizer_config["model_type"]
+                visual_tokenizer_config.pop("model_type")
                visual_tokenizer_config = AutoConfig.for_model(
-                    model_type, **visual_tokenizer_config)
+                    model_type, **visual_tokenizer_config
+                )

        self.visual_tokenizer_config = visual_tokenizer_config
        self.multimodal_max_length = multimodal_max_length

--- a/vllm/transformers_utils/processors/deepseek_vl2.py
+++ b/vllm/transformers_utils/processors/deepseek_vl2.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-# yapf: disable
 # ruff: noqa: E501
 # coding=utf-8
 # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py
@@ -35,11 +34,12 @@ from transformers.processing_utils import ProcessorMixin


 class ImageTransform:
-
-    def __init__(self,
-                 mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
-                 std: tuple[float, float, float] = (0.5, 0.5, 0.5),
-                 normalize: bool = True):
+    def __init__(
+        self,
+        mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
+        std: tuple[float, float, float] = (0.5, 0.5, 0.5),
+        normalize: bool = True,
+    ):
        self.mean = mean
        self.std = std
        self.normalize = normalize
@@ -77,7 +77,6 @@ class DeepseekVLV2Processor(ProcessorMixin):
        ignore_id: int = -100,
        **kwargs,
    ):
-
        self.candidate_resolutions = candidate_resolutions
        self.image_size = candidate_resolutions[0][0]
        self.patch_size = patch_size
@@ -86,13 +85,15 @@ class DeepseekVLV2Processor(ProcessorMixin):
        self.normalize = normalize
        self.downsample_ratio = downsample_ratio

-        self.image_transform = ImageTransform(mean=image_mean, std=image_std, normalize=normalize)
+        self.image_transform = ImageTransform(
+            mean=image_mean, std=image_std, normalize=normalize
+        )
        self.tokenizer = tokenizer
-        self.tokenizer.padding_side = 'left'  # must set this，padding side with make a difference in batch inference
+        self.tokenizer.padding_side = "left"  # must set this，padding side with make a difference in batch inference

        # add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
        if tokenizer.pad_token is None:
-            self.tokenizer.add_special_tokens({'pad_token': pad_token})
+            self.tokenizer.add_special_tokens({"pad_token": pad_token})

        # add image token
        image_token_id = self.tokenizer.vocab.get(image_token)
@@ -104,7 +105,7 @@ class DeepseekVLV2Processor(ProcessorMixin):

        # add five special tokens for grounding-related tasks
        # <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
-        special_tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>']
+        special_tokens = ["<|ref|>", "<|/ref|>", "<|det|>", "<|/det|>", "<|grounding|>"]
        special_tokens_dict = {"additional_special_tokens": special_tokens}
        self.tokenizer.add_special_tokens(special_tokens_dict)

@@ -134,15 +135,19 @@ class DeepseekVLV2Processor(ProcessorMixin):

        for width, height in self.candidate_resolutions:
            scale = min(width / original_width, height / original_height)
-            downscaled_width, downscaled_height = int(
-                original_width * scale), int(original_height * scale)
-            effective_resolution = min(downscaled_width * downscaled_height,
-                                       original_width * original_height)
+            downscaled_width, downscaled_height = (
+                int(original_width * scale),
+                int(original_height * scale),
+            )
+            effective_resolution = min(
+                downscaled_width * downscaled_height, original_width * original_height
+            )
            wasted_resolution = (width * height) - effective_resolution

            if effective_resolution > max_effective_resolution or (
-                    effective_resolution == max_effective_resolution
-                    and wasted_resolution < min_wasted_resolution):
+                effective_resolution == max_effective_resolution
+                and wasted_resolution < min_wasted_resolution
+            ):
                max_effective_resolution = effective_resolution
                min_wasted_resolution = wasted_resolution
                best_fit = (width, height)
@@ -198,12 +203,20 @@ class DeepseekVLV2Processor(ProcessorMixin):
                - num_image_tokens (list[int]): the number of image tokens
        """

-        assert (prompt is not None and images is not None
-                ), "prompt and images must be used at the same time."
+        assert prompt is not None and images is not None, (
+            "prompt and images must be used at the same time."
+        )

        sft_format = prompt
-        tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens = self.tokenize_with_images(
-            sft_format, images, bos=True, eos=True, cropping=len(images) <= 2)
+        (
+            tokenized_str,
+            images_list,
+            images_seq_mask,
+            images_spatial_crop,
+            num_image_tokens,
+        ) = self.tokenize_with_images(
+            sft_format, images, bos=True, eos=True, cropping=len(images) <= 2
+        )
        masked_tokenized_str = []
        for token_index in tokenized_str:
            if token_index != self.image_token_id:
@@ -211,17 +224,21 @@ class DeepseekVLV2Processor(ProcessorMixin):
            else:
                masked_tokenized_str.append(self.ignore_id)

-        assert len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str), \
-            (f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
-             f"imags_seq_mask's length {len(images_seq_mask)}, are not equal")
+        assert (
+            len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str)
+        ), (
+            f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
+            f"imags_seq_mask's length {len(images_seq_mask)}, are not equal"
+        )

        input_ids = torch.LongTensor(tokenized_str)
        target_ids = torch.LongTensor(masked_tokenized_str)
        images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)

        # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
-        target_ids[(input_ids < 0) |
-                   (input_ids == self.image_token_id)] = self.ignore_id
+        target_ids[(input_ids < 0) | (input_ids == self.image_token_id)] = (
+            self.ignore_id
+        )
        input_ids[input_ids < 0] = self.pad_id

        if inference_mode:
@@ -311,30 +328,50 @@ class DeepseekVLV2Processor(ProcessorMixin):
                best_width, best_height = self.image_size, self.image_size

            """process the global view"""
-            global_view = ImageOps.pad(image, (self.image_size, self.image_size),
-                                       color=tuple(int(x * 255) for x in self.image_transform.mean))
+            global_view = ImageOps.pad(
+                image,
+                (self.image_size, self.image_size),
+                color=tuple(int(x * 255) for x in self.image_transform.mean),
+            )
            images_list.append(self.image_transform(global_view))

            """process the local views"""
-            local_view = ImageOps.pad(image, (best_width, best_height),
-                                      color=tuple(int(x * 255) for x in self.image_transform.mean))
+            local_view = ImageOps.pad(
+                image,
+                (best_width, best_height),
+                color=tuple(int(x * 255) for x in self.image_transform.mean),
+            )
            for i in range(0, best_height, self.image_size):
                for j in range(0, best_width, self.image_size):
                    images_list.append(
-                        self.image_transform(local_view.crop((j, i, j + self.image_size, i + self.image_size))))
+                        self.image_transform(
+                            local_view.crop(
+                                (j, i, j + self.image_size, i + self.image_size)
+                            )
+                        )
+                    )

            """record height / width crop num"""
-            num_width_tiles, num_height_tiles = best_width // self.image_size, best_height // self.image_size
+            num_width_tiles, num_height_tiles = (
+                best_width // self.image_size,
+                best_height // self.image_size,
+            )
            images_spatial_crop.append([num_width_tiles, num_height_tiles])

            """add image tokens"""
-            h = w = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio)
+            h = w = math.ceil(
+                (self.image_size // self.patch_size) / self.downsample_ratio
+            )
            # global views tokens h * (w + 1), 1 is for line separator
            tokenized_image = [self.image_token_id] * h * (w + 1)
            # add a separator between global and local views
            tokenized_image += [self.image_token_id]
            # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
-            tokenized_image += [self.image_token_id] * (num_height_tiles * h) * (num_width_tiles * w + 1)
+            tokenized_image += (
+                [self.image_token_id]
+                * (num_height_tiles * h)
+                * (num_width_tiles * w + 1)
+            )

            tokenized_str += tokenized_image
            images_seq_mask += [True] * len(tokenized_image)
@@ -353,10 +390,17 @@ class DeepseekVLV2Processor(ProcessorMixin):
            tokenized_str = tokenized_str + [self.eos_id]
            images_seq_mask = images_seq_mask + [False]

-        assert len(tokenized_str) == len(
-            images_seq_mask), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
+        assert len(tokenized_str) == len(images_seq_mask), (
+            f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
+        )

-        return tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens
+        return (
+            tokenized_str,
+            images_list,
+            images_seq_mask,
+            images_spatial_crop,
+            num_image_tokens,
+        )


 AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor)
--- a/vllm/transformers_utils/processors/ovis.py
+++ b/vllm/transformers_utils/processors/ovis.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-# yapf: disable
 # ruff: noqa: E501
 # coding=utf-8
 # adapted from https://github.com/AIDC-AI/Ovis/blob/35ab51a1a1e3542fa6db260a1084cefbc8f164bb/ovis/vllm/processing_ovis.py
@@ -35,23 +34,24 @@ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput

 from vllm.multimodal.image import convert_image_mode

-__all__ = ['OvisProcessor']
+__all__ = ["OvisProcessor"]
 IGNORE_ID = -100

-class OvisProcessorKwargs(ProcessingKwargs, total=False):   # type: ignore[call-arg]
+
+class OvisProcessorKwargs(ProcessingKwargs, total=False):  # type: ignore[call-arg]
    _defaults = {
        "text_kwargs": {
            "padding": False,
        },
        "images_kwargs": {
-            'max_partition':9,
-            'covering_threshold':0.9,
-            'convert_to_rgb':True,
-        'return_tensors':'pt'},
+            "max_partition": 9,
+            "covering_threshold": 0.9,
+            "convert_to_rgb": True,
+            "return_tensors": "pt",
+        },
    }


-
 class OvisProcessor(ProcessorMixin):
    r"""
    Constructs an Ovis processor which wraps an Ovis image processor and a Qwen2 tokenizer into a single processor.
@@ -97,14 +97,16 @@ class OvisProcessor(ProcessorMixin):
            "image_col_sep": -303,
            "image_row_sep": -304,
            "image_end": -305,
-            'image_pad': image_pad_token_id,
+            "image_pad": image_pad_token_id,
        }
        return extra_special_tokens

    def __call__(
        self,
        images: ImageInput = None,
-        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        text: Union[
+            TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]
+        ] = None,
        **kwargs: Unpack[OvisProcessorKwargs],
    ) -> BatchFeature:
        """
@@ -169,7 +171,6 @@ class OvisProcessor(ProcessorMixin):

        # Process text input
        if text is not None:
-
            if not isinstance(text, list):
                text = [text]

@@ -178,7 +179,10 @@ class OvisProcessor(ProcessorMixin):
            replaced_ids_list = []
            idx = 0
            for ids_tensor in tokenized_batched_text:
-                if image_token_id in ids_tensor and "image_placeholders" in image_features:
+                if (
+                    image_token_id in ids_tensor
+                    and "image_placeholders" in image_features
+                ):
                    if idx < len(image_features["image_placeholders"]):
                        # Converts in list for ease of use
                        ids_list = ids_tensor.tolist()
@@ -188,7 +192,9 @@ class OvisProcessor(ProcessorMixin):
                        # replace placeholders
                        for i, token_id in enumerate(ids_list):
                            if token_id == image_token_id:
-                                placeholder_ids = image_features["image_placeholders"][idx]
+                                placeholder_ids = image_features["image_placeholders"][
+                                    idx
+                                ]
                                new_ids.extend(placeholder_ids)
                                idx += 1
                            else:
@@ -198,7 +204,8 @@ class OvisProcessor(ProcessorMixin):
                        ids_tensor = torch.tensor(new_ids, dtype=torch.long)
                    else:
                        raise RuntimeError(
-                            'Mismatch between the images you provided and the number of placeholder present in the text')
+                            "Mismatch between the images you provided and the number of placeholder present in the text"
+                        )

                replaced_ids_list.append(ids_tensor)

@@ -217,7 +224,7 @@ class OvisProcessor(ProcessorMixin):
            # Add image features if present
            if image_features:
                output["pixel_values"] = processed_images
-                output['grids'] = grids
+                output["grids"] = grids

            return output

@@ -227,8 +234,10 @@ class OvisProcessor(ProcessorMixin):
    def _tokenize_with_image_symbol(self, text_list: list[str]) -> torch.LongTensor:
        batch_token_ids = []
        for text in text_list:
-            text_chunks = [self.tokenizer(chunk, add_special_tokens=False).input_ids for chunk in
-                           text.split(self.image_token)]
+            text_chunks = [
+                self.tokenizer(chunk, add_special_tokens=False).input_ids
+                for chunk in text.split(self.image_token)
+            ]
            token_ids = []
            num_chuck = len(text_chunks)
            for i, chunk in enumerate(text_chunks):
@@ -240,50 +249,60 @@ class OvisProcessor(ProcessorMixin):

    def get_image_size(self):
        size = self.image_processor.size
-        if 'shortest_edge' in size:
-            width = height = size['shortest_edge']
+        if "shortest_edge" in size:
+            width = height = size["shortest_edge"]
        elif "height" in size and "width" in size:
-            width = size['width']
-            height = size['height']
+            width = size["width"]
+            height = size["height"]
        else:
-            raise ValueError( "Can't parse image size from image_processor config.")
+            raise ValueError("Can't parse image size from image_processor config.")
        return height, width

    def get_token_value(self, tok):
        return self.extra_special_tokens[tok]

    def construct_image_indicators(self, grid):
-        image_placeholders = [self.get_token_value('image_start'),
-                              self.get_token_value('image_atom'),
-                              self.get_token_value('image_prefix')]
+        image_placeholders = [
+            self.get_token_value("image_start"),
+            self.get_token_value("image_atom"),
+            self.get_token_value("image_prefix"),
+        ]
        if grid[0] * grid[1] > 1:
            for r in range(grid[0]):
                for c in range(grid[1]):
-                    image_placeholders.append(self.get_token_value('image_atom') )
+                    image_placeholders.append(self.get_token_value("image_atom"))
                    if c < grid[1] - 1:
-                        image_placeholders.append(self.get_token_value('image_col_sep'))
+                        image_placeholders.append(self.get_token_value("image_col_sep"))
                if r < grid[0] - 1:
-                    image_placeholders.append(self.get_token_value('image_row_sep'))
-        image_placeholders.append(self.get_token_value('image_end'))
+                    image_placeholders.append(self.get_token_value("image_row_sep"))
+        image_placeholders.append(self.get_token_value("image_end"))
        return image_placeholders

    def construct_image_placeholders(self, grid):
-
        image_placeholders = self.construct_image_indicators(grid)

-        image_atom_token_id = self.get_token_value('image_atom')
+        image_atom_token_id = self.get_token_value("image_atom")
        # Extract the padding token ID from tokenizer
-        image_padding_token_id = self.get_token_value('image_pad')
+        image_padding_token_id = self.get_token_value("image_pad")

        # Create a new list with padding tokens inserted
        padded_placeholder_tokens = []
        for token in image_placeholders:
            padded_placeholder_tokens.append(image_padding_token_id)
            if token == image_atom_token_id:
-                padded_placeholder_tokens.extend([image_padding_token_id] * self.image_segment_len)
+                padded_placeholder_tokens.extend(
+                    [image_padding_token_id] * self.image_segment_len
+                )
        return padded_placeholder_tokens

-    def preprocess_image(self, image: PIL.Image.Image, max_partition, covering_threshold, convert_to_rgb, return_tensors):
+    def preprocess_image(
+        self,
+        image: PIL.Image.Image,
+        max_partition,
+        covering_threshold,
+        convert_to_rgb,
+        return_tensors,
+    ):
        def _preprocess(img: PIL.Image.Image, side):
            # first resize and preprocess
            w, h = img.size
@@ -296,19 +315,27 @@ class OvisProcessor(ProcessorMixin):
                new_height = side
                new_width = int(w / h * new_height)
            new_size = dict(height=new_height, width=new_width)
-            pixel_values = self.image_processor.preprocess(img, size=new_size, return_tensors=return_tensors)['pixel_values']
+            pixel_values = self.image_processor.preprocess(
+                img, size=new_size, return_tensors=return_tensors
+            )["pixel_values"]

            # then pad to square
-            square_values = torch.zeros([1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device)
+            square_values = torch.zeros(
+                [1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device
+            )
            new_height, new_width = pixel_values.shape[2:]
            if new_height == new_width:
                square_values[:, :, :, :] = pixel_values
            elif new_height > new_width:
                from_index = (side - new_width) // 2
-                square_values[:, :, :, from_index:from_index + new_width] = pixel_values
+                square_values[:, :, :, from_index : from_index + new_width] = (
+                    pixel_values
+                )
            else:
                from_index = (side - new_height) // 2
-                square_values[:, :, from_index:from_index + new_height, :] = pixel_values
+                square_values[:, :, from_index : from_index + new_height, :] = (
+                    pixel_values
+                )

            return square_values

@@ -350,7 +377,9 @@ class OvisProcessor(ProcessorMixin):
            good_grids = []
            for grid in candidate_grids:
                partition = _partition(img, grid)
-                covering_ratio = sum([_covering_area(*p, side) for p in partition]) / img_area
+                covering_ratio = (
+                    sum([_covering_area(*p, side) for p in partition]) / img_area
+                )
                assert covering_ratio <= 1.0
                all_grids.append((grid, covering_ratio))
                if covering_ratio > covering_threshold:
@@ -358,18 +387,19 @@ class OvisProcessor(ProcessorMixin):

            if len(good_grids) > 0:
                # pick the good partition with minimum #sub_images and break the tie using covering_ratio
-                return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][0]
+                return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][
+                    0
+                ]
            else:
                # pick the partition with maximum covering_ratio and break the tie using #sub_images
                return sorted(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0][0]

        if convert_to_rgb:
-            image = convert_image_mode(image, 'RGB')
-
+            image = convert_image_mode(image, "RGB")

        sides = self.get_image_size()
        if sides[0] != sides[1]:
-            raise ValueError('get_image_size() returns non-square size')
+            raise ValueError("get_image_size() returns non-square size")
        side = sides[0]
        grid = _get_best_grid(image, side)
        partition = _partition(image, grid)
@@ -405,14 +435,18 @@ class OvisProcessor(ProcessorMixin):
            `list[str]`: The decoded text.
        """
        return self.tokenizer.batch_decode(
-            generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
+            generated_outputs,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
        )

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
-        names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+        names_from_processor = list(
+            dict.fromkeys(tokenizer_input_names + image_processor_input_names)
+        )
        return names_from_processor + ["second_per_grid_ts"]



--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -40,9 +40,6 @@ from vllm.utils.flashinfer import (
    supports_trtllm_attention,
    use_trtllm_attention,
 )
-
-# yapf conflicts with isort for this block
-# yapf: disable
 from vllm.v1.attention.backends.utils import (
    AttentionCGSupport,
    AttentionMetadataBuilder,
@@ -52,8 +49,6 @@ from vllm.v1.attention.backends.utils import (
    infer_global_hyperparameters,
    split_decodes_and_prefills,
 )
-
-# yapf: enable
 from vllm.v1.kv_cache_interface import AttentionSpec

 FLASHINFER_WORKSPACE_BUFFER_SIZE = 256 * 1024 * 1024

--- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
+++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
@@ -11,9 +11,6 @@ from vllm.attention.backends.abstract import AttentionLayer
 from vllm.attention.ops.rocm_aiter_mla import aiter_mla_decode_fwd
 from vllm.config import VllmConfig
 from vllm.utils import cdiv
-
-# yapf conflicts with isort for this docstring
-# yapf: disable
 from vllm.v1.attention.backends.mla.common import (
    MLACommonBackend,
    MLACommonDecodeMetadata,
@@ -24,8 +21,6 @@ from vllm.v1.attention.backends.mla.common import (
 from vllm.v1.attention.backends.utils import AttentionCGSupport
 from vllm.v1.kv_cache_interface import AttentionSpec

-# yapf: enable
-

 def is_aiter_mla_enabled() -> bool:
    return envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MLA

--- a/vllm/v1/serial_utils.py
+++ b/vllm/v1/serial_utils.py
@@ -18,8 +18,6 @@ from msgspec import msgpack

 from vllm import envs
 from vllm.logger import init_logger
-
-# yapf: disable
 from vllm.multimodal.inputs import (
    BaseMultiModalField,
    MultiModalBatchedField,
@@ -32,8 +30,6 @@ from vllm.multimodal.inputs import (
    MultiModalSharedField,
    NestedTensors,
 )
-
-# yapf: enable
 from vllm.v1.engine import UtilityResult

 logger = init_logger(__name__)

--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -48,9 +48,6 @@ from vllm.model_executor.layers.mamba.abstract import MambaBase
 from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader
 from vllm.model_executor.models.deepseek_v2 import DeepseekV32IndexerCache
-
-# yapf conflicts with isort for this block
-# yapf: disable
 from vllm.model_executor.models.interfaces import (
    SupportsMultiModal,
    is_mixture_of_experts,
@@ -59,8 +56,6 @@ from vllm.model_executor.models.interfaces import (
    supports_multimodal_pruning,
    supports_transcription,
 )
-
-# yapf: enable
 from vllm.model_executor.models.interfaces_base import (
    VllmModelForPooling,
    is_pooling_model,
@@ -101,9 +96,6 @@ from vllm.v1.attention.backends.utils import (
    split_attn_metadata,
 )
 from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
-
-# yapf conflicts with isort for this block
-# yapf: disable
 from vllm.v1.kv_cache_interface import (
    AttentionSpec,
    ChunkedLocalAttentionSpec,
@@ -118,8 +110,6 @@ from vllm.v1.kv_cache_interface import (
    SlidingWindowSpec,
    UniformTypeKVCacheSpecs,
 )
-
-# yapf: enable
 from vllm.v1.outputs import (
    EMPTY_MODEL_RUNNER_OUTPUT,
    AsyncModelRunnerOutput,