Fix per file ruff ignores related to line length (#26262)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

Fix per file ruff ignores related to line length (#26262)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
6c046382 · Harry Mellor · GitHub · 91ac7f76 · 6c046382 · 6c046382
Unverified Commit 6c046382 authored Oct 06, 2025 by Harry Mellor Committed by GitHub Oct 06, 2025
20 changed files
--- a/tests/tool_use/test_tool_choice_required.py
+++ b/tests/tool_use/test_tool_choice_required.py
@@ -47,7 +47,8 @@ EXAMPLE_TOOLS = [
                "properties": {
                    "city": {
                        "type": "string",
-                        "description": "The city to get the forecast for, e.g. 'New York'",
+                        "description": "The city to get the forecast for, e.g. "
+                        "'New York'",
                    },
                    "days": {
                        "type": "integer",

--- a/tests/v1/attention/utils.py
+++ b/tests/v1/attention/utils.py
@@ -134,15 +134,15 @@ def get_attention_backend(backend_name: _Backend):
            else "vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend"
        ),
        _Backend.FLASHINFER: "vllm.v1.attention.backends.flashinfer.FlashInferBackend",
-        _Backend.FLEX_ATTENTION: "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend",
-        _Backend.TRITON_ATTN: "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend",
+        _Backend.FLEX_ATTENTION: "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend",  # noqa: E501
+        _Backend.TRITON_ATTN: "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend",  # noqa: E501
        _Backend.TREE_ATTN: "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend",
-        _Backend.XFORMERS: "vllm.v1.attention.backends.xformers.XFormersAttentionBackend",
-        _Backend.CUTLASS_MLA: "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend",
+        _Backend.XFORMERS: "vllm.v1.attention.backends.xformers.XFormersAttentionBackend",  # noqa: E501
+        _Backend.CUTLASS_MLA: "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend",  # noqa: E501
        _Backend.FLASHMLA: "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend",
-        _Backend.FLASH_ATTN_MLA: "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend",
-        _Backend.FLASHINFER_MLA: "vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend",
-        _Backend.TRITON_MLA: "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend",
+        _Backend.FLASH_ATTN_MLA: "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend",  # noqa: E501
+        _Backend.FLASHINFER_MLA: "vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend",  # noqa: E501
+        _Backend.TRITON_MLA: "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend",  # noqa: E501
    }

    if backend_name not in backend_map:

--- a/tests/v1/entrypoints/openai/responses/test_image.py
+++ b/tests/v1/entrypoints/openai/responses/test_image.py
@@ -104,7 +104,7 @@ async def test_single_chat_session_image_base64encoded(
            "content": [
                {
                    "type": "input_image",
-                    "image_url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",
+                    "image_url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",  # noqa: E501
                    "detail": "auto",
                },
                {"type": "input_text", "text": content_text},

--- a/tests/v1/kv_connector/nixl_integration/test_accuracy.py
+++ b/tests/v1/kv_connector/nixl_integration/test_accuracy.py
@@ -15,8 +15,9 @@ RTOL = 0.03
 EXPECTED_VALUES = {"Qwen/Qwen3-0.6B": 0.41, "deepseek-ai/deepseek-vl2-small": 0.59}

 SIMPLE_PROMPT = (
-    "The best part about working on vLLM is that I got to meet so many people across various different organizations like UCB, Google, and Meta which means",
-)  # noqa: E501
+    "The best part about working on vLLM is that I got to meet so many people across "
+    "various different organizations like UCB, Google, and Meta which means",
+)

 # Get model name from environment variable
 MODEL_NAME = os.environ.get("TEST_MODEL", "Qwen/Qwen3-0.6B")

--- a/tests/v1/kv_connector/unit/test_offloading_connector.py
+++ b/tests/v1/kv_connector/unit/test_offloading_connector.py
@@ -127,7 +127,7 @@ class RequestRunner:
            kv_role="kv_both",
            kv_connector_extra_config={
                "spec_name": "MockOffloadingSpec",
-                "spec_module_path": "tests.v1.kv_connector.unit.test_offloading_connector",
+                "spec_module_path": "tests.v1.kv_connector.unit.test_offloading_connector",  # noqa: E501
                "block_size": offloaded_block_size,
            },
        )

--- a/tests/v1/logits_processors/test_custom_offline.py
+++ b/tests/v1/logits_processors/test_custom_offline.py
@@ -260,15 +260,8 @@ def test_pooling_rejects_custom_logitsprocs(
            gpu_memory_utilization=0.1,
        )
        # Require that no logitsprocs have been loaded
-        assert (
-            sum(
-                [
-                    1
-                    for _ in llm.llm_engine.model_executor.driver_worker.worker.model_runner.input_batch.logitsprocs.all
-                ]
-            )
-            == 0
-        )
+        worker = llm.llm_engine.model_executor.driver_worker.worker
+        assert sum([1 for _ in worker.model_runner.input_batch.logitsprocs.all]) == 0
        return

    kwargs: dict[str, list[Union[str, type[LogitsProcessor]]]] = {}

--- a/vllm/attention/ops/pallas_kv_cache_update.py
+++ b/vllm/attention/ops/pallas_kv_cache_update.py
@@ -76,10 +76,14 @@ def _kv_cache_update_kernel(
    static_argnames=["page_size", "num_slices_per_block"],
 )
 def kv_cache_update(
-    new_kv: jax.Array,  # [total_num_token, num_combined_kv_heads, head_dim]
-    slices: jax.Array,  # [3, slices], list of (kv_cache_start, new_kv_start, slice_len)
-    kv_cache: jax.Array,  # [total_num_pages * page_size, num_combined_kv_heads, head_dim]
-    num_kv_update_slices: jax.Array,  # [1]
+    # [total_num_token, num_combined_kv_heads, head_dim]
+    new_kv: jax.Array,
+    # [3, slices], list of (kv_cache_start, new_kv_start, slice_len)
+    slices: jax.Array,
+    # [total_num_pages * page_size, num_combined_kv_heads, head_dim]
+    kv_cache: jax.Array,
+    # [1]
+    num_kv_update_slices: jax.Array,
    *,
    page_size: int = 32,
    num_slices_per_block: int = 8,

--- a/vllm/compilation/collective_fusion.py
+++ b/vllm/compilation/collective_fusion.py
@@ -834,7 +834,10 @@ class AllReduceFusedRMSNormStaticQuantFP8Pattern(BasePattern):
                scale_out=None,
                rms_gamma=weight,
                rms_eps=self.epsilon,
-                pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant,  # we don't use norm_out afterwards
+                # We don't use norm_out afterwards
+                pattern_code=(
+                    flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant
+                ),
                scale_factor=scale,
                **self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
            )
@@ -928,11 +931,14 @@ class AllReduceFusedAddRMSNormStaticQuantFP8Pattern(BasePattern):
                scale_out=None,
                rms_gamma=weight,
                rms_eps=self.epsilon,
-                pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant,  # we don't use norm_out afterwards
+                # We don't use norm_out afterwards
+                pattern_code=(
+                    flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant
+                ),
                scale_factor=scale,
                **self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
            )
-            # # quant_out, rms_norm_residual
+            # quant_out, rms_norm_residual
            return allreduce[4], allreduce[2]

        pm.register_replacement(
@@ -1028,7 +1034,10 @@ class AllReduceFusedRMSNormStaticQuantNVFP4Pattern(BasePattern):
                scale_out=output_scale,
                rms_gamma=weight,
                rms_eps=self.epsilon,
-                pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant,  # we don't use norm_out afterwards
+                # We don't use norm_out afterwards
+                pattern_code=(
+                    flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant
+                ),
                scale_factor=input_global_scale,
                **self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
            )
@@ -1130,7 +1139,10 @@ class AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern(BasePattern):
                scale_out=output_scale,
                rms_gamma=weight,
                rms_eps=self.epsilon,
-                pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant,  # we don't use norm_out afterwards
+                # We don't use norm_out afterwards
+                pattern_code=(
+                    flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant
+                ),
                scale_factor=input_global_scale,
                **self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
            )

--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -119,9 +119,12 @@ class TorchCompileWrapperWithCustomDispatcher:

            src = depyf.decompile(new_code)
            msg = (
-                "Assigning / modifying buffers of nn.Module during forward pass is not allowed when using cudagraph inside the compiler because it will cause silent errors. Please use eager mode or fix the code. The following code contains clues about which buffer is being modified (please search for the usage of the function `update`):\n"
-                + src
-            )  # noqa
+                "Assigning / modifying buffers of nn.Module during forward pass is not "
+                "allowed when using cudagraph inside the compiler because it will "
+                "cause silent errors. Please use eager mode or fix the code. The "
+                "following code contains clues about which buffer is being modified "
+                f"(please search for the usage of the function `update`):\n{src}"
+            )
            raise RuntimeError(msg)

    @contextmanager
@@ -132,8 +135,9 @@ class TorchCompileWrapperWithCustomDispatcher:
        variables as the original code. Therefore we can directly switch
        the code object in the function and call it.

-        See https://dev-discuss.pytorch.org/t/what-is-the-relationship-requirement-among-original-bytecode-transformed-bytecode-and-bytecode-returned-by-hooks-in-dynamo/1693/7 for more details.
-        """  # noqa
+        See https://dev-discuss.pytorch.org/t/what-is-the-relationship-requirement-among-original-bytecode-transformed-bytecode-and-bytecode-returned-by-hooks-in-dynamo/1693/7
+        for more details.
+        """
        self.__class__.forward.__code__ = self.compiled_codes[index]
        yield
        self.__class__.forward.__code__ = self.original_code_object
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -472,7 +472,7 @@ class VllmConfig:
                self.compilation_config.cudagraph_mode.has_full_cudagraphs()
                and self.model_config is not None
                and not self.model_config.disable_cascade_attn
-                and not self.compilation_config.cudagraph_mode.has_piecewise_cudagraphs()
+                and not self.compilation_config.cudagraph_mode.has_piecewise_cudagraphs()  # noqa: E501
            ):
                logger.warning_once(
                    "No piecewise cudagraph for executing cascade attention."

--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -147,8 +147,9 @@ class PPLXAll2AllManager(All2AllManagerBase):

    def __init__(self, cpu_group):
        assert has_pplx(), (
-            "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install pplx_kernels."
-        )  # noqa
+            "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md"
+            " to install pplx_kernels."
+        )
        super().__init__(cpu_group)

        if self.internode:
@@ -220,7 +221,8 @@ class DeepEPAll2AllManagerBase(All2AllManagerBase):

    def __init__(self, cpu_group):
        assert has_deep_ep(), (
-            "DeepEP kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install DeepEP kernels."
+            "DeepEP kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md"
+            " to install DeepEP kernels."
        )  # noqa
        super().__init__(cpu_group)
        self.handle_cache = Cache()

--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -471,7 +471,8 @@ class ChatCompletionRequest(OpenAIBaseModel):
    top_logprobs: Optional[int] = 0
    max_tokens: Optional[int] = Field(
        default=None,
-        deprecated="max_tokens is deprecated in favor of the max_completion_tokens field",
+        deprecated="max_tokens is deprecated in favor of "
+        "the max_completion_tokens field",
    )
    max_completion_tokens: Optional[int] = None
    n: Optional[int] = 1

--- a/vllm/lora/layers/vocal_parallel_embedding.py
+++ b/vllm/lora/layers/vocal_parallel_embedding.py
@@ -31,7 +31,7 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
        if self.base_layer.num_added_embeddings_per_partition > 0:
            # We can start adding lora weights
            self.embeddings_weights = self.base_layer.weight.data[
-                self.base_layer.num_org_embeddings_per_partition : self.base_layer.num_org_embeddings_per_partition
+                self.base_layer.num_org_embeddings_per_partition : self.base_layer.num_org_embeddings_per_partition  # noqa: E501
                + self.base_layer.num_added_embeddings_per_partition
            ]
            self.embeddings_slice = (

--- a/vllm/model_executor/layers/quantization/ptpc_fp8.py
+++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py
@@ -107,8 +107,8 @@ class PTPCFp8LinearMethod(Fp8LinearMethod):
        layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False)

        assert layer.weight.data.dtype == torch.bfloat16, (
-            f"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16. {str(layer.weight.data.dtype)} is specified."
-        )  # noqa: E501
+            f"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16. {str(layer.weight.data.dtype)} is specified."  # noqa: E501
+        )
        # Quantize the weights.
        qweight, weight_scale = ops.scaled_fp8_quant(
            layer.weight, scale=None, use_per_token_if_dynamic=True

--- a/vllm/model_executor/model_loader/bitsandbytes_loader.py
+++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py
@@ -391,7 +391,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                    total_shard_sizes = next(
                        (
                            sizes
-                            for module, sizes in self.maybe_fused_weights_modules.items()
+                            for module, sizes in self.maybe_fused_weights_modules.items()  # noqa: E501
                            if check_match(mapped_weight_name, module)
                        )
                    )

--- a/vllm/model_executor/models/bailing_moe.py
+++ b/vllm/model_executor/models/bailing_moe.py
@@ -270,8 +270,8 @@ class BailingMoE(nn.Module):
            ) or (
                self.score_function == "sigmoid" and self.correction_bias is not None
            ), (
-                "score_function and correction_bias should be in 2 combination (softmax, None) or (sigmoid, not None)"
-            )  # noqa: E501
+                "score_function and correction_bias should be in 2 combination (softmax, None) or (sigmoid, not None)"  # noqa: E501
+            )
        else:
            # default value for scoring_func
            self.score_function = "softmax"

--- a/vllm/model_executor/models/hyperclovax_vision.py
+++ b/vllm/model_executor/models/hyperclovax_vision.py
@@ -825,10 +825,10 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
        # Run MM-Projector
        # len(num_grids) == len(num_queries_vis_abstractors) + 1
        grid_idx = 0
-        num_grids = [
-            grid_idx
-        ]  # e.g. [0, 9, 18, 19, 27, 28, 36, 37, 45, 46, 54, 55, 56]
-        num_queries_vis_abstractors = []  # e.g. [81, 81, 81, 9, 81, 9, 81, 9, 81, 9, 81, 9]
+        # e.g. [0, 9, 18, 19, 27, 28, 36, 37, 45, 46, 54, 55, 56]
+        num_grids = [grid_idx]
+        # e.g. [81, 81, 81, 9, 81, 9, 81, 9, 81, 9, 81, 9]
+        num_queries_vis_abstractors = []
        len_total_frames = video_forward_outs.shape[0]

        if self.config.first_last_frames_slow:

--- a/vllm/model_executor/models/llama4_eagle.py
+++ b/vllm/model_executor/models/llama4_eagle.py
@@ -154,9 +154,10 @@ class LlamaModel(nn.Module):
                    str(layer_index), str(layer_index + start_layer_id)
                )

-            quant_config.torchao_config.module_fqn_to_config = {
+            torchao_config = quant_config.torchao_config
+            torchao_config.module_fqn_to_config = {
                pad_layer_name(layer): quantization
-                for layer, quantization in quant_config.torchao_config.module_fqn_to_config.items()
+                for layer, quantization in torchao_config.module_fqn_to_config.items()
            }



--- a/vllm/model_executor/models/longcat_flash_mtp.py
+++ b/vllm/model_executor/models/longcat_flash_mtp.py
@@ -186,26 +186,26 @@ class LongCatFlashMTP(nn.Module, SupportsPP):
            "model.mtp.layers.0.eh_proj.weight_scale_inv": "eh_proj.weight_scale_inv",
            "model.mtp.layers.0.enorm.m.weight": "enorm.weight",
            "model.mtp.layers.0.hnorm.m.weight": "hnorm.weight",
-            "model.mtp.layers.0.input_layernorm.weight": "model.layers.0.input_layernorm.weight",
-            "model.mtp.layers.0.post_attention_layernorm.weight": "model.layers.0.post_attention_layernorm.weight",
-            "model.mtp.layers.0.self_attn.kv_a_layernorm.weight": "model.layers.0.self_attn.kv_a_layernorm.weight",
-            "model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight": "model.layers.0.self_attn.kv_a_proj_with_mqa.weight",
-            "model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv": "model.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv",
-            "model.mtp.layers.0.self_attn.kv_b_proj.weight": "model.layers.0.self_attn.kv_b_proj.weight",
-            "model.mtp.layers.0.self_attn.kv_b_proj.weight_scale_inv": "model.layers.0.self_attn.kv_b_proj.weight_scale_inv",
-            "model.mtp.layers.0.self_attn.o_proj.weight": "model.layers.0.self_attn.o_proj.weight",
-            "model.mtp.layers.0.self_attn.o_proj.weight_scale_inv": "model.layers.0.self_attn.o_proj.weight_scale_inv",
-            "model.mtp.layers.0.self_attn.q_a_layernorm.weight": "model.layers.0.self_attn.q_a_layernorm.weight",
-            "model.mtp.layers.0.self_attn.q_a_proj.weight": "model.layers.0.self_attn.q_a_proj.weight",
-            "model.mtp.layers.0.self_attn.q_a_proj.weight_scale_inv": "model.layers.0.self_attn.q_a_proj.weight_scale_inv",
-            "model.mtp.layers.0.self_attn.q_b_proj.weight": "model.layers.0.self_attn.q_b_proj.weight",
-            "model.mtp.layers.0.self_attn.q_b_proj.weight_scale_inv": "model.layers.0.self_attn.q_b_proj.weight_scale_inv",
-            "model.mtp.layers.0.transformer_layer.mlp.down_proj.weight": "model.layers.0.mlp.down_proj.weight",
-            "model.mtp.layers.0.transformer_layer.mlp.down_proj.weight_scale_inv": "model.layers.0.mlp.down_proj.weight_scale_inv",
-            "model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight": "model.layers.0.mlp.gate_proj.weight",
-            "model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight_scale_inv": "model.layers.0.mlp.gate_proj.weight_scale_inv",
-            "model.mtp.layers.0.transformer_layer.mlp.up_proj.weight": "model.layers.0.mlp.up_proj.weight",
-            "model.mtp.layers.0.transformer_layer.mlp.up_proj.weight_scale_inv": "model.layers.0.mlp.up_proj.weight_scale_inv",
+            "model.mtp.layers.0.input_layernorm.weight": "model.layers.0.input_layernorm.weight",  # noqa: E501
+            "model.mtp.layers.0.post_attention_layernorm.weight": "model.layers.0.post_attention_layernorm.weight",  # noqa: E501
+            "model.mtp.layers.0.self_attn.kv_a_layernorm.weight": "model.layers.0.self_attn.kv_a_layernorm.weight",  # noqa: E501
+            "model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight": "model.layers.0.self_attn.kv_a_proj_with_mqa.weight",  # noqa: E501
+            "model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv": "model.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv",  # noqa: E501
+            "model.mtp.layers.0.self_attn.kv_b_proj.weight": "model.layers.0.self_attn.kv_b_proj.weight",  # noqa: E501
+            "model.mtp.layers.0.self_attn.kv_b_proj.weight_scale_inv": "model.layers.0.self_attn.kv_b_proj.weight_scale_inv",  # noqa: E501
+            "model.mtp.layers.0.self_attn.o_proj.weight": "model.layers.0.self_attn.o_proj.weight",  # noqa: E501
+            "model.mtp.layers.0.self_attn.o_proj.weight_scale_inv": "model.layers.0.self_attn.o_proj.weight_scale_inv",  # noqa: E501
+            "model.mtp.layers.0.self_attn.q_a_layernorm.weight": "model.layers.0.self_attn.q_a_layernorm.weight",  # noqa: E501
+            "model.mtp.layers.0.self_attn.q_a_proj.weight": "model.layers.0.self_attn.q_a_proj.weight",  # noqa: E501
+            "model.mtp.layers.0.self_attn.q_a_proj.weight_scale_inv": "model.layers.0.self_attn.q_a_proj.weight_scale_inv",  # noqa: E501
+            "model.mtp.layers.0.self_attn.q_b_proj.weight": "model.layers.0.self_attn.q_b_proj.weight",  # noqa: E501
+            "model.mtp.layers.0.self_attn.q_b_proj.weight_scale_inv": "model.layers.0.self_attn.q_b_proj.weight_scale_inv",  # noqa: E501
+            "model.mtp.layers.0.transformer_layer.mlp.down_proj.weight": "model.layers.0.mlp.down_proj.weight",  # noqa: E501
+            "model.mtp.layers.0.transformer_layer.mlp.down_proj.weight_scale_inv": "model.layers.0.mlp.down_proj.weight_scale_inv",  # noqa: E501
+            "model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight": "model.layers.0.mlp.gate_proj.weight",  # noqa: E501
+            "model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight_scale_inv": "model.layers.0.mlp.gate_proj.weight_scale_inv",  # noqa: E501
+            "model.mtp.layers.0.transformer_layer.mlp.up_proj.weight": "model.layers.0.mlp.up_proj.weight",  # noqa: E501
+            "model.mtp.layers.0.transformer_layer.mlp.up_proj.weight_scale_inv": "model.layers.0.mlp.up_proj.weight_scale_inv",  # noqa: E501
            "model.mtp.norm.weight": "final_layernorm.weight",
        }


--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -1000,8 +1000,8 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
            "base_layer.": "",
        },
        orig_to_new_prefix={
-            "model.embed_tokens_extend.audio_embed.audio_projection.vision.": "embed_tokens_extend.audio_projection_for_vision.",
-            "model.embed_tokens_extend.audio_embed.audio_projection.speech.": "embed_tokens_extend.audio_projection.",
+            "model.embed_tokens_extend.audio_embed.audio_projection.vision.": "embed_tokens_extend.audio_projection_for_vision.",  # noqa: E501
+            "model.embed_tokens_extend.audio_embed.audio_projection.speech.": "embed_tokens_extend.audio_projection.",  # noqa: E501
            "model.embed_tokens_extend.audio_embed.": "embed_tokens_extend.",
            "model.embed_tokens_extend.image_embed.": "vision_encoder.",
        },