[Mypy] Better fixes for the `mypy` issues in `vllm/config` (#37902)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

[Mypy] Better fixes for the `mypy` issues in `vllm/config` (#37902)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
d215d1ef · Harry Mellor · GitHub · 34d317dc · d215d1ef · d215d1ef
Unverified Commit d215d1ef authored Mar 25, 2026 by Harry Mellor Committed by GitHub Mar 25, 2026
15 changed files
--- a/vllm/benchmarks/throughput.py
+++ b/vllm/benchmarks/throughput.py
@@ -8,7 +8,6 @@ import os
 import random
 import time
 import warnings
-from dataclasses import fields
 from typing import Any

 import torch
@@ -53,7 +52,7 @@ def run_vllm(
 ) -> tuple[float, list[RequestOutput] | None]:
    from vllm import LLM, SamplingParams

-    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+    llm = LLM.from_engine_args(engine_args)
    assert all(
        llm.llm_engine.model_config.max_model_len
        >= (request.prompt_len + request.expected_output_len)
@@ -141,7 +140,7 @@ def run_vllm_chat(
    """
    from vllm import LLM, SamplingParams

-    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+    llm = LLM.from_engine_args(engine_args)

    assert all(
        llm.llm_engine.model_config.max_model_len

--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -116,29 +116,29 @@ class PassConfig:
    """

    # New flags
-    fuse_norm_quant: bool | None = Field(default=None)
+    fuse_norm_quant: bool = None  # type: ignore[assignment]
    """Fuse the custom RMSNorm + quant ops."""
-    fuse_act_quant: bool | None = Field(default=None)
+    fuse_act_quant: bool = None  # type: ignore[assignment]
    """Fuse the custom SiluMul + quant ops."""
-    fuse_attn_quant: bool | None = Field(default=None)
+    fuse_attn_quant: bool = None  # type: ignore[assignment]
    """Fuse the custom attention + quant ops."""
    eliminate_noops: bool = Field(default=True)
    """Eliminate no-op ops."""
-    enable_sp: bool | None = Field(default=None)
+    enable_sp: bool = None  # type: ignore[assignment]
    """Enable sequence parallelism. Requires TP>1. Automatically disabled
    if the model's hidden_size is too small for SP to be beneficial
    (threshold is device-capability dependent)."""
-    fuse_gemm_comms: bool | None = Field(default=None)
+    fuse_gemm_comms: bool = None  # type: ignore[assignment]
    """Enable async TP."""
-    fuse_allreduce_rms: bool | None = Field(default=None)
+    fuse_allreduce_rms: bool = None  # type: ignore[assignment]
    """Enable flashinfer allreduce fusion."""
    enable_qk_norm_rope_fusion: bool = False
    """Enable fused Q/K RMSNorm + RoPE pass."""

    # ROCm/AITER specific fusions
-    fuse_act_padding: bool | None = Field(default=None)
+    fuse_act_padding: bool = None  # type: ignore[assignment]
    """Fuse the custom RMSNorm + padding ops."""
-    fuse_rope_kvcache: bool | None = Field(default=None)
+    fuse_rope_kvcache: bool = None  # type: ignore[assignment]
    """Fuse the QK rope + KV cache ops."""

    rope_kvcache_fusion_max_token_num: int = 256
@@ -405,7 +405,7 @@ class CompilationConfig:
    """

    # Top-level Compilation control
-    mode: CompilationMode = Field(default=None)  # type: ignore[assignment]
+    mode: CompilationMode = None  # type: ignore[assignment]
    """The compilation approach used for torch.compile-based compilation of the
    model.

@@ -545,7 +545,7 @@ class CompilationConfig:
    constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""

    # CudaGraph compilation
-    cudagraph_mode: CUDAGraphMode = Field(default=None)  # type: ignore[assignment]
+    cudagraph_mode: CUDAGraphMode = None  # type: ignore[assignment]
    """
    The mode of the cudagraph:

@@ -586,7 +586,7 @@ class CompilationConfig:
    It means the first several runs will be treated as warmup runs.
    Only after that, the execution will be recorded, and the recorded
    cudagraph will be used for subsequent runs."""
-    cudagraph_capture_sizes: list[int] | None = None
+    cudagraph_capture_sizes: list[int] = None  # type: ignore[assignment]
    """Sizes to capture cudagraph.
    - None (default): capture sizes are inferred from vllm config.
    - list[int]: capture sizes are specified as given."""
@@ -607,7 +607,7 @@ class CompilationConfig:
    When `enable_lora` is False, this option has no effect.
    """

-    use_inductor_graph_partition: bool = Field(default=None)  # type: ignore[assignment]
+    use_inductor_graph_partition: bool = None  # type: ignore[assignment]
    """Use inductor graph partition to split the graph at cudagraph_unsafe ops.
    This partition happens at inductor codegen time after all passes and fusions
    are finished. It generates a single `call` function which wraps
@@ -630,7 +630,7 @@ class CompilationConfig:
    pass_config: PassConfig = field(default_factory=PassConfig)
    """Custom inductor passes, see PassConfig for more details"""

-    max_cudagraph_capture_size: int | None = field(default=None)
+    max_cudagraph_capture_size: int = None  # type: ignore[assignment]
    """The maximum cudagraph capture size.

    If cudagraph_capture_sizes is specified, this will be set to the largest
@@ -750,7 +750,7 @@ class CompilationConfig:
        return hash_factors(factors)

    def __repr__(self) -> str:
-        exclude = {
+        exclude: dict[str, bool | dict[str, bool]] = {
            "static_forward_context": True,
            "enabled_custom_ops": True,
            "disabled_custom_ops": True,
@@ -770,9 +770,7 @@ class CompilationConfig:
            exclude["pass_config"] = pass_config_exclude

        config = TypeAdapter(CompilationConfig).dump_python(
-            self,
-            exclude=exclude,  # type: ignore[arg-type]
-            exclude_unset=True,
+            self, exclude=exclude, exclude_unset=True
        )

        return str(config)
@@ -1023,7 +1021,6 @@ class CompilationConfig:
                        "Unrecognized size type in compile_sizes, "
                        f"expect 'cudagraph_capture_sizes', got {x}"
                    )
-                    assert self.cudagraph_capture_sizes is not None
                    computed_compile_sizes.extend(self.cudagraph_capture_sizes)
                else:
                    assert isinstance(x, int)
@@ -1031,7 +1028,6 @@ class CompilationConfig:
        self.compile_sizes = computed_compile_sizes  # type: ignore

        # make sure the sizes are in ascending order
-        assert self.cudagraph_capture_sizes is not None
        self.cudagraph_capture_sizes.sort()
        if self.cudagraph_capture_sizes:
            assert self.cudagraph_capture_sizes[-1] == self.max_cudagraph_capture_size
@@ -1123,7 +1119,6 @@ class CompilationConfig:

    def set_splitting_ops_for_attn_fusion(self):
        assert self.pass_config.fuse_attn_quant
-        assert self.cudagraph_mode is not None
        if self.splitting_ops is None:
            self.splitting_ops = []
            if self.cudagraph_mode.has_piecewise_cudagraphs():

--- a/vllm/config/device.py
+++ b/vllm/config/device.py
@@ -13,8 +13,8 @@ from vllm.utils.hashing import safe_hash
 Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"]


-@config(config=ConfigDict(arbitrary_types_allowed=True))  # type: ignore[arg-type,misc]
-class DeviceConfig:  # type: ignore[misc]
+@config(config=ConfigDict(arbitrary_types_allowed=True))
+class DeviceConfig:
    """Configuration for the device to use for vLLM execution."""

    device: SkipValidation[Device | torch.device | None] = "auto"

--- a/vllm/config/kernel.py
+++ b/vllm/config/kernel.py
@@ -4,7 +4,7 @@
 from collections.abc import Callable
 from typing import Any, Literal

-from pydantic import Field, field_validator
+from pydantic import field_validator

 from vllm.config.utils import config
 from vllm.utils.hashing import safe_hash
@@ -26,7 +26,7 @@ MoEBackend = Literal[
 class KernelConfig:
    """Configuration for kernel selection and warmup behavior."""

-    enable_flashinfer_autotune: bool | None = Field(default=None)
+    enable_flashinfer_autotune: bool = None  # type: ignore[assignment]
    """If True, run FlashInfer autotuning during kernel warmup."""

    moe_backend: MoEBackend = "auto"

--- a/vllm/config/kv_events.py
+++ b/vllm/config/kv_events.py
@@ -4,8 +4,6 @@

 from typing import Literal

-from pydantic import Field
-
 from vllm.config.utils import config


@@ -18,7 +16,7 @@ class KVEventsConfig:
    Events can be published externally by zmq using the event publisher config.
    """

-    publisher: Literal["null", "zmq"] | None = Field(default=None)
+    publisher: Literal["null", "zmq"] = None  # type: ignore[assignment]
    """The publisher to use for publishing kv events. Can be "null", "zmq".
    """


--- a/vllm/config/lora.py
+++ b/vllm/config/lora.py
@@ -25,8 +25,8 @@ MaxLoRARanks = Literal[1, 8, 16, 32, 64, 128, 256, 320, 512]
 LoRAExtraVocabSize = Literal[256, 512]


-@config(config=ConfigDict(arbitrary_types_allowed=True))  # type: ignore[arg-type,misc]
-class LoRAConfig:  # type: ignore[misc]
+@config(config=ConfigDict(arbitrary_types_allowed=True))
+class LoRAConfig:
    """Configuration for LoRA."""

    max_lora_rank: MaxLoRARanks = 16

--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -102,8 +102,8 @@ AttnTypeStr = Literal[
 ]


-@config(config=ConfigDict(arbitrary_types_allowed=True))  # type: ignore[arg-type,misc]
-class ModelConfig:  # type: ignore[misc]
+@config(config=ConfigDict(arbitrary_types_allowed=True))
+class ModelConfig:
    """Configuration for the model."""

    model: str = "Qwen/Qwen3-0.6B"
@@ -121,7 +121,7 @@ class ModelConfig:  # type: ignore[misc]
    """Convert the model using adapters defined in
    [vllm.model_executor.models.adapters][]. The most common use case is to
    adapt a text generation model to be used for pooling tasks."""
-    tokenizer: str = Field(default=None)  # type: ignore[assignment]
+    tokenizer: str = None  # type: ignore[assignment]
    """Name or path of the Hugging Face tokenizer to use. If unspecified, model
    name or path will be used."""
    tokenizer_mode: TokenizerMode | str = "auto"
@@ -583,7 +583,7 @@ class ModelConfig:  # type: ignore[misc]
            self.dtype,
            is_pooling_model=self.runner_type == "pooling",
            revision=self.revision,
-            config_format=self.config_format,  # type: ignore[arg-type]
+            config_format=self.config_format,
        )

        self.original_max_model_len = self.max_model_len
@@ -733,7 +733,7 @@ class ModelConfig:  # type: ignore[misc]

    @property
    def architectures(self) -> list[str]:
-        return self.model_arch_config.architectures  # type: ignore[return-value]
+        return self.model_arch_config.architectures

    @property
    def architecture(self) -> str:
@@ -1944,7 +1944,7 @@ def _get_and_verify_dtype(
    *,
    is_pooling_model: bool,
    revision: str | None = None,
-    config_format: ConfigFormat = "hf",
+    config_format: str | ConfigFormat = "hf",
 ) -> torch.dtype:
    config_dtype = ModelArchConfigConvertorBase.get_torch_dtype(
        config, model_id, revision=revision, config_format=config_format

--- a/vllm/config/model_arch.py
+++ b/vllm/config/model_arch.py
@@ -16,7 +16,7 @@ class ModelArchitectureConfig:
    Configuration for model architecture that required by vLLM runtime
    """

-    architectures: list[str] | None
+    architectures: list[str]
    """List of model architecture class names (e.g., ['LlamaForCausalLM']).
       It can be None upon calling `vllm_config.with_hf_config(config.text_config)`"""


--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -194,7 +194,7 @@ class ParallelConfig:
    threshold, microbatching will be used. Otherwise, the request will be
    processed in a single batch."""

-    disable_nccl_for_dp_synchronization: bool | None = Field(default=None)
+    disable_nccl_for_dp_synchronization: bool | None = None
    """Forces the dp synchronization logic in vllm/v1/worker/dp_utils.py 
    to use Gloo instead of NCCL for its all reduce.


--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -52,7 +52,7 @@ class SchedulerConfig:
    In real usage, this should be set in `EngineArgs.create_engine_config`.
    """

-    max_num_scheduled_tokens: int | None = Field(default=None)
+    max_num_scheduled_tokens: int | None = None
    """Maximum number of tokens that the scheduler may issue in a single iteration.
    
    This is usually equal to max_num_batched_tokens, but can be smaller in cases
@@ -122,7 +122,7 @@ class SchedulerConfig:

    # scheduler class or path. "vllm.v1.core.sched.scheduler.Scheduler"
    # (default) or "mod.custom_class".
-    scheduler_cls: str | type[object] | None = Field(default=None)
+    scheduler_cls: str | type[object] | None = None
    """The scheduler class to use. "vllm.v1.core.sched.scheduler.Scheduler" is
    the default scheduler. Can be a class directly or the path to a class of
    form "mod.custom_class"."""
@@ -141,7 +141,7 @@ class SchedulerConfig:
    checking the first chunk. Prevents over-admission and KV cache thrashing
    with chunked prefill."""

-    async_scheduling: bool | None = Field(default=None)
+    async_scheduling: bool | None = None
    """If set to False, disable async scheduling. Async scheduling helps to
    avoid gaps in GPU utilization, leading to better latency and throughput.
    """

--- a/vllm/config/utils.py
+++ b/vllm/config/utils.py
@@ -11,13 +11,13 @@ import os
 import pathlib
 import textwrap
 from collections.abc import Callable, Mapping, Sequence, Set
-from dataclasses import MISSING, dataclass, field, fields, is_dataclass
+from dataclasses import MISSING, field, fields, is_dataclass
 from itertools import pairwise
-from typing import TYPE_CHECKING, Any, Protocol, TypeVar, cast
+from typing import TYPE_CHECKING, Any, Protocol, TypeVar, cast, overload

 import torch
 from pydantic import ConfigDict
-from pydantic.dataclasses import dataclass as pydantic_dataclass
+from pydantic.dataclasses import dataclass
 from pydantic.fields import Field as PydanticField
 from pydantic.fields import FieldInfo
 from typing_extensions import dataclass_transform, runtime_checkable
@@ -36,6 +36,16 @@ ConfigType = type[DataclassInstance]
 ConfigT = TypeVar("ConfigT", bound=DataclassInstance)


+@overload
+def config(cls: type[ConfigT]) -> type[ConfigT]: ...
+
+
+@overload
+def config(
+    *, config: ConfigDict | None = None, **kwargs: Any
+) -> Callable[[type[ConfigT]], type[ConfigT]]: ...
+
+
 @dataclass_transform(field_specifiers=(PydanticField,))
 def config(
    cls: type[ConfigT] | None = None,
@@ -59,7 +69,7 @@ def config(
        merged_config.update(config)

    def decorator(cls: type[ConfigT]) -> type[ConfigT]:
-        return pydantic_dataclass(cls, config=merged_config, **kwargs)  # type: ignore[return-value]
+        return dataclass(cls, config=merged_config, **kwargs)  # type: ignore[return-value]

    # Called with arguments: @config(config=...)
    if cls is None:

--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -246,15 +246,15 @@ OPTIMIZATION_LEVEL_TO_CONFIG = {
 }


-@config(config=ConfigDict(arbitrary_types_allowed=True))  # type: ignore[arg-type,misc]
-class VllmConfig:  # type: ignore[misc]
+@config(config=ConfigDict(arbitrary_types_allowed=True))
+class VllmConfig:
    """Dataclass which contains all vllm-related configuration. This
    simplifies passing around the distinct configurations in the codebase.
    """

    # TODO: use default_factory once default constructing ModelConfig doesn't
    # try to download a model
-    model_config: ModelConfig = Field(default=None)  # type: ignore[assignment]
+    model_config: ModelConfig = None  # type: ignore[assignment]
    """Model configuration."""
    cache_config: CacheConfig = Field(default_factory=CacheConfig)
    """Cache configuration."""
@@ -912,7 +912,8 @@ class VllmConfig:  # type: ignore[misc]

                    tp_size = self.parallel_config.tensor_parallel_size
                    hidden_size = self.model_config.get_hidden_size()
-                    element_size = self.model_config.dtype.itemsize  # type: ignore[union-attr]
+                    assert isinstance(self.model_config.dtype, torch.dtype)
+                    element_size = self.model_config.dtype.itemsize
                    pass_config.sp_min_token_num = get_sequence_parallelism_threshold(
                        hidden_size, tp_size, element_size
                    )
@@ -1246,14 +1247,6 @@ class VllmConfig:  # type: ignore[misc]
                )
            self.compilation_config.debug_dump_path = env_path

-        def has_blocked_weights():  # type: ignore[no-redef]
-            if self.quant_config is not None:
-                if hasattr(self.quant_config, "weight_block_size"):
-                    return self.quant_config.weight_block_size is not None
-                elif hasattr(self.quant_config, "has_blocked_weights"):
-                    return self.quant_config.has_blocked_weights()
-            return False
-
        # Enable quant_fp8 CUDA ops (TODO disable in follow up)
        # On H100 the CUDA kernel is faster than
        # native implementation
@@ -1502,9 +1495,10 @@ class VllmConfig:  # type: ignore[misc]
            tp_size = self.parallel_config.tensor_parallel_size
            max_size = compilation_config.pass_config.flashinfer_max_size(tp_size)
            if max_size is not None:
+                assert isinstance(self.model_config.dtype, torch.dtype)
                max_token_num = max_size // (
                    self.model_config.get_hidden_size()
-                    * self.model_config.dtype.itemsize  # type: ignore[union-attr]
+                    * self.model_config.dtype.itemsize
                )
                if compile_range_end is not None and max_token_num < compile_range_end:
                    computed_compile_ranges_endpoints.append(max_token_num)
@@ -1527,7 +1521,8 @@ class VllmConfig:  # type: ignore[misc]

                tp_size = self.parallel_config.tensor_parallel_size
                hidden_size = self.model_config.get_hidden_size()
-                element_size = self.model_config.dtype.itemsize  # type: ignore[union-attr]
+                assert isinstance(self.model_config.dtype, torch.dtype)
+                element_size = self.model_config.dtype.itemsize
                pass_config.sp_min_token_num = get_sequence_parallelism_threshold(
                    hidden_size, tp_size, element_size
                )

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1935,7 +1935,7 @@ class EngineArgs:
            )

        offload_config = OffloadConfig(
-            offload_backend=self.offload_backend,  # type: ignore[arg-type]
+            offload_backend=self.offload_backend,
            uva=UVAOffloadConfig(
                cpu_offload_gb=self.cpu_offload_gb,
                cpu_offload_params=self.cpu_offload_params,

--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -409,6 +409,11 @@ class LLM:
        # Cache for __repr__ to avoid repeated collective_rpc calls
        self._cached_repr: str | None = None

+    @classmethod
+    def from_engine_args(cls, engine_args: EngineArgs) -> "LLM":
+        """Create an LLM instance from EngineArgs."""
+        return cls(**vars(engine_args))
+
    def get_tokenizer(self) -> TokenizerLike:
        return self.llm_engine.get_tokenizer()


--- a/vllm/transformers_utils/model_arch_config_convertor.py
+++ b/vllm/transformers_utils/model_arch_config_convertor.py
@@ -28,7 +28,10 @@ class ModelArchConfigConvertorBase:
        self.hf_text_config = hf_text_config

    def get_architectures(self) -> list[str]:
-        return getattr(self.hf_config, "architectures", [])
+        # Sometimes we get here from `vllm_config.with_hf_config(text_config)` where
+        # `text_config` is a sub-config from a multi-modal model. If this is the case,
+        # the sub-config will not have `architectures` and it will explicitly be `None`
+        return getattr(self.hf_config, "architectures", None) or []

    def get_num_hidden_layers(self) -> int:
        return getattr(self.hf_text_config, "num_hidden_layers", 0)
@@ -128,7 +131,7 @@ class ModelArchConfigConvertorBase:
        hf_config: PretrainedConfig,
        model_id: str,
        revision: str | None,
-        config_format: ConfigFormat,
+        config_format: str | ConfigFormat,
    ):
        # NOTE: getattr(config, "dtype", torch.float32) is not correct
        # because config.dtype can be None.