[BugFix] Fix `VllmConfig()` construction on all platforms (#20695)

Signed-off-by: Nick Hill <nhill@redhat.com>

[BugFix] Fix `VllmConfig()` construction on all platforms (#20695)
Signed-off-by: Nick Hill <nhill@redhat.com>
ffbcc9e7 · Nick Hill · GitHub · 59389c92 · ffbcc9e7 · ffbcc9e7
Unverified Commit ffbcc9e7 authored Jul 10, 2025 by Nick Hill Committed by GitHub Jul 10, 2025
5 changed files
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -4722,7 +4722,6 @@ class VllmConfig:
        # calculate the default `batch_size_capture_list`
        if not envs.VLLM_USE_V1:
            batch_size_capture_list = []
-            max_batchsize_to_capture = 0
            if self.scheduler_config is not None and \
                self.model_config is not None and \
                    not self.model_config.enforce_eager:

--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -96,7 +96,8 @@ class CpuPlatform(Platform):
        from vllm.utils import GiB_bytes
        model_config = vllm_config.model_config

-        model_config.disable_cascade_attn = True
+        if model_config is not None:
+            model_config.disable_cascade_attn = True

        cache_config = vllm_config.cache_config

@@ -123,7 +124,7 @@ class CpuPlatform(Platform):
                "CPU backend doesn't support fp8_e4m3 KV cache type, "
                "cast to fp8_e5m2.")

-        if (cache_config.cache_dtype != "auto"
+        if (cache_config.cache_dtype != "auto" and model_config is not None
                and model_config.dtype == torch.half):
            logger.warning("FP8 KV cache on the CPU backend only does not"
                           " support fp16 for now, cast to bf16.")
@@ -229,7 +230,7 @@ class CpuPlatform(Platform):
        os.environ["LOCAL_WORLD_SIZE"] = str(
            vllm_config.parallel_config.tensor_parallel_size)

-        if vllm_config.model_config and vllm_config.model_config.use_mla:
+        if model_config is not None and model_config.use_mla:
            logger.info(
                "MLA is enabled on a non-GPU platform; forcing chunked "
                "prefill and prefix caching to be disabled.")

--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -166,17 +166,19 @@ class CudaPlatformBase(Platform):
                logger.info(
                    "Forcing kv cache block size to 64 for FlashMLA backend.")

+        compilation_config = vllm_config.compilation_config
        if (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput"
                and parallel_config.data_parallel_size > 1
-                and vllm_config.compilation_config.use_cudagraph):
+                and compilation_config.use_cudagraph):
            logger.info(
                "Data Parallel: Forcing enforce eager to be True since DP "
                "with DeepEP high-throughput kernels are not CUDA Graph "
                "compatible. The DeepEP low-latency kernels are CUDA Graph "
                "compatible. Set the all_to_all backend to deepep_low_latency "
                "to use those kernels instead.")
-            vllm_config.compilation_config.use_cudagraph = False
-            vllm_config.model_config.enforce_eager = True
+            compilation_config.use_cudagraph = False
+            if model_config is not None:
+                model_config.enforce_eager = True
            # TODO (varun): Turning this ON gives incorrect results for the
            # Deepseek-V2-lite model.
            vllm_config.compilation_config.use_inductor = False

--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -116,11 +116,13 @@ class TpuPlatform(Platform):
        assert vllm_config.speculative_config is None, \
            "TPU does not support speculative decoding"

-        if vllm_config.model_config.dtype in (torch.float16, torch.float32):
+        model_config = vllm_config.model_config
+        if model_config is not None and model_config.dtype in (torch.float16,
+                                                               torch.float32):
            logger.warning(
                "The TPU backend currently does not support %s. "
-                "Using bfloat16 instead.", vllm_config.model_config.dtype)
-            vllm_config.model_config.dtype = torch.bfloat16
+                "Using bfloat16 instead.", model_config.dtype)
+            model_config.dtype = torch.bfloat16

        from vllm.v1.attention.backends.pallas import PallasAttentionBackend
        cache_config.block_size = PallasAttentionBackend.get_page_size(
@@ -146,7 +148,7 @@ class TpuPlatform(Platform):
            "Forcing --disable_chunked_mm_input.")
            scheduler_config.disable_chunked_mm_input = True

-        if vllm_config.model_config and vllm_config.model_config.use_mla:
+        if model_config and model_config.use_mla:
            logger.info(
                "MLA is enabled on a non-GPU platform; forcing chunked "
                "prefill and prefix caching to be disabled.")

--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -85,14 +85,14 @@ class XPUPlatform(Platform):
    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        cache_config = vllm_config.cache_config
+        model_config = vllm_config.model_config
        # in V1(or with ipex chunked prefill) block_size is 64
        if cache_config and cache_config.block_size is None:
            cache_config.block_size = 64

        # FIXME: Temporarily forcing eager mode
        # remove after t.compile support stabilizes.
-
-        if (envs.VLLM_USE_V1 and vllm_config.model_config is not None
+        if (envs.VLLM_USE_V1 and model_config is not None
                and not vllm_config.model_config.enforce_eager):
            from vllm.config import CompilationLevel
            vllm_config.compilation_config.level = CompilationLevel.NO_COMPILATION  # noqa: E501
@@ -100,8 +100,7 @@ class XPUPlatform(Platform):
        # Instances created using VllmConfig() typically have model_config as
        # None by default. The modification involves adding a check to prevent
        # potential null exceptions check and update model config.
-        if vllm_config.model_config is not None:
-            model_config = vllm_config.model_config
+        if model_config is not None:
            if model_config.dtype == torch.bfloat16:
                bf16_supported = cls.device_support_bf16()
                if not bf16_supported:
@@ -139,7 +138,7 @@ class XPUPlatform(Platform):
                parallel_config.distributed_executor_backend)
            parallel_config.distributed_executor_backend = "ray"

-        if vllm_config.model_config and vllm_config.model_config.use_mla:
+        if model_config and model_config.use_mla:
            logger.info(
                "MLA is enabled on a non-GPU platform; forcing chunked "
                "prefill and prefix caching to be disabled.")