Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ffbcc9e7
Unverified
Commit
ffbcc9e7
authored
Jul 10, 2025
by
Nick Hill
Committed by
GitHub
Jul 10, 2025
Browse files
[BugFix] Fix `VllmConfig()` construction on all platforms (#20695)
Signed-off-by:
Nick Hill
<
nhill@redhat.com
>
parent
59389c92
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
19 additions
and
16 deletions
+19
-16
vllm/config.py
vllm/config.py
+0
-1
vllm/platforms/cpu.py
vllm/platforms/cpu.py
+4
-3
vllm/platforms/cuda.py
vllm/platforms/cuda.py
+5
-3
vllm/platforms/tpu.py
vllm/platforms/tpu.py
+6
-4
vllm/platforms/xpu.py
vllm/platforms/xpu.py
+4
-5
No files found.
vllm/config.py
View file @
ffbcc9e7
...
...
@@ -4722,7 +4722,6 @@ class VllmConfig:
# calculate the default `batch_size_capture_list`
if
not
envs
.
VLLM_USE_V1
:
batch_size_capture_list
=
[]
max_batchsize_to_capture
=
0
if
self
.
scheduler_config
is
not
None
and
\
self
.
model_config
is
not
None
and
\
not
self
.
model_config
.
enforce_eager
:
...
...
vllm/platforms/cpu.py
View file @
ffbcc9e7
...
...
@@ -96,7 +96,8 @@ class CpuPlatform(Platform):
from
vllm.utils
import
GiB_bytes
model_config
=
vllm_config
.
model_config
model_config
.
disable_cascade_attn
=
True
if
model_config
is
not
None
:
model_config
.
disable_cascade_attn
=
True
cache_config
=
vllm_config
.
cache_config
...
...
@@ -123,7 +124,7 @@ class CpuPlatform(Platform):
"CPU backend doesn't support fp8_e4m3 KV cache type, "
"cast to fp8_e5m2."
)
if
(
cache_config
.
cache_dtype
!=
"auto"
if
(
cache_config
.
cache_dtype
!=
"auto"
and
model_config
is
not
None
and
model_config
.
dtype
==
torch
.
half
):
logger
.
warning
(
"FP8 KV cache on the CPU backend only does not"
" support fp16 for now, cast to bf16."
)
...
...
@@ -229,7 +230,7 @@ class CpuPlatform(Platform):
os
.
environ
[
"LOCAL_WORLD_SIZE"
]
=
str
(
vllm_config
.
parallel_config
.
tensor_parallel_size
)
if
vllm_config
.
model_config
and
vllm_config
.
model_config
.
use_mla
:
if
model_config
is
not
None
and
model_config
.
use_mla
:
logger
.
info
(
"MLA is enabled on a non-GPU platform; forcing chunked "
"prefill and prefix caching to be disabled."
)
...
...
vllm/platforms/cuda.py
View file @
ffbcc9e7
...
...
@@ -166,17 +166,19 @@ class CudaPlatformBase(Platform):
logger
.
info
(
"Forcing kv cache block size to 64 for FlashMLA backend."
)
compilation_config
=
vllm_config
.
compilation_config
if
(
envs
.
VLLM_ALL2ALL_BACKEND
==
"deepep_high_throughput"
and
parallel_config
.
data_parallel_size
>
1
and
vllm_config
.
compilation_config
.
use_cudagraph
):
and
compilation_config
.
use_cudagraph
):
logger
.
info
(
"Data Parallel: Forcing enforce eager to be True since DP "
"with DeepEP high-throughput kernels are not CUDA Graph "
"compatible. The DeepEP low-latency kernels are CUDA Graph "
"compatible. Set the all_to_all backend to deepep_low_latency "
"to use those kernels instead."
)
vllm_config
.
compilation_config
.
use_cudagraph
=
False
vllm_config
.
model_config
.
enforce_eager
=
True
compilation_config
.
use_cudagraph
=
False
if
model_config
is
not
None
:
model_config
.
enforce_eager
=
True
# TODO (varun): Turning this ON gives incorrect results for the
# Deepseek-V2-lite model.
vllm_config
.
compilation_config
.
use_inductor
=
False
...
...
vllm/platforms/tpu.py
View file @
ffbcc9e7
...
...
@@ -116,11 +116,13 @@ class TpuPlatform(Platform):
assert
vllm_config
.
speculative_config
is
None
,
\
"TPU does not support speculative decoding"
if
vllm_config
.
model_config
.
dtype
in
(
torch
.
float16
,
torch
.
float32
):
model_config
=
vllm_config
.
model_config
if
model_config
is
not
None
and
model_config
.
dtype
in
(
torch
.
float16
,
torch
.
float32
):
logger
.
warning
(
"The TPU backend currently does not support %s. "
"Using bfloat16 instead."
,
vllm_config
.
model_config
.
dtype
)
vllm_config
.
model_config
.
dtype
=
torch
.
bfloat16
"Using bfloat16 instead."
,
model_config
.
dtype
)
model_config
.
dtype
=
torch
.
bfloat16
from
vllm.v1.attention.backends.pallas
import
PallasAttentionBackend
cache_config
.
block_size
=
PallasAttentionBackend
.
get_page_size
(
...
...
@@ -146,7 +148,7 @@ class TpuPlatform(Platform):
"Forcing --disable_chunked_mm_input."
)
scheduler_config
.
disable_chunked_mm_input
=
True
if
vllm_config
.
model_config
and
vllm_config
.
model_config
.
use_mla
:
if
model_config
and
model_config
.
use_mla
:
logger
.
info
(
"MLA is enabled on a non-GPU platform; forcing chunked "
"prefill and prefix caching to be disabled."
)
...
...
vllm/platforms/xpu.py
View file @
ffbcc9e7
...
...
@@ -85,14 +85,14 @@ class XPUPlatform(Platform):
@
classmethod
def
check_and_update_config
(
cls
,
vllm_config
:
VllmConfig
)
->
None
:
cache_config
=
vllm_config
.
cache_config
model_config
=
vllm_config
.
model_config
# in V1(or with ipex chunked prefill) block_size is 64
if
cache_config
and
cache_config
.
block_size
is
None
:
cache_config
.
block_size
=
64
# FIXME: Temporarily forcing eager mode
# remove after t.compile support stabilizes.
if
(
envs
.
VLLM_USE_V1
and
vllm_config
.
model_config
is
not
None
if
(
envs
.
VLLM_USE_V1
and
model_config
is
not
None
and
not
vllm_config
.
model_config
.
enforce_eager
):
from
vllm.config
import
CompilationLevel
vllm_config
.
compilation_config
.
level
=
CompilationLevel
.
NO_COMPILATION
# noqa: E501
...
...
@@ -100,8 +100,7 @@ class XPUPlatform(Platform):
# Instances created using VllmConfig() typically have model_config as
# None by default, so check for None before inspecting or updating the
# model config to avoid errors from dereferencing None.
if
vllm_config
.
model_config
is
not
None
:
model_config
=
vllm_config
.
model_config
if
model_config
is
not
None
:
if
model_config
.
dtype
==
torch
.
bfloat16
:
bf16_supported
=
cls
.
device_support_bf16
()
if
not
bf16_supported
:
...
...
@@ -139,7 +138,7 @@ class XPUPlatform(Platform):
parallel_config
.
distributed_executor_backend
)
parallel_config
.
distributed_executor_backend
=
"ray"
if
vllm_config
.
model_config
and
vllm_config
.
model_config
.
use_mla
:
if
model_config
and
model_config
.
use_mla
:
logger
.
info
(
"MLA is enabled on a non-GPU platform; forcing chunked "
"prefill and prefix caching to be disabled."
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment