Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
721fb9b1
Unverified
Commit
721fb9b1
authored
May 23, 2025
by
wangxiyuan
Committed by
GitHub
May 22, 2025
Browse files
[Platform] Move platform check to right place (#18470)
Signed-off-by:
wangxiyuan
<
wangxiyuan1007@gmail.com
>
parent
1f3a1200
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
71 additions
and
25 deletions
+71
-25
vllm/config.py
vllm/config.py
+10
-25
vllm/platforms/cpu.py
vllm/platforms/cpu.py
+11
-0
vllm/platforms/hpu.py
vllm/platforms/hpu.py
+11
-0
vllm/platforms/neuron.py
vllm/platforms/neuron.py
+11
-0
vllm/platforms/tpu.py
vllm/platforms/tpu.py
+11
-0
vllm/platforms/xpu.py
vllm/platforms/xpu.py
+11
-0
vllm/utils.py
vllm/utils.py
+6
-0
No files found.
vllm/config.py
View file @
721fb9b1
...
@@ -42,7 +42,10 @@ from vllm.transformers_utils.config import (
...
@@ -42,7 +42,10 @@ from vllm.transformers_utils.config import (
try_get_generation_config
,
uses_mrope
)
try_get_generation_config
,
uses_mrope
)
from
vllm.transformers_utils.s3_utils
import
S3Model
from
vllm.transformers_utils.s3_utils
import
S3Model
from
vllm.transformers_utils.utils
import
is_s3
,
maybe_model_redirect
from
vllm.transformers_utils.utils
import
is_s3
,
maybe_model_redirect
from
vllm.utils
import
(
GiB_bytes
,
LayerBlockType
,
cuda_device_count_stateless
,
from
vllm.utils
import
(
DEFAULT_MAX_NUM_BATCHED_TOKENS
,
MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS
,
POOLING_MODEL_MAX_NUM_BATCHED_TOKENS
,
GiB_bytes
,
LayerBlockType
,
cuda_device_count_stateless
,
get_cpu_memory
,
get_open_port
,
is_torch_equal_or_newer
,
get_cpu_memory
,
get_open_port
,
is_torch_equal_or_newer
,
random_uuid
,
resolve_obj_by_qualname
)
random_uuid
,
resolve_obj_by_qualname
)
...
@@ -64,12 +67,6 @@ logger = init_logger(__name__)
...
@@ -64,12 +67,6 @@ logger = init_logger(__name__)
ConfigT
=
TypeVar
(
"ConfigT"
,
bound
=
ConfigType
)
ConfigT
=
TypeVar
(
"ConfigT"
,
bound
=
ConfigType
)
# This value is chosen to have a balance between ITL and TTFT. Note it is
# not optimized for throughput.
_DEFAULT_MAX_NUM_BATCHED_TOKENS
=
2048
_POOLING_MODEL_MAX_NUM_BATCHED_TOKENS
=
32768
_MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS
=
5120
TaskOption
=
Literal
[
"auto"
,
"generate"
,
"embedding"
,
"embed"
,
"classify"
,
TaskOption
=
Literal
[
"auto"
,
"generate"
,
"embedding"
,
"embed"
,
"classify"
,
"score"
,
"reward"
,
"transcription"
]
"score"
,
"reward"
,
"transcription"
]
...
@@ -2074,28 +2071,28 @@ class SchedulerConfig:
...
@@ -2074,28 +2071,28 @@ class SchedulerConfig:
# so we don't reject sequences on account of a short
# so we don't reject sequences on account of a short
# max_num_batched_tokens.
# max_num_batched_tokens.
self
.
max_num_batched_tokens
=
max
(
self
.
max_num_batched_tokens
=
max
(
self
.
max_model_len
,
_
DEFAULT_MAX_NUM_BATCHED_TOKENS
)
self
.
max_model_len
,
DEFAULT_MAX_NUM_BATCHED_TOKENS
)
else
:
else
:
self
.
max_num_batched_tokens
=
(
self
.
max_num_batched_tokens
=
(
_
DEFAULT_MAX_NUM_BATCHED_TOKENS
)
DEFAULT_MAX_NUM_BATCHED_TOKENS
)
else
:
else
:
# If max_model_len is too short, use
# If max_model_len is too short, use
#
_
DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value
# DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value
# for higher throughput.
# for higher throughput.
self
.
max_num_batched_tokens
=
max
(
self
.
max_num_batched_tokens
=
max
(
self
.
max_model_len
,
_
DEFAULT_MAX_NUM_BATCHED_TOKENS
)
self
.
max_model_len
,
DEFAULT_MAX_NUM_BATCHED_TOKENS
)
if
self
.
runner_type
==
"pooling"
:
if
self
.
runner_type
==
"pooling"
:
# Choose specific value for higher throughput
# Choose specific value for higher throughput
self
.
max_num_batched_tokens
=
max
(
self
.
max_num_batched_tokens
=
max
(
self
.
max_num_batched_tokens
,
self
.
max_num_batched_tokens
,
_
POOLING_MODEL_MAX_NUM_BATCHED_TOKENS
,
POOLING_MODEL_MAX_NUM_BATCHED_TOKENS
,
)
)
if
self
.
is_multimodal_model
:
if
self
.
is_multimodal_model
:
# The value needs to be at least the number of multimodal tokens
# The value needs to be at least the number of multimodal tokens
self
.
max_num_batched_tokens
=
max
(
self
.
max_num_batched_tokens
=
max
(
self
.
max_num_batched_tokens
,
self
.
max_num_batched_tokens
,
_
MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS
,
MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS
,
)
)
# When using default settings,
# When using default settings,
...
@@ -4316,18 +4313,6 @@ class VllmConfig:
...
@@ -4316,18 +4313,6 @@ class VllmConfig:
"full_cuda_graph is not supported with "
"full_cuda_graph is not supported with "
"cascade attention. Disabling cascade attention."
)
"cascade attention. Disabling cascade attention."
)
self
.
model_config
.
disable_cascade_attn
=
True
self
.
model_config
.
disable_cascade_attn
=
True
if
self
.
model_config
and
self
.
model_config
.
use_mla
and
\
not
(
current_platform
.
is_cuda
()
or
current_platform
.
is_rocm
()):
logger
.
info
(
"MLA is enabled on a non-GPU platform; forcing chunked "
"prefill and prefix caching to be disabled."
)
self
.
scheduler_config
.
enable_chunked_prefill
=
False
self
.
scheduler_config
.
chunked_prefill_enabled
=
False
self
.
scheduler_config
.
max_num_batched_tokens
=
max
(
self
.
scheduler_config
.
max_model_len
,
_DEFAULT_MAX_NUM_BATCHED_TOKENS
)
if
self
.
cache_config
is
not
None
:
if
self
.
cache_config
is
not
None
:
self
.
cache_config
.
enable_prefix_caching
=
False
self
.
cache_config
.
enable_prefix_caching
=
False
...
...
vllm/platforms/cpu.py
View file @
721fb9b1
...
@@ -9,6 +9,7 @@ import psutil
...
@@ -9,6 +9,7 @@ import psutil
import
torch
import
torch
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.utils
import
DEFAULT_MAX_NUM_BATCHED_TOKENS
from
.interface
import
CpuArchEnum
,
Platform
,
PlatformEnum
,
_Backend
from
.interface
import
CpuArchEnum
,
Platform
,
PlatformEnum
,
_Backend
...
@@ -177,6 +178,16 @@ class CpuPlatform(Platform):
...
@@ -177,6 +178,16 @@ class CpuPlatform(Platform):
" set VLLM_WORKER_MULTIPROC_METHOD to fork explicitly."
)
" set VLLM_WORKER_MULTIPROC_METHOD to fork explicitly."
)
os
.
environ
[
'VLLM_WORKER_MULTIPROC_METHOD'
]
=
'spawn'
os
.
environ
[
'VLLM_WORKER_MULTIPROC_METHOD'
]
=
'spawn'
if
vllm_config
.
model_config
and
vllm_config
.
model_config
.
use_mla
:
logger
.
info
(
"MLA is enabled on a non-GPU platform; forcing chunked "
"prefill and prefix caching to be disabled."
)
vllm_config
.
scheduler_config
.
enable_chunked_prefill
=
False
vllm_config
.
scheduler_config
.
chunked_prefill_enabled
=
False
vllm_config
.
scheduler_config
.
max_num_batched_tokens
=
max
(
vllm_config
.
scheduler_config
.
max_model_len
,
DEFAULT_MAX_NUM_BATCHED_TOKENS
)
@
classmethod
@
classmethod
def
is_pin_memory_available
(
cls
)
->
bool
:
def
is_pin_memory_available
(
cls
)
->
bool
:
logger
.
warning
(
"Pin memory is not supported on CPU."
)
logger
.
warning
(
"Pin memory is not supported on CPU."
)
...
...
vllm/platforms/hpu.py
View file @
721fb9b1
...
@@ -7,6 +7,7 @@ import torch
...
@@ -7,6 +7,7 @@ import torch
from
vllm
import
envs
from
vllm
import
envs
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.utils
import
DEFAULT_MAX_NUM_BATCHED_TOKENS
from
.interface
import
Platform
,
PlatformEnum
,
_Backend
from
.interface
import
Platform
,
PlatformEnum
,
_Backend
...
@@ -80,6 +81,16 @@ class HpuPlatform(Platform):
...
@@ -80,6 +81,16 @@ class HpuPlatform(Platform):
"VLLM_WORKER_MULTIPROC_METHOD=fork explicitly."
)
"VLLM_WORKER_MULTIPROC_METHOD=fork explicitly."
)
os
.
environ
[
"VLLM_WORKER_MULTIPROC_METHOD"
]
=
"spawn"
os
.
environ
[
"VLLM_WORKER_MULTIPROC_METHOD"
]
=
"spawn"
if
vllm_config
.
model_config
and
vllm_config
.
model_config
.
use_mla
:
logger
.
info
(
"MLA is enabled on a non-GPU platform; forcing chunked "
"prefill and prefix caching to be disabled."
)
vllm_config
.
scheduler_config
.
enable_chunked_prefill
=
False
vllm_config
.
scheduler_config
.
chunked_prefill_enabled
=
False
vllm_config
.
scheduler_config
.
max_num_batched_tokens
=
max
(
vllm_config
.
scheduler_config
.
max_model_len
,
DEFAULT_MAX_NUM_BATCHED_TOKENS
)
@
classmethod
@
classmethod
def
is_pin_memory_available
(
cls
):
def
is_pin_memory_available
(
cls
):
logger
.
warning
(
"Pin memory is not supported on HPU."
)
logger
.
warning
(
"Pin memory is not supported on HPU."
)
...
...
vllm/platforms/neuron.py
View file @
721fb9b1
...
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Optional
...
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Optional
from
vllm
import
envs
from
vllm
import
envs
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.utils
import
DEFAULT_MAX_NUM_BATCHED_TOKENS
from
.interface
import
Platform
,
PlatformEnum
from
.interface
import
Platform
,
PlatformEnum
...
@@ -56,6 +57,16 @@ class NeuronPlatform(Platform):
...
@@ -56,6 +57,16 @@ class NeuronPlatform(Platform):
vllm_config
.
cache_config
.
block_size
=
\
vllm_config
.
cache_config
.
block_size
=
\
vllm_config
.
model_config
.
max_model_len
# type: ignore
vllm_config
.
model_config
.
max_model_len
# type: ignore
if
vllm_config
.
model_config
and
vllm_config
.
model_config
.
use_mla
:
logger
.
info
(
"MLA is enabled on a non-GPU platform; forcing chunked "
"prefill and prefix caching to be disabled."
)
vllm_config
.
scheduler_config
.
enable_chunked_prefill
=
False
vllm_config
.
scheduler_config
.
chunked_prefill_enabled
=
False
vllm_config
.
scheduler_config
.
max_num_batched_tokens
=
max
(
vllm_config
.
scheduler_config
.
max_model_len
,
DEFAULT_MAX_NUM_BATCHED_TOKENS
)
@
classmethod
@
classmethod
def
is_pin_memory_available
(
cls
)
->
bool
:
def
is_pin_memory_available
(
cls
)
->
bool
:
logger
.
warning
(
"Pin memory is not supported on Neuron."
)
logger
.
warning
(
"Pin memory is not supported on Neuron."
)
...
...
vllm/platforms/tpu.py
View file @
721fb9b1
...
@@ -9,6 +9,7 @@ import vllm.envs as envs
...
@@ -9,6 +9,7 @@ import vllm.envs as envs
from
vllm.inputs
import
ProcessorInputs
,
PromptType
from
vllm.inputs
import
ProcessorInputs
,
PromptType
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.sampling_params
import
SamplingParams
,
SamplingType
from
vllm.sampling_params
import
SamplingParams
,
SamplingType
from
vllm.utils
import
DEFAULT_MAX_NUM_BATCHED_TOKENS
from
.interface
import
Platform
,
PlatformEnum
,
_Backend
from
.interface
import
Platform
,
PlatformEnum
,
_Backend
...
@@ -161,6 +162,16 @@ class TpuPlatform(Platform):
...
@@ -161,6 +162,16 @@ class TpuPlatform(Platform):
"Forcing --disable_chunked_mm_input."
)
"Forcing --disable_chunked_mm_input."
)
scheduler_config
.
disable_chunked_mm_input
=
True
scheduler_config
.
disable_chunked_mm_input
=
True
if
vllm_config
.
model_config
and
vllm_config
.
model_config
.
use_mla
:
logger
.
info
(
"MLA is enabled on a non-GPU platform; forcing chunked "
"prefill and prefix caching to be disabled."
)
vllm_config
.
scheduler_config
.
enable_chunked_prefill
=
False
vllm_config
.
scheduler_config
.
chunked_prefill_enabled
=
False
vllm_config
.
scheduler_config
.
max_num_batched_tokens
=
max
(
vllm_config
.
scheduler_config
.
max_model_len
,
DEFAULT_MAX_NUM_BATCHED_TOKENS
)
@
classmethod
@
classmethod
def
is_pin_memory_available
(
cls
):
def
is_pin_memory_available
(
cls
):
logger
.
warning
(
"Pin memory is not supported on TPU."
)
logger
.
warning
(
"Pin memory is not supported on TPU."
)
...
...
vllm/platforms/xpu.py
View file @
721fb9b1
...
@@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, Optional
...
@@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, Optional
import
torch
import
torch
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.utils
import
DEFAULT_MAX_NUM_BATCHED_TOKENS
from
.interface
import
DeviceCapability
,
Platform
,
PlatformEnum
,
_Backend
from
.interface
import
DeviceCapability
,
Platform
,
PlatformEnum
,
_Backend
...
@@ -113,6 +114,16 @@ class XPUPlatform(Platform):
...
@@ -113,6 +114,16 @@ class XPUPlatform(Platform):
parallel_config
.
distributed_executor_backend
)
parallel_config
.
distributed_executor_backend
)
parallel_config
.
distributed_executor_backend
=
"ray"
parallel_config
.
distributed_executor_backend
=
"ray"
if
vllm_config
.
model_config
and
vllm_config
.
model_config
.
use_mla
:
logger
.
info
(
"MLA is enabled on a non-GPU platform; forcing chunked "
"prefill and prefix caching to be disabled."
)
vllm_config
.
scheduler_config
.
enable_chunked_prefill
=
False
vllm_config
.
scheduler_config
.
chunked_prefill_enabled
=
False
vllm_config
.
scheduler_config
.
max_num_batched_tokens
=
max
(
vllm_config
.
scheduler_config
.
max_model_len
,
DEFAULT_MAX_NUM_BATCHED_TOKENS
)
@
classmethod
@
classmethod
def
is_pin_memory_available
(
cls
):
def
is_pin_memory_available
(
cls
):
logger
.
warning
(
"Pin memory is not supported on XPU."
)
logger
.
warning
(
"Pin memory is not supported on XPU."
)
...
...
vllm/utils.py
View file @
721fb9b1
...
@@ -77,6 +77,12 @@ if TYPE_CHECKING:
...
@@ -77,6 +77,12 @@ if TYPE_CHECKING:
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
# This value is chosen to have a balance between ITL and TTFT. Note it is
# not optimized for throughput.
DEFAULT_MAX_NUM_BATCHED_TOKENS
=
2048
POOLING_MODEL_MAX_NUM_BATCHED_TOKENS
=
32768
MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS
=
5120
# Exception strings for non-implemented encoder/decoder scenarios
# Exception strings for non-implemented encoder/decoder scenarios
# Reminder: Please update docs/source/features/compatibility_matrix.md
# Reminder: Please update docs/source/features/compatibility_matrix.md
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment