Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
647214f3
Unverified
Commit
647214f3
authored
Oct 21, 2025
by
Nick Hill
Committed by
GitHub
Oct 21, 2025
Browse files
[V0 Deprecation] Remove V0 executors (#27142)
Signed-off-by:
Nick Hill
<
nhill@redhat.com
>
parent
ddeec11b
Changes
31
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
37 additions
and
510 deletions
+37
-510
tests/basic_correctness/test_basic_correctness.py
tests/basic_correctness/test_basic_correctness.py
+2
-4
tests/distributed/test_multi_node_assignment.py
tests/distributed/test_multi_node_assignment.py
+1
-1
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+1
-3
tests/model_executor/model_loader/tensorizer_loader/conftest.py
...model_executor/model_loader/tensorizer_loader/conftest.py
+1
-1
tests/v1/engine/test_engine_core.py
tests/v1/engine/test_engine_core.py
+2
-1
tools/pre_commit/check_pickle_imports.py
tools/pre_commit/check_pickle_imports.py
+4
-4
vllm/__init__.py
vllm/__init__.py
+2
-2
vllm/config/parallel.py
vllm/config/parallel.py
+14
-8
vllm/config/scheduler.py
vllm/config/scheduler.py
+0
-6
vllm/distributed/device_communicators/tpu_communicator.py
vllm/distributed/device_communicators/tpu_communicator.py
+1
-1
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+3
-4
vllm/entrypoints/cli/serve.py
vllm/entrypoints/cli/serve.py
+1
-1
vllm/envs.py
vllm/envs.py
+1
-22
vllm/executor/__init__.py
vllm/executor/__init__.py
+0
-0
vllm/executor/executor_base.py
vllm/executor/executor_base.py
+0
-393
vllm/executor/msgspec_utils.py
vllm/executor/msgspec_utils.py
+0
-36
vllm/sequence.py
vllm/sequence.py
+0
-10
vllm/transformers_utils/config.py
vllm/transformers_utils/config.py
+1
-1
vllm/v1/engine/async_llm.py
vllm/v1/engine/async_llm.py
+1
-1
vllm/v1/engine/core.py
vllm/v1/engine/core.py
+2
-11
No files found.
tests/basic_correctness/test_basic_correctness.py
View file @
647214f3
...
...
@@ -157,11 +157,9 @@ def test_models_distributed(
and
distributed_executor_backend
==
"ray"
and
attention_backend
==
""
and
test_suite
==
"L4"
and
enable_prompt_embeds
):
# noqa
if
enable_prompt_embeds
:
pytest
.
skip
(
"enable_prompt_embeds does not work with ray compiled dag."
)
monkeypatch_context
.
setenv
(
"VLLM_USE_RAY_SPMD_WORKER"
,
"1"
)
monkeypatch_context
.
setenv
(
"VLLM_USE_RAY_COMPILED_DAG"
,
"1"
)
if
attention_backend
:
monkeypatch_context
.
setenv
(
...
...
tests/distributed/test_multi_node_assignment.py
View file @
647214f3
...
...
@@ -18,8 +18,8 @@ from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
from
vllm
import
initialize_ray_cluster
from
vllm.config
import
ParallelConfig
from
vllm.executor.ray_utils
import
_wait_until_pg_removed
from
vllm.utils.network_utils
import
get_ip
from
vllm.v1.executor.ray_utils
import
_wait_until_pg_removed
VLLM_MULTI_NODE
=
os
.
getenv
(
"VLLM_MULTI_NODE"
,
"0"
)
==
"1"
...
...
tests/distributed/test_pipeline_parallel.py
View file @
647214f3
...
...
@@ -305,10 +305,8 @@ def _compare_tp(
common_args
.
extend
([
"--max-num-seqs"
,
f
"
{
max_num_seqs
}
"
])
if
distributed_backend
==
"ray"
:
#
For V1, t
est Ray Compiled Graph for all the tests
#
T
est Ray Compiled Graph for all the tests
pp_env
=
{
"VLLM_USE_RAY_COMPILED_DAG"
:
"1"
,
"VLLM_USE_RAY_SPMD_WORKER"
:
"1"
,
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL"
:
"1"
,
}
# Temporary. Currently when zeromq + SPMD is used, it does not properly
...
...
tests/model_executor/model_loader/tensorizer_loader/conftest.py
View file @
647214f3
...
...
@@ -9,7 +9,7 @@ from vllm.distributed import cleanup_dist_env_and_memory
from
vllm.model_executor.model_loader
import
tensorizer
as
tensorizer_mod
from
vllm.model_executor.model_loader.tensorizer
import
TensorizerConfig
from
vllm.utils.network_utils
import
get_distributed_init_method
,
get_ip
,
get_open_port
from
vllm.v1.executor
.abstract
import
UniProcExecutor
from
vllm.v1.executor
import
UniProcExecutor
from
vllm.v1.worker.worker_base
import
WorkerWrapperBase
MODEL_REF
=
"facebook/opt-125m"
...
...
tests/v1/engine/test_engine_core.py
View file @
647214f3
...
...
@@ -15,7 +15,8 @@ from vllm.platforms import current_platform
from
vllm.utils.torch_utils
import
set_default_torch_num_threads
from
vllm.v1.engine
import
EngineCoreRequest
from
vllm.v1.engine.core
import
EngineCore
from
vllm.v1.executor.abstract
import
Executor
,
UniProcExecutor
from
vllm.v1.executor.abstract
import
Executor
from
vllm.v1.executor.uniproc_executor
import
UniProcExecutor
from
vllm.v1.kv_cache_interface
import
KVCacheConfig
from
vllm.v1.outputs
import
ModelRunnerOutput
...
...
tools/pre_commit/check_pickle_imports.py
View file @
647214f3
...
...
@@ -17,8 +17,6 @@ import regex as re
# add to this list if absolutely necessary and after careful security review.
ALLOWED_FILES
=
{
# pickle
"vllm/v1/serial_utils.py"
,
"vllm/v1/executor/multiproc_executor.py"
,
"vllm/multimodal/hasher.py"
,
"vllm/transformers_utils/config.py"
,
"vllm/model_executor/models/registry.py"
,
...
...
@@ -38,11 +36,13 @@ ALLOWED_FILES = {
"benchmarks/cutlass_benchmarks/w8a8_benchmarks.py"
,
"benchmarks/cutlass_benchmarks/sparse_benchmarks.py"
,
# cloudpickle
"vllm/executor/m
p_distributed
_executor.py"
,
"vllm/executor/ray_
distributed_
executor.py"
,
"vllm/
v1/
executor/m
ultiproc
_executor.py"
,
"vllm/
v1/
executor/ray_executor.py"
,
"vllm/entrypoints/llm.py"
,
"vllm/utils/__init__.py"
,
"tests/utils.py"
,
# pickle and cloudpickle
"vllm/v1/serial_utils.py"
,
}
PICKLE_RE
=
re
.
compile
(
...
...
vllm/__init__.py
View file @
647214f3
...
...
@@ -21,7 +21,7 @@ MODULE_ATTRS = {
"AsyncLLMEngine"
:
".engine.async_llm_engine:AsyncLLMEngine"
,
"LLMEngine"
:
".engine.llm_engine:LLMEngine"
,
"LLM"
:
".entrypoints.llm:LLM"
,
"initialize_ray_cluster"
:
".executor.ray_utils:initialize_ray_cluster"
,
"initialize_ray_cluster"
:
".
v1.
executor.ray_utils:initialize_ray_cluster"
,
"PromptType"
:
".inputs:PromptType"
,
"TextPrompt"
:
".inputs:TextPrompt"
,
"TokensPrompt"
:
".inputs:TokensPrompt"
,
...
...
@@ -45,7 +45,6 @@ if typing.TYPE_CHECKING:
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.entrypoints.llm
import
LLM
from
vllm.executor.ray_utils
import
initialize_ray_cluster
from
vllm.inputs
import
PromptType
,
TextPrompt
,
TokensPrompt
from
vllm.model_executor.models
import
ModelRegistry
from
vllm.outputs
import
(
...
...
@@ -62,6 +61,7 @@ if typing.TYPE_CHECKING:
)
from
vllm.pooling_params
import
PoolingParams
from
vllm.sampling_params
import
SamplingParams
from
vllm.v1.executor.ray_utils
import
initialize_ray_cluster
from
._bc_linter
import
bc_linter_include
,
bc_linter_skip
else
:
...
...
vllm/config/parallel.py
View file @
647214f3
...
...
@@ -25,11 +25,11 @@ if TYPE_CHECKING:
from
ray.runtime_env
import
RuntimeEnv
from
ray.util.placement_group
import
PlacementGroup
from
vllm.
executor
.executor
_base
import
Executor
Base
from
vllm.
v1
.executor
import
Executor
else
:
RuntimeEnv
=
Any
PlacementGroup
=
Any
Executor
Base
=
Any
Executor
=
Any
logger
=
init_logger
(
__name__
)
...
...
@@ -189,7 +189,7 @@ class ParallelConfig:
"""ray distributed model workers placement group."""
distributed_executor_backend
:
(
str
|
DistributedExecutorBackend
|
type
[
Executor
Base
]
|
None
str
|
DistributedExecutorBackend
|
type
[
Executor
]
|
None
)
=
None
"""Backend to use for distributed model
workers, either "ray" or "mp" (multiprocessing). If the product
...
...
@@ -511,7 +511,7 @@ class ParallelConfig:
# We use multiprocessing by default if world_size fits on the
# current node and we aren't in a ray placement group.
from
vllm.executor
import
ray_utils
from
vllm.
v1.
executor
import
ray_utils
backend
:
DistributedExecutorBackend
=
"mp"
ray_found
=
ray_utils
.
ray_is_available
()
...
...
@@ -553,6 +553,12 @@ class ParallelConfig:
if
self
.
distributed_executor_backend
is
None
and
self
.
world_size
==
1
:
self
.
distributed_executor_backend
=
"uni"
if
self
.
max_parallel_loading_workers
is
not
None
:
logger
.
warning
(
"max_parallel_loading_workers is currently "
"not supported and will be ignored."
)
@
property
def
use_ray
(
self
)
->
bool
:
return
self
.
distributed_executor_backend
==
"ray"
or
(
...
...
@@ -563,7 +569,7 @@ class ParallelConfig:
@
model_validator
(
mode
=
"after"
)
def
_verify_args
(
self
)
->
Self
:
# Lazy import to avoid circular import
from
vllm.
executor
.executor
_base
import
Executor
Base
from
vllm.
v1
.executor
import
Executor
# Enable batch invariance settings if requested
if
vllm_is_batch_invariant
():
...
...
@@ -574,17 +580,17 @@ class ParallelConfig:
and
not
isinstance
(
self
.
distributed_executor_backend
,
str
)
and
not
(
isinstance
(
self
.
distributed_executor_backend
,
type
)
and
issubclass
(
self
.
distributed_executor_backend
,
Executor
Base
)
and
issubclass
(
self
.
distributed_executor_backend
,
Executor
)
)
):
raise
ValueError
(
"Unrecognized distributed executor backend "
f
"
{
self
.
distributed_executor_backend
}
. Supported "
"values are 'ray', 'mp' 'uni', 'external_launcher', "
" custom Executor
Base
subclass or its import path."
" custom Executor subclass or its import path."
)
if
self
.
use_ray
:
from
vllm.executor
import
ray_utils
from
vllm.
v1.
executor
import
ray_utils
ray_utils
.
assert_ray_available
()
...
...
vllm/config/scheduler.py
View file @
647214f3
...
...
@@ -107,12 +107,6 @@ class SchedulerConfig:
NOTE: This is not currently configurable. It will be overridden by
max_num_batched_tokens in case max multimodal embedding size is larger."""
send_delta_data
:
bool
=
False
"""Private API. If used, scheduler sends delta data to
workers instead of an entire data. It should be enabled only
when SPMD worker architecture is enabled. I.e.,
VLLM_USE_RAY_SPMD_WORKER=1"""
policy
:
SchedulerPolicy
=
"fcfs"
"""The scheduling policy to use:
\n
- "fcfs" means first come first served, i.e. requests are handled in order
...
...
vllm/distributed/device_communicators/tpu_communicator.py
View file @
647214f3
...
...
@@ -31,7 +31,7 @@ if not USE_TPU_INFERENCE:
)
if
USE_RAY
:
from
vllm.executor
import
ray_utils
from
vllm.
v1.
executor
import
ray_utils
class
TpuCommunicator
(
DeviceCommunicatorBase
):
...
...
vllm/engine/arg_utils.py
View file @
647214f3
...
...
@@ -88,12 +88,12 @@ from vllm.utils.network_utils import get_ip
from
vllm.v1.sample.logits_processor
import
LogitsProcessor
if
TYPE_CHECKING
:
from
vllm.executor.executor_base
import
ExecutorBase
from
vllm.model_executor.layers.quantization
import
QuantizationMethods
from
vllm.model_executor.model_loader
import
LoadFormats
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.v1.executor
import
Executor
else
:
Executor
Base
=
Any
Executor
=
Any
QuantizationMethods
=
Any
LoadFormats
=
Any
UsageContext
=
Any
...
...
@@ -369,7 +369,7 @@ class EngineArgs:
# is intended for expert use only. The API may change without
# notice.
distributed_executor_backend
:
(
str
|
DistributedExecutorBackend
|
type
[
Executor
Base
]
|
None
str
|
DistributedExecutorBackend
|
type
[
Executor
]
|
None
)
=
ParallelConfig
.
distributed_executor_backend
# number of P/D disaggregation (or other disaggregation) workers
pipeline_parallel_size
:
int
=
ParallelConfig
.
pipeline_parallel_size
...
...
@@ -1549,7 +1549,6 @@ class EngineArgs:
disable_chunked_mm_input
=
self
.
disable_chunked_mm_input
,
is_multimodal_model
=
model_config
.
is_multimodal_model
,
is_encoder_decoder
=
model_config
.
is_encoder_decoder
,
send_delta_data
=
(
envs
.
VLLM_USE_RAY_SPMD_WORKER
and
parallel_config
.
use_ray
),
policy
=
self
.
scheduling_policy
,
scheduler_cls
=
self
.
scheduler_cls
,
max_num_partial_prefills
=
self
.
max_num_partial_prefills
,
...
...
vllm/entrypoints/cli/serve.py
View file @
647214f3
...
...
@@ -26,7 +26,7 @@ from vllm.utils import (
from
vllm.utils.network_utils
import
get_tcp_uri
from
vllm.v1.engine.core
import
EngineCoreProc
from
vllm.v1.engine.utils
import
CoreEngineProcManager
,
launch_core_engines
from
vllm.v1.executor
.abstract
import
Executor
from
vllm.v1.executor
import
Executor
from
vllm.v1.metrics.prometheus
import
setup_multiprocess_prometheus
from
vllm.v1.utils
import
APIServerProcessManager
,
wait_for_completion_or_failure
...
...
vllm/envs.py
View file @
647214f3
...
...
@@ -56,8 +56,6 @@ if TYPE_CHECKING:
VLLM_XLA_CHECK_RECOMPILATION
:
bool
=
False
VLLM_FUSED_MOE_CHUNK_SIZE
:
int
=
64
*
1024
VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING
:
bool
=
True
VLLM_USE_RAY_SPMD_WORKER
:
bool
=
False
VLLM_USE_RAY_COMPILED_DAG
:
bool
=
False
VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE
:
Literal
[
"auto"
,
"nccl"
,
"shm"
]
=
"auto"
VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM
:
bool
=
False
VLLM_USE_RAY_WRAPPED_PP_COMM
:
bool
=
True
...
...
@@ -623,22 +621,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_CPU_MOE_PREPACK"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_CPU_MOE_PREPACK"
,
"1"
))),
# (CPU backend only) whether to use SGL kernels, optimized for small batch.
"VLLM_CPU_SGL_KERNEL"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_CPU_SGL_KERNEL"
,
"0"
))),
# If the env var is set, then all workers will execute as separate
# processes from the engine, and we use the same mechanism to trigger
# execution on all workers.
# Run vLLM with VLLM_USE_RAY_SPMD_WORKER=1 to enable it.
"VLLM_USE_RAY_SPMD_WORKER"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_USE_RAY_SPMD_WORKER"
,
"0"
))
),
# If the env var is set, it uses the Ray's Compiled Graph
# (previously known as ADAG) API which optimizes the
# control plane overhead.
# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
# Note that this variable is set to 1 in V1 by default
# when ray distributed executor is used.
"VLLM_USE_RAY_COMPILED_DAG"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_USE_RAY_COMPILED_DAG"
,
"0"
))
),
# If the env var is set, Ray Compiled Graph uses the specified
# channel type to communicate between workers belonging to
# different pipeline-parallel stages.
...
...
@@ -646,20 +628,17 @@ environment_variables: dict[str, Callable[[], Any]] = {
# - "auto": use the default channel type
# - "nccl": use NCCL for communication
# - "shm": use shared memory and gRPC for communication
# This flag is ignored if VLLM_USE_RAY_COMPILED_DAG is not set.
"VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE"
:
env_with_choices
(
"VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE"
,
"auto"
,
[
"auto"
,
"nccl"
,
"shm"
]
),
# If the env var is set, it enables GPU communication overlap
# (experimental feature) in Ray's Compiled Graph. This flag is ignored if
# VLLM_USE_RAY_COMPILED_DAG is not set.
# (experimental feature) in Ray's Compiled Graph.
"VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM"
,
"0"
))
),
# If the env var is set, it uses a Ray Communicator wrapping
# vLLM's pipeline parallelism communicator to interact with Ray's
# Compiled Graph. Otherwise, it uses Ray's NCCL communicator.
# This flag is ignored if VLLM_USE_RAY_COMPILED_DAG is not set.
"VLLM_USE_RAY_WRAPPED_PP_COMM"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_USE_RAY_WRAPPED_PP_COMM"
,
"1"
))
),
...
...
vllm/executor/__init__.py
deleted
100644 → 0
View file @
ddeec11b
vllm/executor/executor_base.py
deleted
100644 → 0
View file @
ddeec11b
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
time
from
abc
import
ABC
,
abstractmethod
from
collections.abc
import
Awaitable
,
Callable
from
functools
import
cached_property
from
typing
import
Any
from
typing_extensions
import
TypeVar
import
vllm.platforms
from
vllm.config
import
VllmConfig
from
vllm.distributed.kv_transfer.kv_connector.utils
import
KVOutputAggregator
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.sequence
import
ExecuteModelRequest
from
vllm.tasks
import
SupportedTask
from
vllm.utils.async_utils
import
make_async
from
vllm.v1.outputs
import
SamplerOutput
from
vllm.v1.worker.worker_base
import
WorkerBase
logger
=
init_logger
(
__name__
)
_R
=
TypeVar
(
"_R"
,
default
=
Any
)
class
ExecutorBase
(
ABC
):
"""Base class for all executors.
An executor is responsible for executing the model on one device,
or it can be a distributed executor
that can execute the model on multiple devices.
"""
uses_ray
:
bool
# whether the executor uses Ray for orchestration.
supports_pp
:
bool
=
False
# whether the executor supports PP
def
__init__
(
self
,
vllm_config
:
VllmConfig
,
)
->
None
:
self
.
vllm_config
=
vllm_config
self
.
model_config
=
vllm_config
.
model_config
self
.
cache_config
=
vllm_config
.
cache_config
self
.
lora_config
=
vllm_config
.
lora_config
self
.
load_config
=
vllm_config
.
load_config
self
.
parallel_config
=
vllm_config
.
parallel_config
self
.
scheduler_config
=
vllm_config
.
scheduler_config
self
.
device_config
=
vllm_config
.
device_config
self
.
speculative_config
=
vllm_config
.
speculative_config
self
.
observability_config
=
vllm_config
.
observability_config
self
.
_init_executor
()
self
.
is_sleeping
=
False
self
.
sleeping_tags
:
set
[
str
]
=
set
()
self
.
kv_output_aggregator
:
KVOutputAggregator
|
None
=
None
@
abstractmethod
def
_init_executor
(
self
)
->
None
:
raise
NotImplementedError
@
abstractmethod
def
collective_rpc
(
self
,
method
:
str
|
Callable
[[
WorkerBase
],
_R
],
timeout
:
float
|
None
=
None
,
args
:
tuple
=
(),
kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
list
[
_R
]:
"""
Execute an RPC call on all workers.
Args:
method: Name of the worker method to execute, or a callable that
is serialized and sent to all workers to execute.
If the method is a callable, it should accept an additional
`self` argument, in addition to the arguments passed in `args`
and `kwargs`. The `self` argument will be the worker object.
timeout: Maximum time in seconds to wait for execution. Raises a
[`TimeoutError`][] on timeout. `None` means wait indefinitely.
args: Positional arguments to pass to the worker method.
kwargs: Keyword arguments to pass to the worker method.
Returns:
A list containing the results from each worker.
Note:
It is recommended to use this API to only pass control messages,
and set up data-plane communication to pass data.
"""
raise
NotImplementedError
def
determine_num_available_blocks
(
self
)
->
tuple
[
int
,
int
]:
"""Determine the number of available blocks for the GPU KV cache and
swappable CPU KV cache.
Normally, this should simply delegate to the underlying Worker. Some
ExecutorBase may require modification of the result, e.g. to ensure the
selected cache sizes are compatible with all workers.
Returns a tuple `(num_gpu_blocks, num_cpu_blocks)`, where
`num_gpu_blocks` are blocks that are "active" on the device and can be
appended to.
`num_cpu_blocks` refers to "swapped" blocks in CPU memory and cannot be
appended to.
"""
results
=
self
.
collective_rpc
(
"determine_num_available_blocks"
)
a
=
min
([
r
[
0
]
for
r
in
results
])
b
=
min
([
r
[
1
]
for
r
in
results
])
return
a
,
b
def
initialize_cache
(
self
,
num_gpu_blocks
:
int
,
num_cpu_blocks
)
->
None
:
"""Initialize the KV cache by invoking the underlying worker."""
# NOTE: This is logged in the executor because there can be >1 workers.
logger
.
info
(
"# %s blocks: %d, # CPU blocks: %d"
,
vllm
.
platforms
.
current_platform
.
device_name
,
num_gpu_blocks
,
num_cpu_blocks
,
)
max_concurrency
=
(
num_gpu_blocks
*
self
.
cache_config
.
block_size
/
self
.
model_config
.
max_model_len
)
logger
.
info
(
"Maximum concurrency for %s tokens per request: %.2fx"
,
self
.
model_config
.
max_model_len
,
max_concurrency
,
)
self
.
cache_config
.
num_gpu_blocks
=
num_gpu_blocks
self
.
cache_config
.
num_cpu_blocks
=
num_cpu_blocks
self
.
collective_rpc
(
"initialize_cache"
,
args
=
(
num_gpu_blocks
,
num_cpu_blocks
))
@
cached_property
# Avoid unnecessary RPC calls
def
supported_tasks
(
self
)
->
tuple
[
SupportedTask
,
...]:
output
=
self
.
collective_rpc
(
"get_supported_tasks"
)
return
output
[
0
]
def
execute_model
(
self
,
execute_model_req
:
ExecuteModelRequest
)
->
list
[
SamplerOutput
]:
output
=
self
.
collective_rpc
(
"execute_model"
,
args
=
(
execute_model_req
,))
assert
output
[
0
]
is
not
None
return
output
[
0
]
def
stop_remote_worker_execution_loop
(
self
)
->
None
:
"""Releases parallel workers from model loop."""
return
def
add_lora
(
self
,
lora_request
:
LoRARequest
)
->
bool
:
assert
lora_request
.
lora_int_id
>
0
,
"lora_id must be greater than 0."
return
all
(
self
.
collective_rpc
(
"add_lora"
,
args
=
(
lora_request
,)))
def
remove_lora
(
self
,
lora_id
:
int
)
->
bool
:
assert
lora_id
>
0
,
"lora_id must be greater than 0."
return
all
(
self
.
collective_rpc
(
"remove_lora"
,
args
=
(
lora_id
,)))
def
pin_lora
(
self
,
lora_id
:
int
)
->
bool
:
assert
lora_id
>
0
,
"lora_id must be greater than 0."
return
all
(
self
.
collective_rpc
(
"pin_lora"
,
args
=
(
lora_id
,)))
def
list_loras
(
self
)
->
set
[
int
]:
sets
=
self
.
collective_rpc
(
"list_loras"
)
for
s
in
sets
:
assert
s
==
sets
[
0
],
"All workers should have the same LORAs."
return
sets
[
0
]
def
reset_mm_cache
(
self
)
->
None
:
"""Reset the multi-modal cache in each worker."""
self
.
collective_rpc
(
"reset_mm_cache"
)
def
start_profile
(
self
)
->
None
:
self
.
collective_rpc
(
"start_profile"
)
def
stop_profile
(
self
)
->
None
:
self
.
collective_rpc
(
"stop_profile"
)
def
sleep
(
self
,
level
:
int
=
1
):
if
self
.
is_sleeping
:
logger
.
warning
(
"Executor is already sleeping."
)
return
time_before_sleep
=
time
.
perf_counter
()
self
.
collective_rpc
(
"sleep"
,
kwargs
=
dict
(
level
=
level
))
time_after_sleep
=
time
.
perf_counter
()
self
.
sleeping_tags
=
{
"weights"
,
"kv_cache"
}
self
.
is_sleeping
=
True
logger
.
info
(
"It took %.6f seconds to fall asleep."
,
time_after_sleep
-
time_before_sleep
)
def
wake_up
(
self
,
tags
:
list
[
str
]
|
None
=
None
):
if
not
self
.
is_sleeping
:
logger
.
warning
(
"Executor is not sleeping."
)
return
if
tags
:
for
tag
in
tags
:
if
tag
not
in
self
.
sleeping_tags
:
logger
.
warning
(
"Tag %s is not in sleeping tags %s"
,
tag
,
self
.
sleeping_tags
)
return
time_before_wakeup
=
time
.
perf_counter
()
self
.
collective_rpc
(
"wake_up"
,
kwargs
=
dict
(
tags
=
tags
))
time_after_wakeup
=
time
.
perf_counter
()
logger
.
info
(
"It took %.6f seconds to wake up tags %s."
,
time_after_wakeup
-
time_before_wakeup
,
tags
if
tags
is
not
None
else
self
.
sleeping_tags
,
)
if
tags
:
for
tag
in
tags
:
self
.
sleeping_tags
.
remove
(
tag
)
else
:
self
.
sleeping_tags
.
clear
()
if
not
self
.
sleeping_tags
:
self
.
is_sleeping
=
False
def
save_sharded_state
(
self
,
path
:
str
,
pattern
:
str
|
None
=
None
,
max_size
:
int
|
None
=
None
,
)
->
None
:
self
.
collective_rpc
(
"save_sharded_state"
,
kwargs
=
dict
(
path
=
path
,
pattern
=
pattern
,
max_size
=
max_size
),
)
@
abstractmethod
def
check_health
(
self
)
->
None
:
"""Checks if the executor is healthy. If not, it should raise an
exception."""
raise
NotImplementedError
def
shutdown
(
self
)
->
None
:
"""Shutdown the executor."""
self
.
collective_rpc
(
"shutdown"
)
async
def
execute_model_async
(
self
,
execute_model_req
:
ExecuteModelRequest
)
->
list
[
SamplerOutput
]:
"""Executes one model step on the given sequences."""
output
=
await
make_async
(
self
.
execute_model
)(
execute_model_req
)
return
output
async
def
stop_remote_worker_execution_loop_async
(
self
)
->
None
:
"""Releases parallel workers from model loop."""
return
async
def
check_health_async
(
self
)
->
None
:
"""Checks if the executor is healthy. If not, it should raise an
exception."""
self
.
check_health
()
def
init_kv_output_aggregator
(
self
,
finished_count
:
int
|
None
)
->
None
:
"""Init KVOutputAggregator"""
self
.
kv_output_aggregator
=
KVOutputAggregator
(
finished_count
or
self
.
parallel_config
.
world_size
)
class
DistributedExecutorBase
(
ExecutorBase
):
"""Abstract superclass of distributed executor implementations."""
def
__init__
(
self
,
*
args
,
**
kwargs
):
# This is non-None when the execute model loop is running
# in the parallel workers. It's a coroutine in the AsyncLLMEngine case.
self
.
parallel_worker_tasks
:
Any
|
Awaitable
[
Any
]
|
None
=
None
super
().
__init__
(
*
args
,
**
kwargs
)
def
execute_model
(
self
,
execute_model_req
:
ExecuteModelRequest
,
)
->
list
[
SamplerOutput
]:
# TODO: unify into collective_rpc
if
self
.
parallel_worker_tasks
is
None
:
self
.
parallel_worker_tasks
=
self
.
_run_workers
(
"start_worker_execution_loop"
,
async_run_tensor_parallel_workers_only
=
True
,
)
# Only the driver worker returns the sampling results.
driver_outputs
=
self
.
_driver_execute_model
(
execute_model_req
)
assert
driver_outputs
is
not
None
return
driver_outputs
def
stop_remote_worker_execution_loop
(
self
)
->
None
:
if
self
.
parallel_worker_tasks
is
None
:
return
self
.
_driver_execute_model
(
execute_model_req
=
None
)
parallel_worker_tasks
=
self
.
parallel_worker_tasks
self
.
parallel_worker_tasks
=
None
# Ensure that workers exit model loop cleanly
# (this will raise otherwise)
self
.
_wait_for_tasks_completion
(
parallel_worker_tasks
)
@
abstractmethod
def
_driver_execute_model
(
self
,
execute_model_req
:
ExecuteModelRequest
|
None
)
->
list
[
SamplerOutput
]
|
None
:
"""Run execute_model in the driver worker.
Passing None will cause the driver to stop the model execution loop
running in each of the remote workers. In this case, this method
returns None. Otherwise, this method returns the model output.
"""
raise
NotImplementedError
def
collective_rpc
(
self
,
method
:
str
|
Callable
,
timeout
:
float
|
None
=
None
,
args
:
tuple
=
(),
kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
list
[
Any
]:
return
self
.
_run_workers
(
method
,
*
args
,
**
(
kwargs
or
{}))
@
abstractmethod
def
_run_workers
(
self
,
method
:
str
|
Callable
,
*
args
,
async_run_tensor_parallel_workers_only
:
bool
=
False
,
max_concurrent_workers
:
int
|
None
=
None
,
**
kwargs
,
)
->
Any
:
"""Runs the given method on all workers.
Args:
async_run_tensor_parallel_workers_only: If True the method will be
run only in the remote TP workers, not the driver worker.
It will also be run asynchronously and return a list of futures
rather than blocking on the results.
# TODO: simplify and merge with collective_rpc
"""
raise
NotImplementedError
@
abstractmethod
def
_wait_for_tasks_completion
(
self
,
parallel_worker_tasks
:
Any
)
->
None
:
"""Wait for futures returned from _run_workers() with
async_run_remote_workers_only to complete."""
raise
NotImplementedError
async
def
execute_model_async
(
self
,
execute_model_req
:
ExecuteModelRequest
)
->
list
[
SamplerOutput
]:
if
self
.
parallel_worker_tasks
is
None
:
# Start model execution loop running in the parallel workers
self
.
parallel_worker_tasks
=
asyncio
.
create_task
(
self
.
_start_worker_execution_loop
()
)
# Only the driver worker returns the sampling results.
return
await
self
.
_driver_execute_model_async
(
execute_model_req
)
async
def
stop_remote_worker_execution_loop_async
(
self
)
->
None
:
if
self
.
parallel_worker_tasks
is
None
:
return
await
self
.
_driver_execute_model_async
()
parallel_worker_tasks
=
self
.
parallel_worker_tasks
self
.
parallel_worker_tasks
=
None
# Ensure that workers exit model loop cleanly
# (this will raise otherwise)
await
parallel_worker_tasks
@
abstractmethod
async
def
_driver_execute_model_async
(
self
,
execute_model_req
:
ExecuteModelRequest
|
None
=
None
,
)
->
list
[
SamplerOutput
]:
"""Execute the model asynchronously in the driver worker.
Passing None will cause the driver to stop the model execution
loop running in each of the remote workers.
"""
raise
NotImplementedError
@
abstractmethod
async
def
_start_worker_execution_loop
(
self
):
"""Run execution loop on all workers. It guarantees all workers run
the loop or None of them is running the loop. Loop can be stopped by
`stop_remote_worker_execution_loop`.
The API is idempotent (guarantee only 1 loop run at any moment)."""
raise
NotImplementedError
vllm/executor/msgspec_utils.py
deleted
100644 → 0
View file @
ddeec11b
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
array
import
array
from
typing
import
Any
from
vllm.multimodal.inputs
import
MultiModalKwargs
from
vllm.sequence
import
VLLM_TOKEN_ID_ARRAY_TYPE
def
encode_hook
(
obj
:
Any
)
->
Any
:
"""Custom msgspec enc hook that supports array types and MultiModalKwargs.
See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder
"""
if
isinstance
(
obj
,
array
):
assert
obj
.
typecode
==
VLLM_TOKEN_ID_ARRAY_TYPE
,
(
f
"vLLM array type should use '
{
VLLM_TOKEN_ID_ARRAY_TYPE
}
' type. "
f
"Given array has a type code of
{
obj
.
typecode
}
."
)
return
obj
.
tobytes
()
if
isinstance
(
obj
,
MultiModalKwargs
):
return
dict
(
obj
)
def
decode_hook
(
type
:
type
,
obj
:
Any
)
->
Any
:
"""Custom msgspec dec hook that supports array types and MultiModalKwargs.
See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder
"""
if
type
is
array
:
deserialized
=
array
(
VLLM_TOKEN_ID_ARRAY_TYPE
)
deserialized
.
frombytes
(
obj
)
return
deserialized
if
type
is
MultiModalKwargs
:
return
MultiModalKwargs
(
obj
)
vllm/sequence.py
View file @
647214f3
...
...
@@ -5,7 +5,6 @@
from
dataclasses
import
dataclass
from
typing
import
TYPE_CHECKING
,
Any
import
msgspec
import
torch
if
TYPE_CHECKING
:
...
...
@@ -92,12 +91,3 @@ class IntermediateTensors:
def
__repr__
(
self
)
->
str
:
return
f
"IntermediateTensors(tensors=
{
self
.
tensors
}
)"
class
ExecuteModelRequest
(
msgspec
.
Struct
,
array_like
=
True
,
# type: ignore[call-arg]
omit_defaults
=
True
,
):
# type: ignore[call-arg]
# Placeholder. Remove.
pass
vllm/transformers_utils/config.py
View file @
647214f3
...
...
@@ -943,7 +943,7 @@ def maybe_register_config_serialize_by_value() -> None:
cloudpickle
.
register_pickle_by_value
(
transformers_modules
)
# ray vendors its own version of cloudpickle
from
vllm.executor.ray_utils
import
ray
from
vllm.
v1.
executor.ray_utils
import
ray
if
ray
:
ray
.
cloudpickle
.
register_pickle_by_value
(
transformers_modules
)
...
...
vllm/v1/engine/async_llm.py
View file @
647214f3
...
...
@@ -39,7 +39,7 @@ from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
from
vllm.v1.engine.output_processor
import
OutputProcessor
,
RequestOutputCollector
from
vllm.v1.engine.parallel_sampling
import
ParentRequest
from
vllm.v1.engine.processor
import
Processor
from
vllm.v1.executor
.abstract
import
Executor
from
vllm.v1.executor
import
Executor
from
vllm.v1.metrics.loggers
import
(
StatLoggerFactory
,
StatLoggerManager
,
...
...
vllm/v1/engine/core.py
View file @
647214f3
...
...
@@ -60,7 +60,7 @@ from vllm.v1.engine.utils import (
EngineZmqAddresses
,
get_device_indices
,
)
from
vllm.v1.executor
.abstract
import
Executor
from
vllm.v1.executor
import
Executor
from
vllm.v1.kv_cache_interface
import
KVCacheConfig
from
vllm.v1.metrics.stats
import
SchedulerStats
from
vllm.v1.outputs
import
ModelRunnerOutput
...
...
@@ -322,7 +322,6 @@ class EngineCore:
with
self
.
log_error_detail
(
scheduler_output
):
model_output
=
self
.
model_executor
.
execute_model
(
scheduler_output
)
assert
isinstance
(
model_output
,
ModelRunnerOutput
)
engine_core_outputs
=
self
.
scheduler
.
update_from_output
(
scheduler_output
,
model_output
)
...
...
@@ -364,7 +363,7 @@ class EngineCore:
if
self
.
scheduler
.
has_requests
():
scheduler_output
=
self
.
scheduler
.
schedule
()
future
=
self
.
model_executor
.
execute_model
(
scheduler_output
,
non_block
=
True
)
batch_queue
.
appendleft
((
future
,
scheduler_output
))
# type: ignore[arg-type]
batch_queue
.
appendleft
((
future
,
scheduler_output
))
model_executed
=
scheduler_output
.
total_num_scheduled_tokens
>
0
if
(
...
...
@@ -463,14 +462,6 @@ class EngineCore:
)
->
list
[
_R
]:
return
self
.
model_executor
.
collective_rpc
(
method
,
timeout
,
args
,
kwargs
)
def
save_tensorized_model
(
self
,
tensorizer_config
,
)
->
None
:
self
.
model_executor
.
save_tensorized_model
(
tensorizer_config
=
tensorizer_config
,
)
def
preprocess_add_request
(
self
,
request
:
EngineCoreRequest
)
->
tuple
[
Request
,
int
]:
"""Preprocess the request.
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment