[V0 Deprecation] Remove V0 executors (#27142)

Signed-off-by: Nick Hill <nhill@redhat.com>

[V0 Deprecation] Remove V0 executors (#27142)
Signed-off-by: Nick Hill <nhill@redhat.com>
647214f3 · Nick Hill · GitHub · ddeec11b · 647214f3 · 647214f3
Unverified commit 647214f3, authored Oct 21, 2025 by Nick Hill; committed by GitHub on Oct 21, 2025.
20 changed files
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -157,11 +157,9 @@ def test_models_distributed(
            and distributed_executor_backend == "ray"
            and attention_backend == ""
            and test_suite == "L4"
+            and enable_prompt_embeds
        ):  # noqa
-            if enable_prompt_embeds:
-                pytest.skip("enable_prompt_embeds does not work with ray compiled dag.")
-            monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
-            monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
+            pytest.skip("enable_prompt_embeds does not work with ray compiled dag.")

        if attention_backend:
            monkeypatch_context.setenv(

--- a/tests/distributed/test_multi_node_assignment.py
+++ b/tests/distributed/test_multi_node_assignment.py
@@ -18,8 +18,8 @@ from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

 from vllm import initialize_ray_cluster
 from vllm.config import ParallelConfig
-from vllm.executor.ray_utils import _wait_until_pg_removed
 from vllm.utils.network_utils import get_ip
+from vllm.v1.executor.ray_utils import _wait_until_pg_removed

 VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"


--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -305,10 +305,8 @@ def _compare_tp(
        common_args.extend(["--max-num-seqs", f"{max_num_seqs}"])

    if distributed_backend == "ray":
-        # For V1, test Ray Compiled Graph for all the tests
+        # Test Ray Compiled Graph for all the tests
        pp_env = {
-            "VLLM_USE_RAY_COMPILED_DAG": "1",
-            "VLLM_USE_RAY_SPMD_WORKER": "1",
            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
        }
        # Temporary. Currently when zeromq + SPMD is used, it does not properly

--- a/tests/model_executor/model_loader/tensorizer_loader/conftest.py
+++ b/tests/model_executor/model_loader/tensorizer_loader/conftest.py
@@ -9,7 +9,7 @@ from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.model_executor.model_loader import tensorizer as tensorizer_mod
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 from vllm.utils.network_utils import get_distributed_init_method, get_ip, get_open_port
-from vllm.v1.executor.abstract import UniProcExecutor
+from vllm.v1.executor import UniProcExecutor
 from vllm.v1.worker.worker_base import WorkerWrapperBase

 MODEL_REF = "facebook/opt-125m"

--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -15,7 +15,8 @@ from vllm.platforms import current_platform
 from vllm.utils.torch_utils import set_default_torch_num_threads
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.core import EngineCore
-from vllm.v1.executor.abstract import Executor, UniProcExecutor
+from vllm.v1.executor.abstract import Executor
+from vllm.v1.executor.uniproc_executor import UniProcExecutor
 from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.outputs import ModelRunnerOutput


--- a/tools/pre_commit/check_pickle_imports.py
+++ b/tools/pre_commit/check_pickle_imports.py
@@ -17,8 +17,6 @@ import regex as re
 #  add to this list if absolutely necessary and after careful security review.
 ALLOWED_FILES = {
    # pickle
-    "vllm/v1/serial_utils.py",
-    "vllm/v1/executor/multiproc_executor.py",
    "vllm/multimodal/hasher.py",
    "vllm/transformers_utils/config.py",
    "vllm/model_executor/models/registry.py",
@@ -38,11 +36,13 @@ ALLOWED_FILES = {
    "benchmarks/cutlass_benchmarks/w8a8_benchmarks.py",
    "benchmarks/cutlass_benchmarks/sparse_benchmarks.py",
    # cloudpickle
-    "vllm/executor/mp_distributed_executor.py",
-    "vllm/executor/ray_distributed_executor.py",
+    "vllm/v1/executor/multiproc_executor.py",
+    "vllm/v1/executor/ray_executor.py",
    "vllm/entrypoints/llm.py",
    "vllm/utils/__init__.py",
    "tests/utils.py",
+    # pickle and cloudpickle
+    "vllm/v1/serial_utils.py",
 }

 PICKLE_RE = re.compile(

--- a/vllm/__init__.py
+++ b/vllm/__init__.py
@@ -21,7 +21,7 @@ MODULE_ATTRS = {
    "AsyncLLMEngine": ".engine.async_llm_engine:AsyncLLMEngine",
    "LLMEngine": ".engine.llm_engine:LLMEngine",
    "LLM": ".entrypoints.llm:LLM",
-    "initialize_ray_cluster": ".executor.ray_utils:initialize_ray_cluster",
+    "initialize_ray_cluster": ".v1.executor.ray_utils:initialize_ray_cluster",
    "PromptType": ".inputs:PromptType",
    "TextPrompt": ".inputs:TextPrompt",
    "TokensPrompt": ".inputs:TokensPrompt",
@@ -45,7 +45,6 @@ if typing.TYPE_CHECKING:
    from vllm.engine.async_llm_engine import AsyncLLMEngine
    from vllm.engine.llm_engine import LLMEngine
    from vllm.entrypoints.llm import LLM
-    from vllm.executor.ray_utils import initialize_ray_cluster
    from vllm.inputs import PromptType, TextPrompt, TokensPrompt
    from vllm.model_executor.models import ModelRegistry
    from vllm.outputs import (
@@ -62,6 +61,7 @@ if typing.TYPE_CHECKING:
    )
    from vllm.pooling_params import PoolingParams
    from vllm.sampling_params import SamplingParams
+    from vllm.v1.executor.ray_utils import initialize_ray_cluster

    from ._bc_linter import bc_linter_include, bc_linter_skip
 else:

--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -25,11 +25,11 @@ if TYPE_CHECKING:
    from ray.runtime_env import RuntimeEnv
    from ray.util.placement_group import PlacementGroup

-    from vllm.executor.executor_base import ExecutorBase
+    from vllm.v1.executor import Executor
 else:
    RuntimeEnv = Any
    PlacementGroup = Any
-    ExecutorBase = Any
+    Executor = Any

 logger = init_logger(__name__)

@@ -189,7 +189,7 @@ class ParallelConfig:
    """ray distributed model workers placement group."""

    distributed_executor_backend: (
-        str | DistributedExecutorBackend | type[ExecutorBase] | None
+        str | DistributedExecutorBackend | type[Executor] | None
    ) = None
    """Backend to use for distributed model
    workers, either "ray" or "mp" (multiprocessing). If the product
@@ -511,7 +511,7 @@ class ParallelConfig:
            # We use multiprocessing by default if world_size fits on the
            # current node and we aren't in a ray placement group.

-            from vllm.executor import ray_utils
+            from vllm.v1.executor import ray_utils

            backend: DistributedExecutorBackend = "mp"
            ray_found = ray_utils.ray_is_available()
@@ -553,6 +553,12 @@ class ParallelConfig:
        if self.distributed_executor_backend is None and self.world_size == 1:
            self.distributed_executor_backend = "uni"

+        if self.max_parallel_loading_workers is not None:
+            logger.warning(
+                "max_parallel_loading_workers is currently "
+                "not supported and will be ignored."
+            )
+
    @property
    def use_ray(self) -> bool:
        return self.distributed_executor_backend == "ray" or (
@@ -563,7 +569,7 @@ class ParallelConfig:
    @model_validator(mode="after")
    def _verify_args(self) -> Self:
        # Lazy import to avoid circular import
-        from vllm.executor.executor_base import ExecutorBase
+        from vllm.v1.executor import Executor

        # Enable batch invariance settings if requested
        if vllm_is_batch_invariant():
@@ -574,17 +580,17 @@ class ParallelConfig:
            and not isinstance(self.distributed_executor_backend, str)
            and not (
                isinstance(self.distributed_executor_backend, type)
-                and issubclass(self.distributed_executor_backend, ExecutorBase)
+                and issubclass(self.distributed_executor_backend, Executor)
            )
        ):
            raise ValueError(
                "Unrecognized distributed executor backend "
                f"{self.distributed_executor_backend}. Supported "
                "values are 'ray', 'mp' 'uni', 'external_launcher', "
-                " custom ExecutorBase subclass or its import path."
+                " custom Executor subclass or its import path."
            )
        if self.use_ray:
-            from vllm.executor import ray_utils
+            from vllm.v1.executor import ray_utils

            ray_utils.assert_ray_available()


--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -107,12 +107,6 @@ class SchedulerConfig:
    NOTE: This is not currently configurable. It will be overridden by
    max_num_batched_tokens in case max multimodal embedding size is larger."""

-    send_delta_data: bool = False
-    """Private API. If used, scheduler sends delta data to
-    workers instead of an entire data. It should be enabled only
-    when SPMD worker architecture is enabled. I.e.,
-    VLLM_USE_RAY_SPMD_WORKER=1"""
-
    policy: SchedulerPolicy = "fcfs"
    """The scheduling policy to use:\n
    - "fcfs" means first come first served, i.e. requests are handled in order

--- a/vllm/distributed/device_communicators/tpu_communicator.py
+++ b/vllm/distributed/device_communicators/tpu_communicator.py
@@ -31,7 +31,7 @@ if not USE_TPU_INFERENCE:
        )

        if USE_RAY:
-            from vllm.executor import ray_utils
+            from vllm.v1.executor import ray_utils


 class TpuCommunicator(DeviceCommunicatorBase):

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -88,12 +88,12 @@ from vllm.utils.network_utils import get_ip
 from vllm.v1.sample.logits_processor import LogitsProcessor

 if TYPE_CHECKING:
-    from vllm.executor.executor_base import ExecutorBase
    from vllm.model_executor.layers.quantization import QuantizationMethods
    from vllm.model_executor.model_loader import LoadFormats
    from vllm.usage.usage_lib import UsageContext
+    from vllm.v1.executor import Executor
 else:
-    ExecutorBase = Any
+    Executor = Any
    QuantizationMethods = Any
    LoadFormats = Any
    UsageContext = Any
@@ -369,7 +369,7 @@ class EngineArgs:
    # is intended for expert use only. The API may change without
    # notice.
    distributed_executor_backend: (
-        str | DistributedExecutorBackend | type[ExecutorBase] | None
+        str | DistributedExecutorBackend | type[Executor] | None
    ) = ParallelConfig.distributed_executor_backend
    # number of P/D disaggregation (or other disaggregation) workers
    pipeline_parallel_size: int = ParallelConfig.pipeline_parallel_size
@@ -1549,7 +1549,6 @@ class EngineArgs:
            disable_chunked_mm_input=self.disable_chunked_mm_input,
            is_multimodal_model=model_config.is_multimodal_model,
            is_encoder_decoder=model_config.is_encoder_decoder,
-            send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER and parallel_config.use_ray),
            policy=self.scheduling_policy,
            scheduler_cls=self.scheduler_cls,
            max_num_partial_prefills=self.max_num_partial_prefills,

--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@@ -26,7 +26,7 @@ from vllm.utils import (
 from vllm.utils.network_utils import get_tcp_uri
 from vllm.v1.engine.core import EngineCoreProc
 from vllm.v1.engine.utils import CoreEngineProcManager, launch_core_engines
-from vllm.v1.executor.abstract import Executor
+from vllm.v1.executor import Executor
 from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus
 from vllm.v1.utils import APIServerProcessManager, wait_for_completion_or_failure


--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -56,8 +56,6 @@ if TYPE_CHECKING:
    VLLM_XLA_CHECK_RECOMPILATION: bool = False
    VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024
    VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING: bool = True
-    VLLM_USE_RAY_SPMD_WORKER: bool = False
-    VLLM_USE_RAY_COMPILED_DAG: bool = False
    VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: Literal["auto", "nccl", "shm"] = "auto"
    VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
    VLLM_USE_RAY_WRAPPED_PP_COMM: bool = True
@@ -623,22 +621,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_CPU_MOE_PREPACK": lambda: bool(int(os.getenv("VLLM_CPU_MOE_PREPACK", "1"))),
    # (CPU backend only) whether to use SGL kernels, optimized for small batch.
    "VLLM_CPU_SGL_KERNEL": lambda: bool(int(os.getenv("VLLM_CPU_SGL_KERNEL", "0"))),
-    # If the env var is set, then all workers will execute as separate
-    # processes from the engine, and we use the same mechanism to trigger
-    # execution on all workers.
-    # Run vLLM with VLLM_USE_RAY_SPMD_WORKER=1 to enable it.
-    "VLLM_USE_RAY_SPMD_WORKER": lambda: bool(
-        int(os.getenv("VLLM_USE_RAY_SPMD_WORKER", "0"))
-    ),
-    # If the env var is set, it uses the Ray's Compiled Graph
-    # (previously known as ADAG) API which optimizes the
-    # control plane overhead.
-    # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
-    # Note that this variable is set to 1 in V1 by default
-    # when ray distributed executor is used.
-    "VLLM_USE_RAY_COMPILED_DAG": lambda: bool(
-        int(os.getenv("VLLM_USE_RAY_COMPILED_DAG", "0"))
-    ),
    # If the env var is set, Ray Compiled Graph uses the specified
    # channel type to communicate between workers belonging to
    # different pipeline-parallel stages.
@@ -646,20 +628,17 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # - "auto": use the default channel type
    # - "nccl": use NCCL for communication
    # - "shm": use shared memory and gRPC for communication
-    # This flag is ignored if VLLM_USE_RAY_COMPILED_DAG is not set.
    "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE": env_with_choices(
        "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "auto", ["auto", "nccl", "shm"]
    ),
    # If the env var is set, it enables GPU communication overlap
-    # (experimental feature) in Ray's Compiled Graph. This flag is ignored if
-    # VLLM_USE_RAY_COMPILED_DAG is not set.
+    # (experimental feature) in Ray's Compiled Graph.
    "VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM": lambda: bool(
        int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "0"))
    ),
    # If the env var is set, it uses a Ray Communicator wrapping
    # vLLM's pipeline parallelism communicator to interact with Ray's
    # Compiled Graph. Otherwise, it uses Ray's NCCL communicator.
-    # This flag is ignored if VLLM_USE_RAY_COMPILED_DAG is not set.
    "VLLM_USE_RAY_WRAPPED_PP_COMM": lambda: bool(
        int(os.getenv("VLLM_USE_RAY_WRAPPED_PP_COMM", "1"))
    ),

--- a/vllm/executor/__init__.py
+++ b/vllm/executor/__init__.py
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import asyncio
-import time
-from abc import ABC, abstractmethod
-from collections.abc import Awaitable, Callable
-from functools import cached_property
-from typing import Any
-
-from typing_extensions import TypeVar
-
-import vllm.platforms
-from vllm.config import VllmConfig
-from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
-from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.sequence import ExecuteModelRequest
-from vllm.tasks import SupportedTask
-from vllm.utils.async_utils import make_async
-from vllm.v1.outputs import SamplerOutput
-from vllm.v1.worker.worker_base import WorkerBase
-
-logger = init_logger(__name__)
-
-_R = TypeVar("_R", default=Any)
-
-
-class ExecutorBase(ABC):
-    """Base class for all executors.
-
-    An executor is responsible for executing the model on one device,
-    or it can be a distributed executor
-    that can execute the model on multiple devices.
-    """
-
-    uses_ray: bool  # whether the executor uses Ray for orchestration.
-    supports_pp: bool = False  # whether the executor supports PP
-
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-    ) -> None:
-        self.vllm_config = vllm_config
-        self.model_config = vllm_config.model_config
-        self.cache_config = vllm_config.cache_config
-        self.lora_config = vllm_config.lora_config
-        self.load_config = vllm_config.load_config
-        self.parallel_config = vllm_config.parallel_config
-        self.scheduler_config = vllm_config.scheduler_config
-        self.device_config = vllm_config.device_config
-        self.speculative_config = vllm_config.speculative_config
-        self.observability_config = vllm_config.observability_config
-        self._init_executor()
-        self.is_sleeping = False
-        self.sleeping_tags: set[str] = set()
-        self.kv_output_aggregator: KVOutputAggregator | None = None
-
-    @abstractmethod
-    def _init_executor(self) -> None:
-        raise NotImplementedError
-
-    @abstractmethod
-    def collective_rpc(
-        self,
-        method: str | Callable[[WorkerBase], _R],
-        timeout: float | None = None,
-        args: tuple = (),
-        kwargs: dict[str, Any] | None = None,
-    ) -> list[_R]:
-        """
-        Execute an RPC call on all workers.
-
-        Args:
-            method: Name of the worker method to execute, or a callable that
-                is serialized and sent to all workers to execute.
-
-                If the method is a callable, it should accept an additional
-                `self` argument, in addition to the arguments passed in `args`
-                and `kwargs`. The `self` argument will be the worker object.
-            timeout: Maximum time in seconds to wait for execution. Raises a
-                [`TimeoutError`][] on timeout. `None` means wait indefinitely.
-            args: Positional arguments to pass to the worker method.
-            kwargs: Keyword arguments to pass to the worker method.
-
-        Returns:
-            A list containing the results from each worker.
-
-        Note:
-            It is recommended to use this API to only pass control messages,
-            and set up data-plane communication to pass data.
-        """
-        raise NotImplementedError
-
-    def determine_num_available_blocks(self) -> tuple[int, int]:
-        """Determine the number of available blocks for the GPU KV cache and
-        swappable CPU KV cache.
-
-        Normally, this should simply delegate to the underlying Worker. Some
-        ExecutorBase may require modification of the result, e.g. to ensure the
-        selected cache sizes are compatible with all workers.
-
-        Returns a tuple `(num_gpu_blocks, num_cpu_blocks)`, where
-        `num_gpu_blocks` are blocks that are "active" on the device and can be
-        appended to.
-        `num_cpu_blocks` refers to "swapped" blocks in CPU memory and cannot be
-        appended to.
-        """
-        results = self.collective_rpc("determine_num_available_blocks")
-        a = min([r[0] for r in results])
-        b = min([r[1] for r in results])
-        return a, b
-
-    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
-        """Initialize the KV cache by invoking the underlying worker."""
-        # NOTE: This is logged in the executor because there can be >1 workers.
-        logger.info(
-            "# %s blocks: %d, # CPU blocks: %d",
-            vllm.platforms.current_platform.device_name,
-            num_gpu_blocks,
-            num_cpu_blocks,
-        )
-        max_concurrency = (
-            num_gpu_blocks
-            * self.cache_config.block_size
-            / self.model_config.max_model_len
-        )
-        logger.info(
-            "Maximum concurrency for %s tokens per request: %.2fx",
-            self.model_config.max_model_len,
-            max_concurrency,
-        )
-
-        self.cache_config.num_gpu_blocks = num_gpu_blocks
-        self.cache_config.num_cpu_blocks = num_cpu_blocks
-
-        self.collective_rpc("initialize_cache", args=(num_gpu_blocks, num_cpu_blocks))
-
-    @cached_property  # Avoid unnecessary RPC calls
-    def supported_tasks(self) -> tuple[SupportedTask, ...]:
-        output = self.collective_rpc("get_supported_tasks")
-        return output[0]
-
-    def execute_model(
-        self, execute_model_req: ExecuteModelRequest
-    ) -> list[SamplerOutput]:
-        output = self.collective_rpc("execute_model", args=(execute_model_req,))
-        assert output[0] is not None
-        return output[0]
-
-    def stop_remote_worker_execution_loop(self) -> None:
-        """Releases parallel workers from model loop."""
-        return
-
-    def add_lora(self, lora_request: LoRARequest) -> bool:
-        assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
-        return all(self.collective_rpc("add_lora", args=(lora_request,)))
-
-    def remove_lora(self, lora_id: int) -> bool:
-        assert lora_id > 0, "lora_id must be greater than 0."
-        return all(self.collective_rpc("remove_lora", args=(lora_id,)))
-
-    def pin_lora(self, lora_id: int) -> bool:
-        assert lora_id > 0, "lora_id must be greater than 0."
-        return all(self.collective_rpc("pin_lora", args=(lora_id,)))
-
-    def list_loras(self) -> set[int]:
-        sets = self.collective_rpc("list_loras")
-        for s in sets:
-            assert s == sets[0], "All workers should have the same LORAs."
-        return sets[0]
-
-    def reset_mm_cache(self) -> None:
-        """Reset the multi-modal cache in each worker."""
-        self.collective_rpc("reset_mm_cache")
-
-    def start_profile(self) -> None:
-        self.collective_rpc("start_profile")
-
-    def stop_profile(self) -> None:
-        self.collective_rpc("stop_profile")
-
-    def sleep(self, level: int = 1):
-        if self.is_sleeping:
-            logger.warning("Executor is already sleeping.")
-            return
-        time_before_sleep = time.perf_counter()
-        self.collective_rpc("sleep", kwargs=dict(level=level))
-        time_after_sleep = time.perf_counter()
-        self.sleeping_tags = {"weights", "kv_cache"}
-        self.is_sleeping = True
-        logger.info(
-            "It took %.6f seconds to fall asleep.", time_after_sleep - time_before_sleep
-        )
-
-    def wake_up(self, tags: list[str] | None = None):
-        if not self.is_sleeping:
-            logger.warning("Executor is not sleeping.")
-            return
-        if tags:
-            for tag in tags:
-                if tag not in self.sleeping_tags:
-                    logger.warning(
-                        "Tag %s is not in sleeping tags %s", tag, self.sleeping_tags
-                    )
-                    return
-        time_before_wakeup = time.perf_counter()
-        self.collective_rpc("wake_up", kwargs=dict(tags=tags))
-        time_after_wakeup = time.perf_counter()
-        logger.info(
-            "It took %.6f seconds to wake up tags %s.",
-            time_after_wakeup - time_before_wakeup,
-            tags if tags is not None else self.sleeping_tags,
-        )
-        if tags:
-            for tag in tags:
-                self.sleeping_tags.remove(tag)
-        else:
-            self.sleeping_tags.clear()
-        if not self.sleeping_tags:
-            self.is_sleeping = False
-
-    def save_sharded_state(
-        self,
-        path: str,
-        pattern: str | None = None,
-        max_size: int | None = None,
-    ) -> None:
-        self.collective_rpc(
-            "save_sharded_state",
-            kwargs=dict(path=path, pattern=pattern, max_size=max_size),
-        )
-
-    @abstractmethod
-    def check_health(self) -> None:
-        """Checks if the executor is healthy. If not, it should raise an
-        exception."""
-        raise NotImplementedError
-
-    def shutdown(self) -> None:
-        """Shutdown the executor."""
-        self.collective_rpc("shutdown")
-
-    async def execute_model_async(
-        self, execute_model_req: ExecuteModelRequest
-    ) -> list[SamplerOutput]:
-        """Executes one model step on the given sequences."""
-        output = await make_async(self.execute_model)(execute_model_req)
-        return output
-
-    async def stop_remote_worker_execution_loop_async(self) -> None:
-        """Releases parallel workers from model loop."""
-        return
-
-    async def check_health_async(self) -> None:
-        """Checks if the executor is healthy. If not, it should raise an
-        exception."""
-        self.check_health()
-
-    def init_kv_output_aggregator(self, finished_count: int | None) -> None:
-        """Init KVOutputAggregator"""
-        self.kv_output_aggregator = KVOutputAggregator(
-            finished_count or self.parallel_config.world_size
-        )
-
-
-class DistributedExecutorBase(ExecutorBase):
-    """Abstract superclass of distributed executor implementations."""
-
-    def __init__(self, *args, **kwargs):
-        # This is non-None when the execute model loop is running
-        # in the parallel workers. It's a coroutine in the AsyncLLMEngine case.
-        self.parallel_worker_tasks: Any | Awaitable[Any] | None = None
-
-        super().__init__(*args, **kwargs)
-
-    def execute_model(
-        self,
-        execute_model_req: ExecuteModelRequest,
-    ) -> list[SamplerOutput]:
-        # TODO: unify into collective_rpc
-        if self.parallel_worker_tasks is None:
-            self.parallel_worker_tasks = self._run_workers(
-                "start_worker_execution_loop",
-                async_run_tensor_parallel_workers_only=True,
-            )
-
-        # Only the driver worker returns the sampling results.
-        driver_outputs = self._driver_execute_model(execute_model_req)
-        assert driver_outputs is not None
-        return driver_outputs
-
-    def stop_remote_worker_execution_loop(self) -> None:
-        if self.parallel_worker_tasks is None:
-            return
-
-        self._driver_execute_model(execute_model_req=None)
-        parallel_worker_tasks = self.parallel_worker_tasks
-        self.parallel_worker_tasks = None
-        # Ensure that workers exit model loop cleanly
-        # (this will raise otherwise)
-        self._wait_for_tasks_completion(parallel_worker_tasks)
-
-    @abstractmethod
-    def _driver_execute_model(
-        self, execute_model_req: ExecuteModelRequest | None
-    ) -> list[SamplerOutput] | None:
-        """Run execute_model in the driver worker.
-
-        Passing None will cause the driver to stop the model execution loop
-        running in each of the remote workers. In this case, this method
-        returns None. Otherwise, this method returns the model output.
-        """
-        raise NotImplementedError
-
-    def collective_rpc(
-        self,
-        method: str | Callable,
-        timeout: float | None = None,
-        args: tuple = (),
-        kwargs: dict[str, Any] | None = None,
-    ) -> list[Any]:
-        return self._run_workers(method, *args, **(kwargs or {}))
-
-    @abstractmethod
-    def _run_workers(
-        self,
-        method: str | Callable,
-        *args,
-        async_run_tensor_parallel_workers_only: bool = False,
-        max_concurrent_workers: int | None = None,
-        **kwargs,
-    ) -> Any:
-        """Runs the given method on all workers.
-
-        Args:
-            async_run_tensor_parallel_workers_only: If True the method will be
-                run only in the remote TP workers, not the driver worker.
-                It will also be run asynchronously and return a list of futures
-                rather than blocking on the results.
-
-        # TODO: simplify and merge with collective_rpc
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
-        """Wait for futures returned from _run_workers() with
-        async_run_remote_workers_only to complete."""
-        raise NotImplementedError
-
-    async def execute_model_async(
-        self, execute_model_req: ExecuteModelRequest
-    ) -> list[SamplerOutput]:
-        if self.parallel_worker_tasks is None:
-            # Start model execution loop running in the parallel workers
-            self.parallel_worker_tasks = asyncio.create_task(
-                self._start_worker_execution_loop()
-            )
-
-        # Only the driver worker returns the sampling results.
-        return await self._driver_execute_model_async(execute_model_req)
-
-    async def stop_remote_worker_execution_loop_async(self) -> None:
-        if self.parallel_worker_tasks is None:
-            return
-
-        await self._driver_execute_model_async()
-        parallel_worker_tasks = self.parallel_worker_tasks
-        self.parallel_worker_tasks = None
-        # Ensure that workers exit model loop cleanly
-        # (this will raise otherwise)
-        await parallel_worker_tasks
-
-    @abstractmethod
-    async def _driver_execute_model_async(
-        self,
-        execute_model_req: ExecuteModelRequest | None = None,
-    ) -> list[SamplerOutput]:
-        """Execute the model asynchronously in the driver worker.
-
-        Passing None will cause the driver to stop the model execution
-        loop running in each of the remote workers.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    async def _start_worker_execution_loop(self):
-        """Run execution loop on all workers. It guarantees all workers run
-        the loop or None of them is running the loop. Loop can be stopped by
-        `stop_remote_worker_execution_loop`.
-        The API is idempotent (guarantee only 1 loop run at any moment)."""
-        raise NotImplementedError
--- a/vllm/executor/msgspec_utils.py
+++ b/vllm/executor/msgspec_utils.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from array import array
-from typing import Any
-
-from vllm.multimodal.inputs import MultiModalKwargs
-from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE
-
-
-def encode_hook(obj: Any) -> Any:
-    """Custom msgspec enc hook that supports array types and MultiModalKwargs.
-
-    See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder
-    """
-    if isinstance(obj, array):
-        assert obj.typecode == VLLM_TOKEN_ID_ARRAY_TYPE, (
-            f"vLLM array type should use '{VLLM_TOKEN_ID_ARRAY_TYPE}' type. "
-            f"Given array has a type code of {obj.typecode}."
-        )
-        return obj.tobytes()
-    if isinstance(obj, MultiModalKwargs):
-        return dict(obj)
-
-
-def decode_hook(type: type, obj: Any) -> Any:
-    """Custom msgspec dec hook that supports array types and MultiModalKwargs.
-
-    See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder
-    """
-    if type is array:
-        deserialized = array(VLLM_TOKEN_ID_ARRAY_TYPE)
-        deserialized.frombytes(obj)
-        return deserialized
-    if type is MultiModalKwargs:
-        return MultiModalKwargs(obj)
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -5,7 +5,6 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any

-import msgspec
 import torch

 if TYPE_CHECKING:
@@ -92,12 +91,3 @@ class IntermediateTensors:

    def __repr__(self) -> str:
        return f"IntermediateTensors(tensors={self.tensors})"
-
-
-class ExecuteModelRequest(
-    msgspec.Struct,
-    array_like=True,  # type: ignore[call-arg]
-    omit_defaults=True,
-):  # type: ignore[call-arg]
-    # Placeholder. Remove.
-    pass
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -943,7 +943,7 @@ def maybe_register_config_serialize_by_value() -> None:
            cloudpickle.register_pickle_by_value(transformers_modules)

            # ray vendors its own version of cloudpickle
-            from vllm.executor.ray_utils import ray
+            from vllm.v1.executor.ray_utils import ray

            if ray:
                ray.cloudpickle.register_pickle_by_value(transformers_modules)

--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -39,7 +39,7 @@ from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
 from vllm.v1.engine.output_processor import OutputProcessor, RequestOutputCollector
 from vllm.v1.engine.parallel_sampling import ParentRequest
 from vllm.v1.engine.processor import Processor
-from vllm.v1.executor.abstract import Executor
+from vllm.v1.executor import Executor
 from vllm.v1.metrics.loggers import (
    StatLoggerFactory,
    StatLoggerManager,

--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -60,7 +60,7 @@ from vllm.v1.engine.utils import (
    EngineZmqAddresses,
    get_device_indices,
 )
-from vllm.v1.executor.abstract import Executor
+from vllm.v1.executor import Executor
 from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.metrics.stats import SchedulerStats
 from vllm.v1.outputs import ModelRunnerOutput
@@ -322,7 +322,6 @@ class EngineCore:
        with self.log_error_detail(scheduler_output):
            model_output = self.model_executor.execute_model(scheduler_output)

-        assert isinstance(model_output, ModelRunnerOutput)
        engine_core_outputs = self.scheduler.update_from_output(
            scheduler_output, model_output
        )
@@ -364,7 +363,7 @@ class EngineCore:
        if self.scheduler.has_requests():
            scheduler_output = self.scheduler.schedule()
            future = self.model_executor.execute_model(scheduler_output, non_block=True)
-            batch_queue.appendleft((future, scheduler_output))  # type: ignore[arg-type]
+            batch_queue.appendleft((future, scheduler_output))

            model_executed = scheduler_output.total_num_scheduled_tokens > 0
            if (
@@ -463,14 +462,6 @@ class EngineCore:
    ) -> list[_R]:
        return self.model_executor.collective_rpc(method, timeout, args, kwargs)

-    def save_tensorized_model(
-        self,
-        tensorizer_config,
-    ) -> None:
-        self.model_executor.save_tensorized_model(
-            tensorizer_config=tensorizer_config,
-        )
-
    def preprocess_add_request(self, request: EngineCoreRequest) -> tuple[Request, int]:
        """Preprocess the request.