[core] platform agnostic executor via collective_rpc (#11256)

Signed-off-by: youkaichao <youkaichao@gmail.com>

[core] platform agnostic executor via collective_rpc (#11256)
Signed-off-by: youkaichao <youkaichao@gmail.com>
ad34c0df · youkaichao · GitHub · f218f9c2 · f218f9c2 · ad34c0df
Unverified commit ad34c0df, authored Jan 15, 2025 by youkaichao; committed by GitHub Jan 15, 2025.
20 changed files
--- a/vllm/executor/ray_tpu_executor.py
+++ b/vllm/executor/ray_tpu_executor.py
-import asyncio
-import os
-from collections import defaultdict
-from itertools import islice, repeat
-from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Tuple,
-                    Union)
-
-import vllm.envs as envs
-from vllm.executor.executor_base import ExecutorAsyncBase
-from vllm.executor.ray_utils import RayWorkerWrapper, ray
-from vllm.executor.tpu_executor import TPUExecutor
-from vllm.logger import init_logger
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
-                        make_async)
-
-if ray is not None:
-    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
-
-if TYPE_CHECKING:
-    from ray.util.placement_group import PlacementGroup
-
-logger = init_logger(__name__)
-
-
-class RayTPUExecutor(TPUExecutor):
-
-    uses_ray: bool = True
-
-    def __init__(self, *args, **kwargs):
-        # This is non-None when the execute model loop is running
-        # in the parallel workers. It's a coroutine in the AsyncLLMEngine case.
-        self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None
-        # Updated by implementations that require additional args to be passed
-        # to the _run_workers execute_model call
-        self.extra_execute_model_run_workers_kwargs: Dict[str, Any] = {}
-
-        super().__init__(*args, **kwargs)
-
-    def _init_executor(self) -> None:
-        assert self.parallel_config.distributed_executor_backend == "ray"
-        placement_group = self.parallel_config.placement_group
-
-        # Disable Ray usage stats collection.
-        ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
-        if ray_usage != "1":
-            os.environ["RAY_USAGE_STATS_ENABLED"] = "0"
-
-        # Create the parallel TPU workers.
-        self._init_workers_ray(placement_group)
-
-    def _init_workers_ray(self, placement_group: "PlacementGroup",
-                          **ray_remote_kwargs):
-        # The driver dummy worker does not actually use any resources.
-        # It holds the resource for the driver worker.
-        self.driver_dummy_worker: Optional[RayWorkerWrapper] = None
-        # The remaining workers are the actual ray actors.
-        self.workers: List[RayWorkerWrapper] = []
-
-        # Create the workers.
-        driver_ip = get_ip()
-        for bundle_id, bundle in enumerate(placement_group.bundle_specs):
-            if not bundle.get("TPU", 0):
-                continue
-            scheduling_strategy = PlacementGroupSchedulingStrategy(
-                placement_group=placement_group,
-                placement_group_capture_child_tasks=True,
-                placement_group_bundle_index=bundle_id,
-            )
-
-            # GKE does not fetch environment information from metadata server
-            # and instead sets these from within the Ray process. Therefore we
-            # need to override the Ray environment variables manually.
-            override_env = {}
-            if "TPU_CHIPS_PER_HOST_BOUNDS" in os.environ:
-                override_env.update({
-                    "TPU_CHIPS_PER_HOST_BOUNDS":
-                    os.environ["TPU_CHIPS_PER_HOST_BOUNDS"]
-                })
-            if "TPU_HOST_BOUNDS" in os.environ:
-                override_env.update(
-                    {"TPU_HOST_BOUNDS": os.environ["TPU_HOST_BOUNDS"]})
-
-            worker = ray.remote(
-                num_cpus=0,
-                resources={"TPU": 1},
-                scheduling_strategy=scheduling_strategy,
-                **ray_remote_kwargs,
-            )(RayWorkerWrapper).remote(vllm_config=self.vllm_config)
-            if override_env:
-                worker.override_env_vars.remote(override_env)
-
-            worker_ip = ray.get(worker.get_node_ip.remote())
-            if worker_ip == driver_ip and self.driver_dummy_worker is None:
-                # If the worker is on the same node as the driver, we use it
-                # as the resource holder for the driver process.
-                self.driver_dummy_worker = worker
-                self.driver_worker = RayWorkerWrapper(
-                    vllm_config=self.vllm_config)
-            else:
-                # Else, added to the list of workers.
-                self.workers.append(worker)
-
-        logger.debug("workers: %s", self.workers)
-        logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker)
-        if self.driver_dummy_worker is None:
-            raise ValueError(
-                "Ray does not allocate any TPUs on the driver node. Consider "
-                "adjusting the Ray placement group or running the driver on a "
-                "TPU node.")
-
-        worker_ips = [
-            ray.get(worker.get_node_ip.remote())  # type: ignore[attr-defined]
-            for worker in self.workers
-        ]
-        ip_counts: Dict[str, int] = {}
-        for ip in worker_ips:
-            ip_counts[ip] = ip_counts.get(ip, 0) + 1
-
-        def sort_by_driver_then_worker_ip(worker):
-            """
-            Sort the workers based on 3 properties:
-            1. If the worker is on the same node as the driver (vllm engine),
-                it should be placed first.
-            2. Then, if the worker is on a node with fewer workers, it should
-                be placed first.
-            3. Finally, if the work is on a node with smaller IP address, it
-                should be placed first.
-            """
-            ip = ray.get(worker.get_node_ip.remote())
-            return (ip != driver_ip, ip_counts[ip], ip)
-
-        # After sorting, the workers on the same node will be
-        # close to each other, and the workers on the driver
-        # node will be placed first.
-        self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip)
-
-        # Get the set of TPU IDs used on each node.
-        worker_node_and_gpu_ids = []
-        for worker in [self.driver_dummy_worker] + self.workers:
-            if worker is None:
-                # driver_dummy_worker can be None when using ray spmd worker.
-                continue
-            worker_node_and_gpu_ids.append(
-                ray.get(worker.get_node_and_gpu_ids.remote()) \
-            ) # type: ignore
-
-        node_workers = defaultdict(list)
-        for i, (node_id, _) in enumerate(worker_node_and_gpu_ids):
-            node_workers[node_id].append(i)
-
-        # Set environment variables for the driver and workers.
-        all_args_to_update_environment_variables = [({
-            "VLLM_TRACE_FUNCTION":
-            str(envs.VLLM_TRACE_FUNCTION),
-        }, ) for _ in worker_node_and_gpu_ids]
-        self._run_workers("update_environment_variables",
-                          all_args=all_args_to_update_environment_variables)
-
-        if len(node_workers) == 1:
-            # in single node case, we don't need to get the IP address.
-            # the loopback address is sufficient
-            # NOTE: a node may have several IP addresses, one for each
-            # network interface. `get_ip()` might return any of them,
-            # while they might not work for communication inside the node
-            # if the network setup is complicated. Using the loopback address
-            # solves this issue, as it always works for communication inside
-            # the node.
-            driver_ip = "127.0.0.1"
-        distributed_init_method = get_distributed_init_method(
-            driver_ip, get_open_port())
-
-        # Initialize the actual workers inside worker wrapper.
-        init_worker_all_kwargs = [
-            self._get_worker_kwargs(
-                local_rank=node_workers[node_id].index(rank),
-                rank=rank,
-                distributed_init_method=distributed_init_method,
-            ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids)
-        ]
-        self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
-
-        self._run_workers("init_device")
-        self._run_workers("load_model",
-                          max_concurrent_workers=self.parallel_config.
-                          max_parallel_loading_workers)
-
-    def _driver_execute_model(
-        self,
-        execute_model_req: Optional[ExecuteModelRequest] = None
-    ) -> List[SamplerOutput]:
-        """Run execute_model in the driver worker.
-
-        Passing None will cause the driver to stop the model execution
-        loop running in each of the remote workers.
-        """
-        return self.driver_worker.execute_method("execute_model",
-                                                 execute_model_req)
-
-    def _run_workers(
-        self,
-        method: str,
-        *args,
-        async_run_remote_workers_only: bool = False,
-        all_args: Optional[List[Tuple[Any, ...]]] = None,
-        all_kwargs: Optional[List[Dict[str, Any]]] = None,
-        max_concurrent_workers: Optional[int] = None,
-        use_ray_compiled_dag: bool = False,
-        **kwargs,
-    ) -> Any:
-        """Runs the given method on all workers. Can be used in the following
-        ways:
-
-        - async_run_remote_workers_only: If True the method will be run only
-          in the remote workers, not the driver worker. It will also be
-          run asynchronously and return a list of futures rather than blocking
-          on the results.
-        - args/kwargs: All workers share the same args/kwargs
-        - all_args/all_kwargs: args/kwargs for each worker are specified
-          individually
-        """
-
-        if max_concurrent_workers:
-            raise NotImplementedError(
-                "max_concurrent_workers is not supported yet.")
-
-        count = len(self.workers)
-        all_worker_args = repeat(args, count) if all_args is None \
-            else islice(all_args, 1, None)
-        all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \
-            else islice(all_kwargs, 1, None)
-
-        # Start the ray workers first.
-        ray_worker_outputs = [
-            worker.execute_method.remote(method, *worker_args, **worker_kwargs)
-            for (worker, worker_args, worker_kwargs
-                 ) in zip(self.workers, all_worker_args, all_worker_kwargs)
-        ]
-
-        if async_run_remote_workers_only:
-            # Just return futures
-            return ray_worker_outputs
-
-        driver_args = args if all_args is None else all_args[0]
-        driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0]
-
-        # Start the driver worker after all the ray workers.
-        driver_worker_output = self.driver_worker.execute_method(
-            method, *driver_args, **driver_kwargs)
-        # Get the results of the ray workers.
-        if self.workers:
-            ray_worker_outputs = ray.get(ray_worker_outputs)
-
-        return [driver_worker_output] + ray_worker_outputs
-
-    def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
-        """Wait for futures returned from _run_workers() with
-        async_run_remote_workers_only to complete."""
-        ray.get(parallel_worker_tasks)
-
-    def determine_num_available_blocks(self) -> Tuple[int, int]:
-        num_blocks = self._run_workers("determine_num_available_blocks", )
-        num_tpu_blocks = min(b[0] for b in num_blocks)
-        num_cpu_blocks = min(b[1] for b in num_blocks)
-        return num_tpu_blocks, num_cpu_blocks
-
-    def initialize_cache(self, num_gpu_blocks: int,
-                         num_cpu_blocks: int) -> None:
-        logger.info("# TPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
-                    num_cpu_blocks)
-        self.cache_config.num_gpu_blocks = num_gpu_blocks
-        self.cache_config.num_cpu_blocks = num_cpu_blocks
-        self._run_workers("initialize_cache",
-                          num_gpu_blocks=num_gpu_blocks,
-                          num_cpu_blocks=num_cpu_blocks)
-
-    def execute_model(
-        self,
-        execute_model_req: ExecuteModelRequest,
-    ) -> List[SamplerOutput]:
-        if self.parallel_worker_tasks is None:
-            self.parallel_worker_tasks = self._run_workers(
-                "start_worker_execution_loop",
-                async_run_remote_workers_only=True,
-                **self.extra_execute_model_run_workers_kwargs)
-
-        # Only the driver worker returns the sampling results.
-        return self._driver_execute_model(execute_model_req)
-
-    def stop_remote_worker_execution_loop(self) -> None:
-        if self.parallel_worker_tasks is None:
-            return
-
-        self._driver_execute_model()
-        parallel_worker_tasks = self.parallel_worker_tasks
-        self.parallel_worker_tasks = None
-        # Ensure that workers exit model loop cleanly
-        # (this will raise otherwise)
-        self._wait_for_tasks_completion(parallel_worker_tasks)
-
-
-class RayTPUExecutorAsync(RayTPUExecutor, ExecutorAsyncBase):
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.driver_exec_method = make_async(self.driver_worker.execute_method)
-
-    async def execute_model_async(
-            self,
-            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
-        if self.parallel_worker_tasks is None:
-            # Start model execution loop running in the parallel workers
-            self.parallel_worker_tasks = asyncio.create_task(
-                self._start_worker_execution_loop())
-
-        # Only the driver worker returns the sampling results.
-        return await self._driver_execute_model_async(execute_model_req)
-
-    async def stop_remote_worker_execution_loop_async(self) -> None:
-        if self.parallel_worker_tasks is None:
-            return
-
-        await self._driver_execute_model_async()
-        parallel_worker_tasks = self.parallel_worker_tasks
-        self.parallel_worker_tasks = None
-        # Ensure that workers exit model loop cleanly
-        # (this will raise otherwise)
-        await parallel_worker_tasks
-
-    async def _driver_execute_model_async(
-        self,
-        execute_model_req: Optional[ExecuteModelRequest] = None
-    ) -> List[SamplerOutput]:
-        return await self.driver_exec_method("execute_model",
-                                             execute_model_req)
-
-    async def _start_worker_execution_loop(self):
-        coros = [
-            worker.execute_method.remote("start_worker_execution_loop")
-            for worker in self.workers
-        ]
-        return await asyncio.gather(*coros)
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
 import os
 import time
 from collections import defaultdict
-from typing import Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union

 import msgspec

@@ -13,6 +13,10 @@ from vllm.sequence import ExecuteModelRequest, IntermediateTensors
 from vllm.utils import get_ip
 from vllm.worker.worker_base import WorkerWrapperBase

+if TYPE_CHECKING:
+    from vllm.v1.core.scheduler import SchedulerOutput
+    from vllm.v1.outputs import ModelRunnerOutput
+
 logger = init_logger(__name__)
 PG_WAIT_TIMEOUT = 1800

@@ -95,6 +99,26 @@ try:

            return output

+        def setup_device_if_necessary(self):
+            # TODO(swang): This is needed right now because Ray CG executes
+            # on a background thread, so we need to reset torch's current
+            # device.
+            # We can remove this API after it is fixed in compiled graph.
+            import torch
+            assert self.worker is not None, "Worker is not initialized"
+            if not self.compiled_dag_cuda_device_set:
+                torch.cuda.set_device(self.worker.device)
+                self.compiled_dag_cuda_device_set = True
+
+        def execute_model(
+            self,
+            scheduler_output: "SchedulerOutput",
+        ) -> "ModelRunnerOutput":
+            self.setup_device_if_necessary()
+            assert self.worker is not None, "Worker is not initialized"
+            output = self.worker.model_runner.execute_model(scheduler_output)
+            return output
+
        def override_env_vars(self, vars: Dict[str, str]):
            os.environ.update(vars)


--- a/vllm/executor/ray_xpu_executor.py
+++ b/vllm/executor/ray_xpu_executor.py
-import asyncio
-from typing import List, Optional
-
-import ray
-
-import vllm.envs as envs
-from vllm.executor.ray_gpu_executor import RayGPUExecutor, RayGPUExecutorAsync
-from vllm.executor.xpu_executor import XPUExecutor
-from vllm.logger import init_logger
-from vllm.utils import make_async
-
-logger = init_logger(__name__)
-
-
-class RayXPUExecutor(RayGPUExecutor, XPUExecutor):
-
-    def _get_env_vars_to_be_updated(self):
-        # Get the set of GPU IDs used on each node.
-        worker_node_and_gpu_ids = []
-        for worker in [self.driver_dummy_worker] + self.workers:
-            if worker is None:
-                # driver_dummy_worker can be None when using ray spmd worker.
-                continue
-            worker_node_and_gpu_ids.append(
-                ray.get(worker.get_node_and_gpu_ids.remote()))  # type: ignore
-
-        # Set environment variables for the driver and workers.
-        all_args_to_update_environment_variables = [({
-            "VLLM_TRACE_FUNCTION":
-            str(envs.VLLM_TRACE_FUNCTION),
-        }, ) for (_, _) in worker_node_and_gpu_ids]
-        return all_args_to_update_environment_variables
-
-
-class RayXPUExecutorAsync(RayXPUExecutor, RayGPUExecutorAsync):
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.driver_exec_method = make_async(self.driver_worker.execute_method)
-        self.pp_locks: Optional[List[asyncio.Lock]] = None
--- a/vllm/executor/tpu_executor.py
+++ b/vllm/executor/tpu_executor.py
-from typing import Any, Dict, List, Optional, Set, Tuple
-
-import torch
-
-from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
-from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
-                        make_async)
-
-logger = init_logger(__name__)
-
-
-class TPUExecutor(ExecutorBase):
-
-    uses_ray: bool = False
-
-    def _init_executor(self) -> None:
-        assert not self.scheduler_config.chunked_prefill_enabled, (
-            "Chunked prefill is not yet supported for TPU backend")
-        assert not self.speculative_config, (
-            "Speculative decoding is not yet supported for TPU backend")
-        if self.model_config.dtype in (torch.float16, torch.float32):
-            logger.warning(
-                "The TPU backend currently does not support %s. "
-                "Using bfloat16 instead.", self.model_config.dtype)
-            self.model_config.dtype = torch.bfloat16
-
-        # Instantiate the worker and load the model to the device.
-        self.driver_worker = self._create_worker()
-        self.driver_worker.init_device()
-        self.driver_worker.load_model()
-
-    def _get_worker_kwargs(
-        self,
-        local_rank: int = 0,
-        rank: int = 0,
-        distributed_init_method: Optional[str] = None,
-    ) -> Dict[str, Any]:
-        """Return worker init args for a given rank."""
-        if distributed_init_method is None:
-            distributed_init_method = get_distributed_init_method(
-                get_ip(), get_open_port())
-        return dict(
-            vllm_config=self.vllm_config,
-            local_rank=local_rank,
-            rank=rank,
-            distributed_init_method=distributed_init_method,
-            is_driver_worker=rank == 0,
-        )
-
-    def _create_worker(
-        self,
-        local_rank: int = 0,
-        rank: int = 0,
-        distributed_init_method: Optional[str] = None,
-    ):
-        if self.scheduler_config.is_multi_step:
-            from vllm.worker.multi_step_tpu_worker import MultiStepTPUWorker
-            worker = MultiStepTPUWorker(**self._get_worker_kwargs(
-                local_rank, rank, distributed_init_method))
-            return worker
-        else:
-            from vllm.worker.tpu_worker import TPUWorker
-
-            worker = TPUWorker(**self._get_worker_kwargs(
-                local_rank, rank, distributed_init_method))
-            return worker
-
-    def initialize_cache(
-        self,
-        num_gpu_blocks: int,
-        num_cpu_blocks: int,
-    ) -> None:
-        """Initialize the KV cache by invoking the underlying worker."""
-        # NOTE: This is logged in the executor because there can be >1 worker
-        # with other executors. We could log in the engine level, but work
-        # remains to abstract away the device for non-GPU configurations.
-        logger.info("# TPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
-                    num_cpu_blocks)
-        self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
-
-    def determine_num_available_blocks(self) -> Tuple[int, int]:
-        """Determine the number of available KV blocks by invoking the
-        underlying worker."""
-        return self.driver_worker.determine_num_available_blocks()
-
-    def execute_model(
-        self,
-        execute_model_req: ExecuteModelRequest,
-    ) -> List[SamplerOutput]:
-        output = self.driver_worker.execute_model(execute_model_req)
-        return output
-
-    def add_lora(self, lora_request: LoRARequest) -> bool:
-        raise NotImplementedError(
-            "LoRA is currently not supported by the TPU backend.")
-
-    def remove_lora(self, lora_id: int) -> bool:
-        raise NotImplementedError(
-            "LoRA is currently not supported by the TPU backend.")
-
-    def pin_lora(self, lora_id: int) -> bool:
-        raise NotImplementedError(
-            "LoRA is currently not supported by the TPU backend.")
-
-    def list_loras(self) -> Set[int]:
-        raise NotImplementedError(
-            "LoRA is currently not supported by the TPU backend.")
-
-    def add_prompt_adapter(self, prompt_adapter_request) -> bool:
-        raise NotImplementedError(
-            "Soft prompt is currently not supported by the TPU backend.")
-
-    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-        raise NotImplementedError(
-            "Soft prompt is currently not supported by the TPU backend.")
-
-    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-        raise NotImplementedError(
-            "Soft prompt is currently not supported by the TPU backend.")
-
-    def list_prompt_adapters(self) -> Set[int]:
-        raise NotImplementedError(
-            "Soft prompt is currently not supported by the TPU backend.")
-
-    def check_health(self) -> None:
-        # TPUExecutor will always be healthy as long as it's running.
-        return
-
-
-class TPUExecutorAsync(TPUExecutor, ExecutorAsyncBase):
-
-    async def execute_model_async(
-        self,
-        sexecute_model_req: ExecuteModelRequest,
-    ) -> SamplerOutput:
-        output = await make_async(self.driver_worker.execute_model
-                                  )(sexecute_model_req)
-        return output
--- a/vllm/executor/uniproc_executor.py
+++ b/vllm/executor/uniproc_executor.py
+from typing import Any, Dict, List, Optional, Tuple
+
+from vllm.executor.executor_base import ExecutorBase
+from vllm.logger import init_logger
+from vllm.utils import get_distributed_init_method, get_ip, get_open_port
+from vllm.worker.worker_base import WorkerWrapperBase
+
+logger = init_logger(__name__)
+
+
+class UniProcExecutor(ExecutorBase):
+
+    uses_ray: bool = False
+
+    def _init_executor(self) -> None:
+        """Initialize the worker and load the model.
+        """
+        self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config,
+                                               rank=0)
+        distributed_init_method = get_distributed_init_method(
+            get_ip(), get_open_port())
+        local_rank = 0
+        rank = 0
+        kwargs = dict(
+            vllm_config=self.vllm_config,
+            local_rank=local_rank,
+            rank=rank,
+            distributed_init_method=distributed_init_method,
+            is_driver_worker=(not self.parallel_config)
+            or (rank % self.parallel_config.tensor_parallel_size == 0),
+        )
+        self.collective_rpc("init_worker", args=([kwargs], ))
+        self.collective_rpc("init_device")
+        self.collective_rpc("load_model")
+
+    def collective_rpc(self,
+                       method: str,
+                       timeout: Optional[float] = None,
+                       args: Tuple = (),
+                       kwargs: Optional[Dict] = None) -> List[Any]:
+        if kwargs is None:
+            kwargs = {}
+        try:
+            func = getattr(self.driver_worker, method)
+        except AttributeError:
+            raise NotImplementedError(f"Method {method} is not implemented.") \
+                from None
+        answer = func(*args, **kwargs)
+        return [answer]
+
+    def check_health(self) -> None:
+        # UniProcExecutor will always be healthy as long as
+        # it's running.
+        return
+
+
+UniProcExecutorAsync = UniProcExecutor
--- a/vllm/executor/xpu_executor.py
+++ b/vllm/executor/xpu_executor.py
-from typing import List, Optional, Union
-
-from vllm.executor.executor_base import ExecutorAsyncBase
-from vllm.executor.gpu_executor import GPUExecutor
-from vllm.logger import init_logger
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.sequence import ExecuteModelRequest, PoolerOutput
-from vllm.utils import make_async
-
-logger = init_logger(__name__)
-
-
-class XPUExecutor(GPUExecutor):
-
-    uses_ray: bool = False
-
-    def _init_executor(self) -> None:
-        assert self.device_config.device_type == "xpu"
-        assert self.speculative_config is None, (
-            "Speculative decoding not yet supported for XPU backend")
-
-        GPUExecutor._init_executor(self)
-
-    def execute_model(
-        self, execute_model_req: ExecuteModelRequest
-    ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]:
-        output = self.driver_worker.execute_model(execute_model_req)
-        return output
-
-
-class XPUExecutorAsync(XPUExecutor, ExecutorAsyncBase):
-
-    async def execute_model_async(
-        self,
-        execute_model_req: ExecuteModelRequest,
-    ) -> List[SamplerOutput]:
-        output = await make_async(self.driver_worker.execute_model
-                                  )(execute_model_req=execute_model_req)
-        return output
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
+import os
 from typing import TYPE_CHECKING, Optional

 import psutil
@@ -105,6 +106,32 @@ class CpuPlatform(Platform):
            else:
                parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker"

+        assert vllm_config.device_config.device_type == "cpu"
+
+        #
+        # Environment variables for CPU executor
+        #
+
+        # Disable torch async compiling which won't work with daemonic processes
+        os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
+
+        # Intel OpenMP setting
+        ld_prealod_str = os.getenv("LD_PRELOAD", "")
+        if "libiomp5.so" in ld_prealod_str:
+            # The time (in milliseconds) that a thread should wait after
+            # completing the execution of a parallel region before sleeping.
+            os.environ['KMP_BLOCKTIME'] = "1"
+            # Prevents the CPU from entering a low-performance state
+            os.environ['KMP_TPAUSE'] = "0"
+            # Provides fine granularity parallelism
+            os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist"
+            os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist"
+            os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist"
+
+        # Hint IPEX to use shared-memory-based AllReduce
+        os.environ["LOCAL_WORLD_SIZE"] = str(
+            vllm_config.parallel_config.tensor_parallel_size)
+
    @classmethod
    def is_pin_memory_available(cls) -> bool:
        logger.warning("Pin memory is not supported on CPU.")

--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -139,6 +139,28 @@ class CudaPlatformBase(Platform):
                else:
                    parallel_config.worker_cls = "vllm.worker.worker.Worker"

+        world_size = parallel_config.world_size
+        tensor_parallel_size = parallel_config.tensor_parallel_size
+
+        from vllm.utils import (cuda_device_count_stateless,
+                                update_environment_variables)
+
+        # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers
+        if "CUDA_VISIBLE_DEVICES" not in os.environ:
+            update_environment_variables({
+                "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
+            })
+
+        cuda_device_count = cuda_device_count_stateless()
+        # Use a less confusing message for the more common TP-only case.
+        assert tensor_parallel_size <= cuda_device_count, (
+            f"please set tensor_parallel_size ({tensor_parallel_size}) "
+            f"to less than max local gpu count ({cuda_device_count})")
+
+        assert world_size <= cuda_device_count, (
+            f"please ensure that world_size ({world_size}) "
+            f"is less than than max local gpu count ({cuda_device_count})")
+
        cache_config = vllm_config.cache_config
        if cache_config and cache_config.block_size is None:
            cache_config.block_size = 16

--- a/vllm/platforms/neuron.py
+++ b/vllm/platforms/neuron.py
@@ -35,6 +35,14 @@ class NeuronPlatform(Platform):
            parallel_config.worker_cls = \
                "vllm.worker.neuron_worker.NeuronWorker"

+        if parallel_config.world_size > 1:
+            parallel_config.distributed_executor_backend = "uni"
+
+        assert (vllm_config.lora_config is
+                None), "LoRA is not supported for Neuron backend."
+        assert (not vllm_config.speculative_config
+                ), "Speculative decoding not yet supported for Neuron backend."
+
        cache_config = vllm_config.cache_config
        if cache_config:
            # neuron needs block_size = max_model_len

--- a/vllm/platforms/openvino.py
+++ b/vllm/platforms/openvino.py
@@ -66,9 +66,8 @@ class OpenVinoPlatform(Platform):
        from vllm.utils import GiB_bytes

        parallel_config = vllm_config.parallel_config
-        assert (
-            parallel_config.world_size == 1
-        ), "OpenVINOExecutor only supports single CPU socket currently."
+        assert (parallel_config.world_size == 1
+                ), "OpenVINO only supports single CPU socket currently."

        if parallel_config.worker_cls == "auto":
            parallel_config.worker_cls = \
@@ -141,3 +140,10 @@ class OpenVinoPlatform(Platform):
            raise RuntimeError(
                "Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE"
                f" {kv_cache_space}, expect a positive integer value.")
+
+        assert vllm_config.device_config.device_type == "openvino"
+        assert vllm_config.lora_config is None, \
+            "OpenVINO backend doesn't support LoRA"
+        assert cls.is_openvino_cpu() or \
+            cls.is_openvino_gpu(), \
+            "OpenVINO backend supports only CPU and GPU devices"
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -72,6 +72,16 @@ class TpuPlatform(Platform):
        assert vllm_config.speculative_config is None, \
            "TPU does not support speculative decoding"

+        assert not vllm_config.scheduler_config.chunked_prefill_enabled, (
+            "Chunked prefill is not yet supported for TPU backend")
+        assert not vllm_config.speculative_config, (
+            "Speculative decoding is not yet supported for TPU backend")
+        if vllm_config.model_config.dtype in (torch.float16, torch.float32):
+            logger.warning(
+                "The TPU backend currently does not support %s. "
+                "Using bfloat16 instead.", vllm_config.model_config.dtype)
+            vllm_config.model_config.dtype = torch.bfloat16
+
        parallel_config = vllm_config.parallel_config
        scheduler_config = vllm_config.scheduler_config
        if parallel_config.worker_cls == "auto":

--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -78,17 +78,31 @@ class XPUPlatform(Platform):
            raise NotImplementedError(
                "XPU does not support speculative decoding")

+        if vllm_config.device_config is not None:
+            assert vllm_config.device_config.device_type == "xpu"
+
        # check and update parallel config
        parallel_config = vllm_config.parallel_config
-        if (parallel_config.distributed_executor_backend is not None
-                and parallel_config.distributed_executor_backend != "ray"):
+        if parallel_config.worker_cls == "auto":
+            parallel_config.worker_cls = "vllm.worker.xpu_worker.XPUWorker"
+
+        if parallel_config.distributed_executor_backend is None:
+            parallel_config.distributed_executor_backend = "ray"
+        elif parallel_config.distributed_executor_backend == "mp":
+            # FIXME(kunshang):
+            # spawn requires the caller to use `if __name__ == '__main__':`.
+            # fork is not supported for starting new processes on XPU.
+            logger.error(
+                "Both start methods (spawn and fork) have issue "
+                "on XPU if you use mp backend, setting it to ray instead.")
+            parallel_config.distributed_executor_backend = "ray"
+
+        elif parallel_config.distributed_executor_backend != "ray":
            logger.warning(
                "%s is not supported on XPU, fallback to ray distributed"
                " executor backend.",
                parallel_config.distributed_executor_backend)
            parallel_config.distributed_executor_backend = "ray"
-        if parallel_config.worker_cls == "auto":
-            parallel_config.worker_cls = "vllm.worker.xpu_worker.XPUWorker"

    @classmethod
    def is_pin_memory_available(cls):

--- a/vllm/spec_decode/medusa_worker.py
+++ b/vllm/spec_decode/medusa_worker.py
@@ -9,17 +9,15 @@ from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata
 from vllm.spec_decode.interfaces import SpeculativeProposals
 from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase
 from vllm.spec_decode.top1_proposer import Top1Proposer
-from vllm.worker.worker_base import WorkerWrapperBase
+from vllm.worker.worker_base import DelegateWorkerBase


-class MedusaWorker(NonLLMProposerWorkerBase, WorkerWrapperBase):
+class MedusaWorker(NonLLMProposerWorkerBase, DelegateWorkerBase):
    """Worker for Medusa.
    """

    def __init__(self, *args, **kwargs):
-        super().__init__(kwargs.get("vllm_config"))
-        self.init_worker(*args, **kwargs)
-
+        DelegateWorkerBase.__init__(self, *args, **kwargs)
        # Lazy initialization list.
        self._proposer: Top1Proposer


--- a/vllm/spec_decode/multi_step_worker.py
+++ b/vllm/spec_decode/multi_step_worker.py
@@ -16,10 +16,10 @@ from vllm.spec_decode.interfaces import (SpeculativeProposals,
                                         SpeculativeProposer)
 from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase
 from vllm.spec_decode.top1_proposer import Top1Proposer
-from vllm.worker.worker_base import WorkerWrapperBase
+from vllm.worker.worker_base import DelegateWorkerBase


-class MultiStepWorker(ProposerWorkerBase, WorkerWrapperBase):
+class MultiStepWorker(ProposerWorkerBase, DelegateWorkerBase):
    """The MultiStepWorker is equivalent to a Worker except that it allows
    multiple forward passes in a single call, assuming the scheduler has
    allocated enough space to store the additional KV. This reduces overhead
@@ -32,15 +32,12 @@ class MultiStepWorker(ProposerWorkerBase, WorkerWrapperBase):
    """

    def __init__(self, *args, **kwargs):
-        super().__init__(kwargs.get("vllm_config"))
-        self.init_worker(*args, **kwargs)
-
+        DelegateWorkerBase.__init__(self, *args, **kwargs)
        # Lazy initialization list.
        self._proposer: SpeculativeProposer

    def init_device(self) -> None:
        self.worker.init_device()
-
        self._proposer = Top1Proposer(
            weakref.proxy(self),  # type: ignore[arg-type]
            self.device,
@@ -56,18 +53,6 @@ class MultiStepWorker(ProposerWorkerBase, WorkerWrapperBase):
        self.model_runner.model.sampler.should_modify_greedy_probs_inplace = (
            True)

-    def determine_num_available_blocks(self) -> Tuple[int, int]:
-        return self.worker.determine_num_available_blocks()
-
-    def get_cache_block_size_bytes(self) -> int:
-        return self.worker.get_cache_block_size_bytes()
-
-    def initialize_cache(self, *args, **kwargs) -> None:
-        self.worker.initialize_cache(*args, **kwargs)
-
-    def execute_model(self, *args, **kwargs) -> List[SamplerOutput]:
-        return self.worker.execute_model(*args, **kwargs)
-
    @torch.inference_mode()
    def sampler_output(
        self,

--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -40,8 +40,8 @@ from vllm.spec_decode.util import (Timer, create_logprobs_output,
                                   get_all_num_logprobs,
                                   get_sampled_token_logprobs, nvtx_range,
                                   split_batch_by_proposal_len)
-from vllm.worker.worker_base import (LoraNotSupportedWorkerBase, WorkerBase,
-                                     WorkerWrapperBase)
+from vllm.utils import resolve_obj_by_qualname
+from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase

 logger = init_logger(__name__)

@@ -64,8 +64,9 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
    target_worker_config = copy.deepcopy(vllm_config)
    target_worker_config.parallel_config.worker_cls =\
        target_worker_config.parallel_config.sd_worker_cls
-    target_worker = WorkerWrapperBase(vllm_config=target_worker_config)
-    target_worker.init_worker(*args, **kwargs)
+    cls = resolve_obj_by_qualname(
+        target_worker_config.parallel_config.worker_cls)
+    target_worker = cls(*args, **kwargs)
    # Set the disable_logprobs variable in the TargetModelRunner instance
    # as per its value specified in the SpeculativeConfig.
    target_worker.model_runner.disable_logprobs =\

--- a/vllm/v1/executor/abstract.py
+++ b/vllm/v1/executor/abstract.py
@@ -14,8 +14,9 @@ class Executor(ABC):
        distributed_executor_backend = (
            vllm_config.parallel_config.distributed_executor_backend)
        if distributed_executor_backend == "ray":
-            from vllm.v1.executor.ray_executor import RayExecutor
-            executor_class = RayExecutor
+            from vllm.executor.ray_distributed_executor import (  # noqa
+                RayDistributedExecutor)
+            executor_class = RayDistributedExecutor
        elif distributed_executor_backend == "mp":
            from vllm.v1.executor.multiproc_executor import MultiprocExecutor
            executor_class = MultiprocExecutor

--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -246,9 +246,18 @@ class WorkerProc:
        ready_path: str,
    ):
        self.rank = rank
-        wrapper = WorkerWrapperBase(vllm_config=vllm_config)
-        wrapper.init_worker(vllm_config, local_rank, rank,
-                            distributed_init_method)
+        wrapper = WorkerWrapperBase(vllm_config=vllm_config, rank=rank)
+        # TODO: move `init_worker` to executor level as a collective rpc call
+        all_kwargs: List[Dict] = [
+            {} for _ in range(vllm_config.parallel_config.world_size)
+        ]
+        all_kwargs[rank] = {
+            "vllm_config": vllm_config,
+            "local_rank": local_rank,
+            "rank": rank,
+            "distributed_init_method": distributed_init_method,
+        }
+        wrapper.init_worker(all_kwargs)
        self.worker = wrapper.worker

        pid = os.getpid()
@@ -270,7 +279,7 @@ class WorkerProc:
            ready_socket.send_string(WorkerProc.READY_STR)
            ready_socket.send(payload)

-        self.worker.initialize()
+        self.worker.init_device()
        self.worker.load_model()

    @staticmethod

--- a/vllm/v1/executor/uniproc_executor.py
+++ b/vllm/v1/executor/uniproc_executor.py
@@ -27,7 +27,7 @@ class UniprocExecutor(Executor):
        self.observability_config = vllm_config.observability_config

        self.worker: Worker = self._create_worker()
-        self.worker.initialize()
+        self.worker.init_device()
        self.worker.load_model()

    def _create_worker(

--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -33,6 +33,7 @@ class Worker:
        local_rank: int,
        rank: int,
        distributed_init_method: str,
+        is_driver_worker: bool = False,
    ):

        # TODO: use WorkerBase.__init__(self, vllm_config=vllm_config)
@@ -75,7 +76,7 @@ class Worker:
        else:
            self.profiler = None

-    def initialize(self):
+    def init_device(self):
        if self.device_config.device.type == "cuda":
            # torch.distributed.all_reduce does not free the input tensor until
            # the synchronization point. This causes the memory usage to grow

--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -2,6 +2,7 @@
 # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
 ###############################################################################

+import contextlib
 import gc
 import os
 from typing import List, Optional, Set, Tuple, Type
@@ -18,6 +19,7 @@ from vllm.distributed import (ensure_model_parallel_initialized,
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
+from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest
 from vllm.utils import bind_kv_cache
@@ -124,6 +126,70 @@ class HPUWorker(LocalOrDistributedWorkerBase):
    def load_model(self):
        self.model_runner.load_model()

+    def execute_model(
+        self,
+        execute_model_req: Optional[ExecuteModelRequest] = None,
+    ) -> Optional[List[SamplerOutput]]:
+        """Execute one model step, optionally logging per-step HPU metrics.
+
+        The actual work is delegated to
+        ``LocalOrDistributedWorkerBase.execute_model``. When any of the
+        ``VLLM_HPU_LOG_STEP_*`` environment variables listed below is set to
+        a value other than ``'0'``, that call is wrapped in Habana metric
+        contexts, and a warning-level log line with the collected stats and
+        a summary of the batch may be emitted afterwards.
+
+        Args:
+            execute_model_req: The request to execute. Despite the ``None``
+                default, it must not be ``None`` (enforced by an assert).
+
+        Returns:
+            Whatever ``LocalOrDistributedWorkerBase.execute_model`` returns
+            for this request.
+        """
+        assert execute_model_req is not None
+        # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION     - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! # noqa:E501
+        # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501
+        # VLLM_HPU_LOG_STEP_CPU_FALLBACKS         - will log cpu fallbacks per engine step, only when there was any # noqa:E501
+        # VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL     - will log cpu fallbacks per engine step, always, even if there were none # noqa:E501
+        # The flags are re-read from the environment on every call. Each
+        # *_ALL variant also switches on its base flag (see the `or` below).
+        log_graph_compilation_all = os.environ.get(
+            'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0'
+        log_graph_compilation = os.environ.get(
+            'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION',
+            '0') != '0' or log_graph_compilation_all
+        log_cpu_fallbacks_all = os.environ.get(
+            'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0'
+        log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS',
+                                           '0') != '0' or log_cpu_fallbacks_all
+        if log_graph_compilation or log_cpu_fallbacks:
+            # Imported lazily: only needed when step logging is enabled.
+            from habana_frameworks.torch.hpu.metrics import metric_localcontext
+            seq_group_metadata_list = execute_model_req.seq_group_metadata_list
+            # Batch summary that is appended to every log line below.
+            is_prompt = any([
+                seq_group_metadata.is_prompt
+                for seq_group_metadata in seq_group_metadata_list
+            ])
+            # Longest sequence (prompt + generated tokens) across all
+            # sequence groups in this request.
+            max_context_len = max([
+                max([
+                    len(v.prompt_token_ids) + len(v.output_token_ids)
+                    for v in seq_group_metadata.seq_data.values()
+                ]) for seq_group_metadata in seq_group_metadata_list
+            ])  # whoa, that's some spicy stuff right here
+            # Ceiling division of max_context_len by the cache block size.
+            max_num_blocks = (
+                (max_context_len - 1) // self.cache_config.block_size) + 1
+            input_stats = (f'is_prompt: {is_prompt}, '
+                           f'num_seqs: {len(seq_group_metadata_list)}, '
+                           f'max_context_len: {max_context_len}, '
+                           f'max_num_blocks {max_num_blocks}')
+            # nullcontext() binds None to its `as` target; the metric objects
+            # are only dereferenced below when the matching flag is enabled.
+            gc_ctx = metric_localcontext(
+                "graph_compilation"
+            ) if log_graph_compilation else contextlib.nullcontext()
+            cpu_fallback_ctx = metric_localcontext(
+                "cpu_fallback"
+            ) if log_cpu_fallbacks else contextlib.nullcontext()
+            with gc_ctx as gc_local_metric, \
+                cpu_fallback_ctx as cpu_fallback_local_metric:
+                output = LocalOrDistributedWorkerBase.execute_model(
+                    self, execute_model_req)
+            # NOTE(review): stats()[0][1] is presumably the number of events
+            # recorded during this step - confirm against the
+            # habana_frameworks metrics API.
+            if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0
+                ) or log_graph_compilation_all:
+                msg = ("VLLM_HPU_STEP_GRAPH_COMPILATION: "
+                       f"{gc_local_metric.stats()}, {input_stats}")
+                logger.warning(msg)
+            if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] >
+                    0) or log_cpu_fallbacks_all:
+                msg = ("VLLM_HPU_STEP_CPU_FALLBACK: "
+                       f"{cpu_fallback_local_metric.stats()}, {input_stats}")
+                logger.warning(msg)
+
+            return output
+
+        # No step logging requested: run the step without metric contexts.
+        output = LocalOrDistributedWorkerBase.execute_model(
+            self, execute_model_req)
+        return output
+
    @torch.inference_mode()
    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """Profiles the peak memory usage of the model to determine how many