[core] platform agnostic executor via collective_rpc (#11256)

Signed-off-by: youkaichao <youkaichao@gmail.com>

[core] platform agnostic executor via collective_rpc (#11256)
Signed-off-by: youkaichao <youkaichao@gmail.com>
ad34c0df · youkaichao · GitHub · f218f9c2 · ad34c0df · ad34c0df
Unverified commit ad34c0df, authored by youkaichao on Jan 15, 2025 and committed by GitHub on Jan 15, 2025.
Showing with 101 additions and 19 deletions

vllm/worker/neuron_worker.py vllm/worker/neuron_worker.py +23 -5

vllm/worker/openvino_worker.py vllm/worker/openvino_worker.py +2 -4

vllm/worker/worker_base.py vllm/worker/worker_base.py +76 -10

No files found.
--- a/vllm/worker/neuron_worker.py
+++ b/vllm/worker/neuron_worker.py
@@ -8,6 +8,7 @@ from vllm.config import VllmConfig
 from vllm.distributed import (ensure_model_parallel_initialized,
                              init_distributed_environment)
 from vllm.model_executor import set_random_seed
+from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.sequence import ExecuteModelRequest
 from vllm.worker.neuron_model_runner import NeuronModelRunner
 from vllm.worker.worker_base import (LocalOrDistributedWorkerBase,
@@ -25,6 +26,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
        local_rank: int,
        rank: int,
        distributed_init_method: str,
+        is_driver_worker: bool = True,
    ) -> None:
        WorkerBase.__init__(self, vllm_config=vllm_config)
        self.local_rank = local_rank
@@ -37,7 +39,22 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):

        self.model_runner: NeuronModelRunner = NeuronModelRunner(
            vllm_config=vllm_config)
-        self.is_driver_worker = True
+        self.is_driver_worker = is_driver_worker
+
+    def execute_model(
+        self,
+        execute_model_req: Optional[ExecuteModelRequest] = None,
+    ) -> Optional[List[SamplerOutput]]:
+        """Check Neuron-specific limits, then run the base implementation."""
+        req = execute_model_req
+        assert req is not None
+        has_cache_ops = (req.blocks_to_swap_in or req.blocks_to_swap_out
+                         or req.blocks_to_copy)
+        assert not has_cache_ops, (
+            "Cache operations are not supported for Neuron backend.")
+        assert req.num_lookahead_slots == 0, (
+            "lookahead not supported for Neuron backend.")
+        return LocalOrDistributedWorkerBase.execute_model(self, req)

    def init_device(self) -> None:
        self.init_distributed_environment()
@@ -103,13 +120,14 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):

    def init_distributed_environment(self):
        """Neuron uses transformers-neuronx for tensor parallelism.
-
-        vLLM still needs the environment inited when TP/PP > 1
+        It has only one process to control multiple devices.
+        vLLM still needs the environment initialized when TP/PP > 1,
+        so we initialize a distributed environment with one process.
        """
        init_distributed_environment(
            world_size=1,
-            rank=self.rank,
-            local_rank=self.local_rank,
+            rank=0,
+            local_rank=0,
            distributed_init_method=self.distributed_init_method,
            backend="gloo",
        )

--- a/vllm/worker/openvino_worker.py
+++ b/vllm/worker/openvino_worker.py
@@ -211,16 +211,14 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase):

    def __init__(
        self,
-        ov_core: ov.Core,
        vllm_config: VllmConfig,
        local_rank: int,
        rank: int,
        distributed_init_method: str,
-        kv_cache_dtype: Optional[ov.Type] = ov.Type.undefined,
        is_driver_worker: bool = False,
    ) -> None:
-        self.ov_core = ov_core
        WorkerBase.__init__(self, vllm_config)
+        self.ov_core = ov.Core()
        self.parallel_config.rank = rank
        self.local_rank = local_rank
        self.rank = rank
@@ -237,7 +235,7 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase):
        self.model_runner = OpenVINOModelRunner(
            self.ov_core,
            vllm_config=self.vllm_config,
-            kv_cache_dtype=kv_cache_dtype,
+            kv_cache_dtype=self.vllm_config.cache_config.cache_dtype,
            is_driver_worker=is_driver_worker,
        )
        # Uninitialized cache engine. Will be initialized by

--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -88,7 +88,6 @@ class WorkerBase(ABC):
                if output is None:
                    return None

-    @abstractmethod
    def execute_model(
        self,
        execute_model_req: Optional[ExecuteModelRequest] = None
@@ -119,6 +118,58 @@ class WorkerBase(ABC):
        raise NotImplementedError


+class DelegateWorkerBase(WorkerBase):
+    """
+    A WorkerBase that forwards every call to a wrapped `self.worker`, whose
+    class is resolved from `vllm_config.parallel_config.worker_cls`. Useful
+    for wrapping another worker, e.g. speculative decoding.
+    """
+    worker: WorkerBase
+
+    def __init__(self, *args, **kwargs) -> None:
+        # vllm_config must be given by keyword; fail fast if it is missing.
+        vllm_config: VllmConfig = kwargs["vllm_config"]
+        cls = resolve_obj_by_qualname(vllm_config.parallel_config.worker_cls)
+        self.worker = cls(*args, **kwargs)
+
+    def init_device(self) -> None:
+        self.worker.init_device()
+
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        return self.worker.determine_num_available_blocks()
+
+    def initialize_cache(self, num_gpu_blocks: int,
+                         num_cpu_blocks: int) -> None:
+        self.worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
+
+    def execute_model(
+        self,
+        execute_model_req: Optional[ExecuteModelRequest] = None
+    ) -> Optional[List[SamplerOutput]]:
+        return self.worker.execute_model(execute_model_req)
+
+    def get_cache_block_size_bytes(self) -> int:
+        return self.worker.get_cache_block_size_bytes()
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        return self.worker.add_lora(lora_request)
+
+    def remove_lora(self, lora_id: int) -> bool:
+        return self.worker.remove_lora(lora_id)
+
+    def pin_lora(self, lora_id: int) -> bool:
+        return self.worker.pin_lora(lora_id)
+
+    def list_loras(self) -> Set[int]:
+        return self.worker.list_loras()
+
+    def __getattr__(self, attr):
+        # `worker` unset (e.g. during unpickling): fail instead of recursing.
+        if attr == "worker":
+            raise AttributeError(attr)
+        return getattr(self.worker, attr)
+
+
 class LoraNotSupportedWorkerBase(WorkerBase):
    """Partial implementation of WorkerBase that raises exceptions when LoRA
    methods are invoked.
@@ -419,17 +470,31 @@ class WorkerWrapperBase:
    def __init__(
        self,
        vllm_config: VllmConfig,
+        rank: int = 0,
    ) -> None:
+        self.rank = rank
        self.vllm_config = vllm_config
-        trust_remote_code = vllm_config.model_config.trust_remote_code
        self.worker: Optional[WorkerBase] = None
-        if trust_remote_code:
-            # note: lazy import to avoid importing torch before initializing
-            from vllm.utils import init_cached_hf_modules
-            init_cached_hf_modules()
+        if vllm_config.model_config is not None:
+            # it can be None in tests
+            trust_remote_code = vllm_config.model_config.trust_remote_code
+            if trust_remote_code:
+                # note: lazy import to avoid importing torch before initializing
+                from vllm.utils import init_cached_hf_modules
+                init_cached_hf_modules()
+
+    def adjust_rank(self, rank_mapping: Dict[int, int]) -> None:
+        """
+        Remap this wrapper's rank through `rank_mapping`; ranks that are
+        not keys of the mapping are kept. Used only while the executor
+        initializes, after all workers have been created.
+        """
+        # dict.get with the current rank as default leaves unmapped ranks as-is
+        self.rank = rank_mapping.get(self.rank, self.rank)

-    @staticmethod
-    def update_environment_variables(envs: Dict[str, str]) -> None:
+    def update_environment_variables(self, envs_list: List[Dict[str,
+                                                                str]]) -> None:
+        envs = envs_list[self.rank]
        key = 'CUDA_VISIBLE_DEVICES'
        if key in envs and key in os.environ:
            # overwriting CUDA_VISIBLE_DEVICES is desired behavior
@@ -437,11 +502,12 @@ class WorkerWrapperBase:
            del os.environ[key]
        update_environment_variables(envs)

-    def init_worker(self, *args, **kwargs):
+    def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None:
        """
        Here we inject some common logic before initializing the worker.
        Arguments are passed to the worker class constructor.
        """
+        kwargs = all_kwargs[self.rank]
        enable_trace_function_call_for_thread(self.vllm_config)

        # see https://github.com/NVIDIA/nccl/issues/1234
@@ -452,7 +518,7 @@ class WorkerWrapperBase:

        worker_class = resolve_obj_by_qualname(
            self.vllm_config.parallel_config.worker_cls)
-        self.worker = worker_class(*args, **kwargs)
+        self.worker = worker_class(**kwargs)
        assert self.worker is not None

    def execute_method(self, method: str, *args, **kwargs):