Unverified commit ad34c0df, authored by youkaichao, committed by GitHub
Browse files

[core] platform agnostic executor via collective_rpc (#11256)


Signed-off-by: youkaichao <youkaichao@gmail.com>
parent f218f9c2
...@@ -8,6 +8,7 @@ from vllm.config import VllmConfig ...@@ -8,6 +8,7 @@ from vllm.config import VllmConfig
from vllm.distributed import (ensure_model_parallel_initialized, from vllm.distributed import (ensure_model_parallel_initialized,
init_distributed_environment) init_distributed_environment)
from vllm.model_executor import set_random_seed from vllm.model_executor import set_random_seed
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.sequence import ExecuteModelRequest from vllm.sequence import ExecuteModelRequest
from vllm.worker.neuron_model_runner import NeuronModelRunner from vllm.worker.neuron_model_runner import NeuronModelRunner
from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, from vllm.worker.worker_base import (LocalOrDistributedWorkerBase,
...@@ -25,6 +26,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): ...@@ -25,6 +26,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
local_rank: int, local_rank: int,
rank: int, rank: int,
distributed_init_method: str, distributed_init_method: str,
is_driver_worker: bool = True,
) -> None: ) -> None:
WorkerBase.__init__(self, vllm_config=vllm_config) WorkerBase.__init__(self, vllm_config=vllm_config)
self.local_rank = local_rank self.local_rank = local_rank
...@@ -37,7 +39,22 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): ...@@ -37,7 +39,22 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
self.model_runner: NeuronModelRunner = NeuronModelRunner( self.model_runner: NeuronModelRunner = NeuronModelRunner(
vllm_config=vllm_config) vllm_config=vllm_config)
self.is_driver_worker = True self.is_driver_worker = is_driver_worker
def execute_model(
    self,
    execute_model_req: Optional[ExecuteModelRequest] = None,
) -> Optional[List[SamplerOutput]]:
    """Check the request against Neuron limitations, then execute it.

    The request must be present, must not carry any block swap-in,
    swap-out or copy operations, and must request zero lookahead slots.
    Once those checks pass, the call is forwarded to
    ``LocalOrDistributedWorkerBase.execute_model`` and its result is
    returned unchanged.
    """
    assert execute_model_req is not None
    # Any non-empty cache operation is rejected for this backend.
    assert not (execute_model_req.blocks_to_swap_in
                or execute_model_req.blocks_to_swap_out
                or execute_model_req.blocks_to_copy), (
                    "Cache operations are not supported for Neuron backend.")
    assert execute_model_req.num_lookahead_slots == 0, (
        "lookahead not supported for Neuron backend.")
    return LocalOrDistributedWorkerBase.execute_model(self,
                                                      execute_model_req)
def init_device(self) -> None: def init_device(self) -> None:
self.init_distributed_environment() self.init_distributed_environment()
...@@ -103,13 +120,14 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): ...@@ -103,13 +120,14 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
def init_distributed_environment(self): def init_distributed_environment(self):
"""Neuron uses transformers-neuronx for tensor parallelism. """Neuron uses transformers-neuronx for tensor parallelism.
It has only one process to control multiple devices.
vLLM still needs the environment inited when TP/PP > 1 vLLM still needs the environment initialized when TP/PP > 1,
so we initialize a distributed environment with one process.
""" """
init_distributed_environment( init_distributed_environment(
world_size=1, world_size=1,
rank=self.rank, rank=0,
local_rank=self.local_rank, local_rank=0,
distributed_init_method=self.distributed_init_method, distributed_init_method=self.distributed_init_method,
backend="gloo", backend="gloo",
) )
......
...@@ -211,16 +211,14 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase): ...@@ -211,16 +211,14 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase):
def __init__( def __init__(
self, self,
ov_core: ov.Core,
vllm_config: VllmConfig, vllm_config: VllmConfig,
local_rank: int, local_rank: int,
rank: int, rank: int,
distributed_init_method: str, distributed_init_method: str,
kv_cache_dtype: Optional[ov.Type] = ov.Type.undefined,
is_driver_worker: bool = False, is_driver_worker: bool = False,
) -> None: ) -> None:
self.ov_core = ov_core
WorkerBase.__init__(self, vllm_config) WorkerBase.__init__(self, vllm_config)
self.ov_core = ov.Core()
self.parallel_config.rank = rank self.parallel_config.rank = rank
self.local_rank = local_rank self.local_rank = local_rank
self.rank = rank self.rank = rank
...@@ -237,7 +235,7 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase): ...@@ -237,7 +235,7 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase):
self.model_runner = OpenVINOModelRunner( self.model_runner = OpenVINOModelRunner(
self.ov_core, self.ov_core,
vllm_config=self.vllm_config, vllm_config=self.vllm_config,
kv_cache_dtype=kv_cache_dtype, kv_cache_dtype=self.vllm_config.cache_config.cache_dtype,
is_driver_worker=is_driver_worker, is_driver_worker=is_driver_worker,
) )
# Uninitialized cache engine. Will be initialized by # Uninitialized cache engine. Will be initialized by
......
...@@ -88,7 +88,6 @@ class WorkerBase(ABC): ...@@ -88,7 +88,6 @@ class WorkerBase(ABC):
if output is None: if output is None:
return None return None
@abstractmethod
def execute_model( def execute_model(
self, self,
execute_model_req: Optional[ExecuteModelRequest] = None execute_model_req: Optional[ExecuteModelRequest] = None
...@@ -119,6 +118,58 @@ class WorkerBase(ABC): ...@@ -119,6 +118,58 @@ class WorkerBase(ABC):
raise NotImplementedError raise NotImplementedError
class DelegateWorkerBase(WorkerBase):
    """
    A class that delegates all methods to another WorkerBase instance. This is
    useful for creating a WorkerBase that wraps another WorkerBase instance,
    e.g. speculative decoding.

    The wrapped worker's class is resolved from
    ``vllm_config.parallel_config.worker_cls`` and is constructed with the
    same positional and keyword arguments that this wrapper received.
    """
    # The wrapped worker; every method below forwards to it.
    worker: WorkerBase

    def __init__(
        self,
        *args,
        **kwargs,
    ) -> None:
        """Resolve the configured worker class and build the wrapped worker."""
        # NOTE: ``vllm_config`` is looked up in the keyword arguments only,
        # so callers have to pass it by keyword.
        vllm_config: VllmConfig = kwargs.get("vllm_config")
        cls = resolve_obj_by_qualname(vllm_config.parallel_config.worker_cls)
        self.worker = cls(*args, **kwargs)

    def init_device(self) -> None:
        """Forward to the wrapped worker's ``init_device``."""
        self.worker.init_device()

    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """Return the wrapped worker's ``determine_num_available_blocks``."""
        return self.worker.determine_num_available_blocks()

    def initialize_cache(self, num_gpu_blocks: int,
                         num_cpu_blocks: int) -> None:
        """Forward both block counts to the wrapped worker."""
        self.worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)

    def execute_model(
        self,
        execute_model_req: Optional[ExecuteModelRequest] = None
    ) -> Optional[List[SamplerOutput]]:
        """Run the request on the wrapped worker and return its output."""
        return self.worker.execute_model(execute_model_req)

    def get_cache_block_size_bytes(self) -> int:
        """Return the wrapped worker's ``get_cache_block_size_bytes``."""
        return self.worker.get_cache_block_size_bytes()

    def add_lora(self, lora_request: LoRARequest) -> bool:
        """Forward ``add_lora`` to the wrapped worker."""
        return self.worker.add_lora(lora_request)

    def remove_lora(self, lora_id: int) -> bool:
        """Forward ``remove_lora`` to the wrapped worker."""
        return self.worker.remove_lora(lora_id)

    def pin_lora(self, lora_id: int) -> bool:
        """Forward ``pin_lora`` to the wrapped worker."""
        return self.worker.pin_lora(lora_id)

    def list_loras(self) -> Set[int]:
        """Forward ``list_loras`` to the wrapped worker."""
        return self.worker.list_loras()

    def __getattr__(self, attr):
        """Fall back to the wrapped worker for attributes missing here.

        Python only calls ``__getattr__`` after normal lookup has failed.
        If the failing name is ``worker`` itself, the wrapped worker was
        never assigned (``__init__`` did not finish, or the instance was
        created without it). Reading ``self.worker`` in that state would
        re-enter this method forever, so raise ``AttributeError`` instead.
        """
        if attr == "worker":
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute 'worker'")
        return getattr(self.worker, attr)
class LoraNotSupportedWorkerBase(WorkerBase): class LoraNotSupportedWorkerBase(WorkerBase):
"""Partial implementation of WorkerBase that raises exceptions when LoRA """Partial implementation of WorkerBase that raises exceptions when LoRA
methods are invoked. methods are invoked.
...@@ -419,17 +470,31 @@ class WorkerWrapperBase: ...@@ -419,17 +470,31 @@ class WorkerWrapperBase:
def __init__( def __init__(
self, self,
vllm_config: VllmConfig, vllm_config: VllmConfig,
rank: int = 0,
) -> None: ) -> None:
self.rank = rank
self.vllm_config = vllm_config self.vllm_config = vllm_config
trust_remote_code = vllm_config.model_config.trust_remote_code
self.worker: Optional[WorkerBase] = None self.worker: Optional[WorkerBase] = None
if trust_remote_code: if vllm_config.model_config is not None:
# note: lazy import to avoid importing torch before initializing # it can be None in tests
from vllm.utils import init_cached_hf_modules trust_remote_code = vllm_config.model_config.trust_remote_code
init_cached_hf_modules() if trust_remote_code:
# note: lazy import to avoid importing torch before initializing
from vllm.utils import init_cached_hf_modules
init_cached_hf_modules()
def adjust_rank(self, rank_mapping: Dict[int, int]) -> None:
    """
    Remap this wrapper's rank through ``rank_mapping``.

    Used only while the executor is being initialized, once all workers
    have been created. A rank with no entry in the mapping is kept as is.
    """
    current_rank = self.rank
    self.rank = rank_mapping.get(current_rank, current_rank)
@staticmethod def update_environment_variables(self, envs_list: List[Dict[str,
def update_environment_variables(envs: Dict[str, str]) -> None: str]]) -> None:
envs = envs_list[self.rank]
key = 'CUDA_VISIBLE_DEVICES' key = 'CUDA_VISIBLE_DEVICES'
if key in envs and key in os.environ: if key in envs and key in os.environ:
# overwriting CUDA_VISIBLE_DEVICES is desired behavior # overwriting CUDA_VISIBLE_DEVICES is desired behavior
...@@ -437,11 +502,12 @@ class WorkerWrapperBase: ...@@ -437,11 +502,12 @@ class WorkerWrapperBase:
del os.environ[key] del os.environ[key]
update_environment_variables(envs) update_environment_variables(envs)
def init_worker(self, *args, **kwargs): def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None:
""" """
Here we inject some common logic before initializing the worker. Here we inject some common logic before initializing the worker.
Arguments are passed to the worker class constructor. Arguments are passed to the worker class constructor.
""" """
kwargs = all_kwargs[self.rank]
enable_trace_function_call_for_thread(self.vllm_config) enable_trace_function_call_for_thread(self.vllm_config)
# see https://github.com/NVIDIA/nccl/issues/1234 # see https://github.com/NVIDIA/nccl/issues/1234
...@@ -452,7 +518,7 @@ class WorkerWrapperBase: ...@@ -452,7 +518,7 @@ class WorkerWrapperBase:
worker_class = resolve_obj_by_qualname( worker_class = resolve_obj_by_qualname(
self.vllm_config.parallel_config.worker_cls) self.vllm_config.parallel_config.worker_cls)
self.worker = worker_class(*args, **kwargs) self.worker = worker_class(**kwargs)
assert self.worker is not None assert self.worker is not None
def execute_method(self, method: str, *args, **kwargs): def execute_method(self, method: str, *args, **kwargs):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment