Unverified commit ad34c0df, authored by youkaichao and committed by GitHub
Browse files

[core] platform agnostic executor via collective_rpc (#11256)


Signed-off-by: youkaichao <youkaichao@gmail.com>
parent f218f9c2
......@@ -8,6 +8,7 @@ from vllm.config import VllmConfig
from vllm.distributed import (ensure_model_parallel_initialized,
init_distributed_environment)
from vllm.model_executor import set_random_seed
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.sequence import ExecuteModelRequest
from vllm.worker.neuron_model_runner import NeuronModelRunner
from vllm.worker.worker_base import (LocalOrDistributedWorkerBase,
......@@ -25,6 +26,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
local_rank: int,
rank: int,
distributed_init_method: str,
is_driver_worker: bool = True,
) -> None:
WorkerBase.__init__(self, vllm_config=vllm_config)
self.local_rank = local_rank
......@@ -37,7 +39,22 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
self.model_runner: NeuronModelRunner = NeuronModelRunner(
vllm_config=vllm_config)
self.is_driver_worker = True
self.is_driver_worker = is_driver_worker
def execute_model(
    self,
    execute_model_req: Optional[ExecuteModelRequest] = None,
) -> Optional[List[SamplerOutput]]:
    """Validate the request for the Neuron backend, then run the model.

    The request must be present, must not carry any block swap-in,
    swap-out or copy operations, and must ask for zero lookahead slots.
    Once those checks pass, the call is handed to
    ``LocalOrDistributedWorkerBase.execute_model`` and its result is
    returned unchanged.
    """
    assert execute_model_req is not None
    # Any non-empty cache operation is rejected for this backend.
    assert not (execute_model_req.blocks_to_swap_in
                or execute_model_req.blocks_to_swap_out
                or execute_model_req.blocks_to_copy), (
                    "Cache operations are not supported for Neuron backend.")
    assert execute_model_req.num_lookahead_slots == 0, (
        "lookahead not supported for Neuron backend.")
    return LocalOrDistributedWorkerBase.execute_model(
        self, execute_model_req)
def init_device(self) -> None:
self.init_distributed_environment()
......@@ -103,13 +120,14 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
def init_distributed_environment(self):
"""Neuron uses transformers-neuronx for tensor parallelism.
vLLM still needs the environment inited when TP/PP > 1
It has only one process to control multiple devices.
vLLM still needs the environment initialized when TP/PP > 1,
so we initialize a distributed environment with one process.
"""
init_distributed_environment(
world_size=1,
rank=self.rank,
local_rank=self.local_rank,
rank=0,
local_rank=0,
distributed_init_method=self.distributed_init_method,
backend="gloo",
)
......
......@@ -211,16 +211,14 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase):
def __init__(
self,
ov_core: ov.Core,
vllm_config: VllmConfig,
local_rank: int,
rank: int,
distributed_init_method: str,
kv_cache_dtype: Optional[ov.Type] = ov.Type.undefined,
is_driver_worker: bool = False,
) -> None:
self.ov_core = ov_core
WorkerBase.__init__(self, vllm_config)
self.ov_core = ov.Core()
self.parallel_config.rank = rank
self.local_rank = local_rank
self.rank = rank
......@@ -237,7 +235,7 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase):
self.model_runner = OpenVINOModelRunner(
self.ov_core,
vllm_config=self.vllm_config,
kv_cache_dtype=kv_cache_dtype,
kv_cache_dtype=self.vllm_config.cache_config.cache_dtype,
is_driver_worker=is_driver_worker,
)
# Uninitialized cache engine. Will be initialized by
......
......@@ -88,7 +88,6 @@ class WorkerBase(ABC):
if output is None:
return None
@abstractmethod
def execute_model(
self,
execute_model_req: Optional[ExecuteModelRequest] = None
......@@ -119,6 +118,58 @@ class WorkerBase(ABC):
raise NotImplementedError
class DelegateWorkerBase(WorkerBase):
    """
    A class that delegates all methods to another WorkerBase instance. This is
    useful for creating a WorkerBase that wraps another WorkerBase instance,
    e.g. speculative decoding.

    The wrapped worker class is resolved from
    ``vllm_config.parallel_config.worker_cls`` and is constructed with the
    same positional and keyword arguments that this wrapper receives.
    """
    # The wrapped worker that every method below forwards to.
    worker: WorkerBase

    def __init__(
        self,
        *args,
        **kwargs,
    ) -> None:
        # ``vllm_config`` is looked up in ``kwargs`` only, so it has to be
        # passed by keyword for the worker class to be resolved.
        vllm_config: VllmConfig = kwargs.get("vllm_config")
        cls = resolve_obj_by_qualname(vllm_config.parallel_config.worker_cls)
        self.worker = cls(*args, **kwargs)

    def init_device(self) -> None:
        """Forward to the wrapped worker's ``init_device``."""
        self.worker.init_device()

    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """Return the wrapped worker's ``determine_num_available_blocks``."""
        return self.worker.determine_num_available_blocks()

    def initialize_cache(self, num_gpu_blocks: int,
                         num_cpu_blocks: int) -> None:
        """Forward both block counts to the wrapped worker."""
        self.worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)

    def execute_model(
        self,
        execute_model_req: Optional[ExecuteModelRequest] = None
    ) -> Optional[List[SamplerOutput]]:
        """Return the wrapped worker's result for ``execute_model_req``."""
        return self.worker.execute_model(execute_model_req)

    def get_cache_block_size_bytes(self) -> int:
        """Return the wrapped worker's ``get_cache_block_size_bytes``."""
        return self.worker.get_cache_block_size_bytes()

    def add_lora(self, lora_request: LoRARequest) -> bool:
        """Forward ``add_lora`` to the wrapped worker."""
        return self.worker.add_lora(lora_request)

    def remove_lora(self, lora_id: int) -> bool:
        """Forward ``remove_lora`` to the wrapped worker."""
        return self.worker.remove_lora(lora_id)

    def pin_lora(self, lora_id: int) -> bool:
        """Forward ``pin_lora`` to the wrapped worker."""
        return self.worker.pin_lora(lora_id)

    def list_loras(self) -> Set[int]:
        """Forward ``list_loras`` to the wrapped worker."""
        return self.worker.list_loras()

    def __getattr__(self, attr):
        """Fall back to the wrapped worker for attributes not found here.

        Raises:
            AttributeError: if ``worker`` itself has not been set yet, or if
                the wrapped worker does not have ``attr`` either.
        """
        # ``__getattr__`` only runs after normal lookup has failed. If
        # ``worker`` is the missing attribute (``__init__`` has not run or
        # raised, or the instance is being copied/unpickled), reading
        # ``self.worker`` below would re-enter this method until the
        # interpreter raises RecursionError. Fail fast instead.
        if attr == "worker":
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute 'worker'")
        return getattr(self.worker, attr)
class LoraNotSupportedWorkerBase(WorkerBase):
"""Partial implementation of WorkerBase that raises exceptions when LoRA
methods are invoked.
......@@ -419,17 +470,31 @@ class WorkerWrapperBase:
def __init__(
    self,
    vllm_config: VllmConfig,
    rank: int = 0,
) -> None:
    """Record the configuration and rank; no worker is created here.

    Args:
        vllm_config: Configuration object. Its ``model_config`` may be
            ``None`` (it can be in tests); the remote-code setup is then
            skipped.
        rank: Rank of this wrapper. Defaults to 0.
    """
    self.rank = rank
    self.vllm_config = vllm_config
    self.worker: Optional[WorkerBase] = None
    model_config = vllm_config.model_config
    # Read ``trust_remote_code`` only behind the None check: accessing it
    # unconditionally would raise AttributeError when ``model_config`` is
    # None, and the HF module cache only needs to be initialized once.
    if model_config is not None and model_config.trust_remote_code:
        # note: lazy import to avoid importing torch before initializing
        from vllm.utils import init_cached_hf_modules
        init_cached_hf_modules()
def adjust_rank(self, rank_mapping: Dict[int, int]) -> None:
    """
    Remap this wrapper's rank through ``rank_mapping``.

    Meant for executor initialization only: once all workers have been
    created, their ranks can be rewritten in one pass. A rank that is not
    a key of ``rank_mapping`` is left untouched.
    """
    current_rank = self.rank
    if current_rank not in rank_mapping:
        return
    self.rank = rank_mapping[current_rank]
@staticmethod
def update_environment_variables(envs: Dict[str, str]) -> None:
def update_environment_variables(self, envs_list: List[Dict[str,
str]]) -> None:
envs = envs_list[self.rank]
key = 'CUDA_VISIBLE_DEVICES'
if key in envs and key in os.environ:
# overwriting CUDA_VISIBLE_DEVICES is desired behavior
......@@ -437,11 +502,12 @@ class WorkerWrapperBase:
del os.environ[key]
update_environment_variables(envs)
def init_worker(self, *args, **kwargs):
def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None:
"""
Here we inject some common logic before initializing the worker.
Arguments are passed to the worker class constructor.
"""
kwargs = all_kwargs[self.rank]
enable_trace_function_call_for_thread(self.vllm_config)
# see https://github.com/NVIDIA/nccl/issues/1234
......@@ -452,7 +518,7 @@ class WorkerWrapperBase:
worker_class = resolve_obj_by_qualname(
self.vllm_config.parallel_config.worker_cls)
self.worker = worker_class(*args, **kwargs)
self.worker = worker_class(**kwargs)
assert self.worker is not None
def execute_method(self, method: str, *args, **kwargs):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment