Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ad34c0df
Unverified
Commit
ad34c0df
authored
Jan 15, 2025
by
youkaichao
Committed by
GitHub
Jan 15, 2025
Browse files
[core] platform agnostic executor via collective_rpc (#11256)
Signed-off-by:
youkaichao
<
youkaichao@gmail.com
>
parent
f218f9c2
Changes
43
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
101 additions
and
19 deletions
+101
-19
vllm/worker/neuron_worker.py
vllm/worker/neuron_worker.py
+23
-5
vllm/worker/openvino_worker.py
vllm/worker/openvino_worker.py
+2
-4
vllm/worker/worker_base.py
vllm/worker/worker_base.py
+76
-10
No files found.
vllm/worker/neuron_worker.py
View file @
ad34c0df
...
@@ -8,6 +8,7 @@ from vllm.config import VllmConfig
...
@@ -8,6 +8,7 @@ from vllm.config import VllmConfig
from
vllm.distributed
import
(
ensure_model_parallel_initialized
,
from
vllm.distributed
import
(
ensure_model_parallel_initialized
,
init_distributed_environment
)
init_distributed_environment
)
from
vllm.model_executor
import
set_random_seed
from
vllm.model_executor
import
set_random_seed
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.sequence
import
ExecuteModelRequest
from
vllm.sequence
import
ExecuteModelRequest
from
vllm.worker.neuron_model_runner
import
NeuronModelRunner
from
vllm.worker.neuron_model_runner
import
NeuronModelRunner
from
vllm.worker.worker_base
import
(
LocalOrDistributedWorkerBase
,
from
vllm.worker.worker_base
import
(
LocalOrDistributedWorkerBase
,
...
@@ -25,6 +26,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
...
@@ -25,6 +26,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
local_rank
:
int
,
local_rank
:
int
,
rank
:
int
,
rank
:
int
,
distributed_init_method
:
str
,
distributed_init_method
:
str
,
is_driver_worker
:
bool
=
True
,
)
->
None
:
)
->
None
:
WorkerBase
.
__init__
(
self
,
vllm_config
=
vllm_config
)
WorkerBase
.
__init__
(
self
,
vllm_config
=
vllm_config
)
self
.
local_rank
=
local_rank
self
.
local_rank
=
local_rank
...
@@ -37,7 +39,22 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
...
@@ -37,7 +39,22 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
self
.
model_runner
:
NeuronModelRunner
=
NeuronModelRunner
(
self
.
model_runner
:
NeuronModelRunner
=
NeuronModelRunner
(
vllm_config
=
vllm_config
)
vllm_config
=
vllm_config
)
self
.
is_driver_worker
=
True
self
.
is_driver_worker
=
is_driver_worker
def
execute_model
(
self
,
execute_model_req
:
Optional
[
ExecuteModelRequest
]
=
None
,
)
->
Optional
[
List
[
SamplerOutput
]]:
assert
execute_model_req
is
not
None
assert
(
not
execute_model_req
.
blocks_to_swap_in
and
not
execute_model_req
.
blocks_to_swap_out
and
not
execute_model_req
.
blocks_to_copy
),
(
"Cache operations are not supported for Neuron backend."
)
assert
execute_model_req
.
num_lookahead_slots
==
0
,
(
"lookahead not supported for Neuron backend."
)
output
=
LocalOrDistributedWorkerBase
.
execute_model
(
self
,
execute_model_req
)
return
output
def
init_device
(
self
)
->
None
:
def
init_device
(
self
)
->
None
:
self
.
init_distributed_environment
()
self
.
init_distributed_environment
()
...
@@ -103,13 +120,14 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
...
@@ -103,13 +120,14 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
def
init_distributed_environment
(
self
):
def
init_distributed_environment
(
self
):
"""Neuron uses transformers-neuronx for tensor parallelism.
"""Neuron uses transformers-neuronx for tensor parallelism.
It has only one process to control multiple devices.
vLLM still needs the environment inited when TP/PP > 1
vLLM still needs the environment initialized when TP/PP > 1,
so we initialize a distributed environment with one process.
"""
"""
init_distributed_environment
(
init_distributed_environment
(
world_size
=
1
,
world_size
=
1
,
rank
=
self
.
rank
,
rank
=
0
,
local_rank
=
self
.
local_rank
,
local_rank
=
0
,
distributed_init_method
=
self
.
distributed_init_method
,
distributed_init_method
=
self
.
distributed_init_method
,
backend
=
"gloo"
,
backend
=
"gloo"
,
)
)
...
...
vllm/worker/openvino_worker.py
View file @
ad34c0df
...
@@ -211,16 +211,14 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase):
...
@@ -211,16 +211,14 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase):
def
__init__
(
def
__init__
(
self
,
self
,
ov_core
:
ov
.
Core
,
vllm_config
:
VllmConfig
,
vllm_config
:
VllmConfig
,
local_rank
:
int
,
local_rank
:
int
,
rank
:
int
,
rank
:
int
,
distributed_init_method
:
str
,
distributed_init_method
:
str
,
kv_cache_dtype
:
Optional
[
ov
.
Type
]
=
ov
.
Type
.
undefined
,
is_driver_worker
:
bool
=
False
,
is_driver_worker
:
bool
=
False
,
)
->
None
:
)
->
None
:
self
.
ov_core
=
ov_core
WorkerBase
.
__init__
(
self
,
vllm_config
)
WorkerBase
.
__init__
(
self
,
vllm_config
)
self
.
ov_core
=
ov
.
Core
()
self
.
parallel_config
.
rank
=
rank
self
.
parallel_config
.
rank
=
rank
self
.
local_rank
=
local_rank
self
.
local_rank
=
local_rank
self
.
rank
=
rank
self
.
rank
=
rank
...
@@ -237,7 +235,7 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase):
...
@@ -237,7 +235,7 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase):
self
.
model_runner
=
OpenVINOModelRunner
(
self
.
model_runner
=
OpenVINOModelRunner
(
self
.
ov_core
,
self
.
ov_core
,
vllm_config
=
self
.
vllm_config
,
vllm_config
=
self
.
vllm_config
,
kv_cache_dtype
=
kv_
cache_dtype
,
kv_cache_dtype
=
self
.
vllm_config
.
cache_config
.
cache_dtype
,
is_driver_worker
=
is_driver_worker
,
is_driver_worker
=
is_driver_worker
,
)
)
# Uninitialized cache engine. Will be initialized by
# Uninitialized cache engine. Will be initialized by
...
...
vllm/worker/worker_base.py
View file @
ad34c0df
...
@@ -88,7 +88,6 @@ class WorkerBase(ABC):
...
@@ -88,7 +88,6 @@ class WorkerBase(ABC):
if
output
is
None
:
if
output
is
None
:
return
None
return
None
@
abstractmethod
def
execute_model
(
def
execute_model
(
self
,
self
,
execute_model_req
:
Optional
[
ExecuteModelRequest
]
=
None
execute_model_req
:
Optional
[
ExecuteModelRequest
]
=
None
...
@@ -119,6 +118,58 @@ class WorkerBase(ABC):
...
@@ -119,6 +118,58 @@ class WorkerBase(ABC):
raise
NotImplementedError
raise
NotImplementedError
class
DelegateWorkerBase
(
WorkerBase
):
"""
A class that delegates all methods to another WorkerBase instance. This is
useful for creating a WorkerBase that wraps another WorkerBase instance,
e.g. speculative decoding.
"""
worker
:
WorkerBase
def
__init__
(
self
,
*
args
,
**
kwargs
,
)
->
None
:
vllm_config
:
VllmConfig
=
kwargs
.
get
(
"vllm_config"
)
cls
=
resolve_obj_by_qualname
(
vllm_config
.
parallel_config
.
worker_cls
)
self
.
worker
=
cls
(
*
args
,
**
kwargs
)
def
init_device
(
self
)
->
None
:
self
.
worker
.
init_device
()
def
determine_num_available_blocks
(
self
)
->
Tuple
[
int
,
int
]:
return
self
.
worker
.
determine_num_available_blocks
()
def
initialize_cache
(
self
,
num_gpu_blocks
:
int
,
num_cpu_blocks
:
int
)
->
None
:
self
.
worker
.
initialize_cache
(
num_gpu_blocks
,
num_cpu_blocks
)
def
execute_model
(
self
,
execute_model_req
:
Optional
[
ExecuteModelRequest
]
=
None
)
->
Optional
[
List
[
SamplerOutput
]]:
return
self
.
worker
.
execute_model
(
execute_model_req
)
def
get_cache_block_size_bytes
(
self
)
->
int
:
return
self
.
worker
.
get_cache_block_size_bytes
()
def
add_lora
(
self
,
lora_request
:
LoRARequest
)
->
bool
:
return
self
.
worker
.
add_lora
(
lora_request
)
def
remove_lora
(
self
,
lora_id
:
int
)
->
bool
:
return
self
.
worker
.
remove_lora
(
lora_id
)
def
pin_lora
(
self
,
lora_id
:
int
)
->
bool
:
return
self
.
worker
.
pin_lora
(
lora_id
)
def
list_loras
(
self
)
->
Set
[
int
]:
return
self
.
worker
.
list_loras
()
def
__getattr__
(
self
,
attr
):
return
getattr
(
self
.
worker
,
attr
)
class
LoraNotSupportedWorkerBase
(
WorkerBase
):
class
LoraNotSupportedWorkerBase
(
WorkerBase
):
"""Partial implementation of WorkerBase that raises exceptions when LoRA
"""Partial implementation of WorkerBase that raises exceptions when LoRA
methods are invoked.
methods are invoked.
...
@@ -419,17 +470,31 @@ class WorkerWrapperBase:
...
@@ -419,17 +470,31 @@ class WorkerWrapperBase:
def
__init__
(
def
__init__
(
self
,
self
,
vllm_config
:
VllmConfig
,
vllm_config
:
VllmConfig
,
rank
:
int
=
0
,
)
->
None
:
)
->
None
:
self
.
rank
=
rank
self
.
vllm_config
=
vllm_config
self
.
vllm_config
=
vllm_config
trust_remote_code
=
vllm_config
.
model_config
.
trust_remote_code
self
.
worker
:
Optional
[
WorkerBase
]
=
None
self
.
worker
:
Optional
[
WorkerBase
]
=
None
if
trust_remote_code
:
if
vllm_config
.
model_config
is
not
None
:
# note: lazy import to avoid importing torch before initializing
# it can be None in tests
from
vllm.utils
import
init_cached_hf_modules
trust_remote_code
=
vllm_config
.
model_config
.
trust_remote_code
init_cached_hf_modules
()
if
trust_remote_code
:
# note: lazy import to avoid importing torch before initializing
from
vllm.utils
import
init_cached_hf_modules
init_cached_hf_modules
()
def
adjust_rank
(
self
,
rank_mapping
:
Dict
[
int
,
int
])
->
None
:
"""
Adjust the rank based on the given mapping.
It is only used during the initialization of the executor,
to adjust the rank of workers after we create all workers.
"""
if
self
.
rank
in
rank_mapping
:
self
.
rank
=
rank_mapping
[
self
.
rank
]
@
staticmethod
def
update_environment_variables
(
self
,
envs_list
:
List
[
Dict
[
str
,
def
update_environment_variables
(
envs
:
Dict
[
str
,
str
])
->
None
:
str
]])
->
None
:
envs
=
envs_list
[
self
.
rank
]
key
=
'CUDA_VISIBLE_DEVICES'
key
=
'CUDA_VISIBLE_DEVICES'
if
key
in
envs
and
key
in
os
.
environ
:
if
key
in
envs
and
key
in
os
.
environ
:
# overwriting CUDA_VISIBLE_DEVICES is desired behavior
# overwriting CUDA_VISIBLE_DEVICES is desired behavior
...
@@ -437,11 +502,12 @@ class WorkerWrapperBase:
...
@@ -437,11 +502,12 @@ class WorkerWrapperBase:
del
os
.
environ
[
key
]
del
os
.
environ
[
key
]
update_environment_variables
(
envs
)
update_environment_variables
(
envs
)
def
init_worker
(
self
,
*
args
,
**
kwargs
)
:
def
init_worker
(
self
,
all_kwargs
:
List
[
Dict
[
str
,
Any
]])
->
None
:
"""
"""
Here we inject some common logic before initializing the worker.
Here we inject some common logic before initializing the worker.
Arguments are passed to the worker class constructor.
Arguments are passed to the worker class constructor.
"""
"""
kwargs
=
all_kwargs
[
self
.
rank
]
enable_trace_function_call_for_thread
(
self
.
vllm_config
)
enable_trace_function_call_for_thread
(
self
.
vllm_config
)
# see https://github.com/NVIDIA/nccl/issues/1234
# see https://github.com/NVIDIA/nccl/issues/1234
...
@@ -452,7 +518,7 @@ class WorkerWrapperBase:
...
@@ -452,7 +518,7 @@ class WorkerWrapperBase:
worker_class
=
resolve_obj_by_qualname
(
worker_class
=
resolve_obj_by_qualname
(
self
.
vllm_config
.
parallel_config
.
worker_cls
)
self
.
vllm_config
.
parallel_config
.
worker_cls
)
self
.
worker
=
worker_class
(
*
args
,
**
kwargs
)
self
.
worker
=
worker_class
(
**
kwargs
)
assert
self
.
worker
is
not
None
assert
self
.
worker
is
not
None
def
execute_method
(
self
,
method
:
str
,
*
args
,
**
kwargs
):
def
execute_method
(
self
,
method
:
str
,
*
args
,
**
kwargs
):
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment